feat: enhance testing workflow with parallel execution and update REA…

…DME for future work sections
iscc · Aug 13, 2024 · 0f21b0e · 0f21b0e
1 parent 2d14293
commit 0f21b0e
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 4 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -58,4 +58,4 @@ jobs:
         run: poetry install
 
       - name: Run Tests
-        run: poetry run pytest --cov=iscc_sci --cov-report=xml -v tests
+        run: poetry run pytest -n auto --cov=iscc_sct --cov-report=xml -v tests
diff --git a/README.md b/README.md
@@ -136,9 +136,31 @@ poetry install
 If you have suggestions for improvements or bug fixes, please open an issue or pull request. For major changes, please
 open an issue first to discuss your ideas.
 
+## Future Work
+
+### Shift Resistant Semantic Chunking
+
+The current chunking strategy tries to maximize chunk sizes (up to 127 tokens) while still splitting at lexically
+sensible boundaries with an overlap of up to 48 tokens. See
+[text-splitter](https://github.com/benbrandt/text-splitter).
+
+Cross document chunk matching via granular Simprints can likely be improved significantly with a semantically aware and
+shift resistant chunking strategy. Better shift resistance would improve the chances that the boundaries detected for
+semantically similar text sequences in different documents are aligned.
+
+### MRL based Embeddings
+
+A text embedding model trained with [Matryoshka Representation Learning](https://arxiv.org/pdf/2205.13147) may yield
+better results with short 64-bit Semantic Text-Codes.
+
+### Larger Chunk Sizes
+
+A text embedding model with support for a larger `max_token` size (currently 128) may yield higher-order granular
+simprints based on larger chunks of text.
+
 ## Acknowledgements
 
-- Text Chunking: [semantic-text-splitter](https://github.com/benbrandt/text-splitter)
+- Text Chunking: [text-splitter](https://github.com/benbrandt/text-splitter)
 - Text Embedding:
   [Sentence-Transformer](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html#original-models)
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -66,6 +66,7 @@ gpu = ["onnxruntime-gpu"]
 
 [tool.poetry.group.test.dependencies]
 pytest = "*"
+pytest-xdist = "*"
 coverage = "*"
 pytest-cov = "*"
 
@@ -84,7 +85,7 @@ line-ending = "lf"
 [tool.poe.tasks]
 format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
 format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
-test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100 --cov-report=term-missing", help = "Run tests with coverage" }
+test = { cmd = "pytest -n auto --cov=iscc_sct --cov-fail-under=100 --cov-report=term-missing --color=yes", help = "Run tests with coverage" }
 all = ["format-code", "format-markdown", "test"]
 
 [build-system]