Merge pull request #79 from koaning/version-0.5.1

Version 0.5.1
koaning · Aug 9, 2023 · 3ecd8dc · 3ecd8dc
2 parents 639710d + ec11107
commit 3ecd8dc
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 19 deletions.
diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
@@ -22,10 +22,8 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'pip'
-    - name: Install Base Dependencies
-      run: python -m pip install -e .
     - name: Install Testing Dependencies
-      run: make install
+      run: python -m pip install black interrogate
     - name: Interrogate
       if: always()
       run: make interrogate

diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ from embetter.grab import ColumnGrabber
 from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder
 
 # Representations for text
-from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, Word2VecEncoder
+from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder
 
 # Representations from multi-modal models
 from embetter.multi import ClipEncoder

diff --git a/docs/API/text.md b/docs/API/text.md
@@ -4,12 +4,6 @@
     options:
         members: false
 
-## BytePairEncoder
-
-::: embetter.text.BytePairEncoder
-    options:
-        members: false
-
 ## KerasNLPEncoder
 
 ::: embetter.text.KerasNLPEncoder
@@ -27,3 +21,17 @@
 ::: embetter.text.Sense2VecEncoder
     options:
         members: false
+
+## BytePairEncoder
+
+::: embetter.text.BytePairEncoder
+    options:
+        members: false
+
+
+## GensimEncoder
+
+::: embetter.text.GensimEncoder
+    options:
+        members: false
+
diff --git a/docs/index.md b/docs/index.md
@@ -6,7 +6,7 @@
 
 <br> 
 
-Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proof of concepts using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's a also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial).
+Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proof of concepts using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial), but it can also be used together with your favorite ANN solution like [weaviate](https://weaviate.io/), [chromadb](https://www.trychroma.com/) and [hnswlib](https://github.com/nmslib/hnswlib). 
 
 ## Install 
 
@@ -25,6 +25,7 @@ python -m pip install "embetter[sentence-tfm]"
 python -m pip install "embetter[spacy]"
 python -m pip install "embetter[sense2vec]"
 python -m pip install "embetter[bpemb]"
+python -m pip install "embetter[gensim]"
 python -m pip install "embetter[vision]"
 python -m pip install "embetter[all]"
 ```
@@ -41,7 +42,7 @@ from embetter.grab import ColumnGrabber
 from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder
 
 # Representations for text
-from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder
+from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder
 
 # Representations from multi-modal models
 from embetter.multi import ClipEncoder

diff --git a/embetter/text/__init__.py b/embetter/text/__init__.py
@@ -21,10 +21,11 @@
     spaCyEncoder = NotInstalled("spaCyEncoder", "spacy")
 
 try:
-    from embetter.text._word2vec import Word2VecEncoder
+    from embetter.text._word2vec import GensimEncoder
 except ModuleNotFoundError:
-    Word2VecEncoder = NotInstalled("Word2VecEncoder", "gensim")
+    GensimEncoder = NotInstalled("GensimEncoder", "gensim")
 
+try:
     from embetter.text._keras import KerasNLPEncoder
 except ModuleNotFoundError:
     KerasNLPEncoder = NotInstalled("KerasNLPEncoder", "keras_nlp")
@@ -35,6 +36,6 @@
     "Sense2VecEncoder",
     "BytePairEncoder",
     "spaCyEncoder",
-    "Word2VecEncoder",
+    "GensimEncoder",
     "KerasNLPEncoder",
 ]
diff --git a/embetter/text/_word2vec.py b/embetter/text/_word2vec.py
@@ -8,7 +8,7 @@
 from embetter.base import EmbetterBase
 
 
-class Word2VecEncoder(EmbetterBase):
+class GensimEncoder(EmbetterBase):
     """
     Encodes text using a static word embedding model. The component uses gensim's default tokenizer.
 

diff --git a/tests/test_text.py b/tests/test_text.py
@@ -10,7 +10,7 @@
 from embetter.text import (
     BytePairEncoder,
     SentenceEncoder,
-    Word2VecEncoder,
+    GensimEncoder,
     spaCyEncoder,
 )
 from embetter.utils import cached
@@ -30,15 +30,15 @@ def test_word2vec(setting):
     model = Word2Vec(
         sentences=sentences, vector_size=vector_size, window=3, min_count=1
     )
-    encoder = Word2VecEncoder(model, agg=setting)
+    encoder = GensimEncoder(model, agg=setting)
     output = encoder.fit_transform(test_sentences)
     assert isinstance(output, np.ndarray)
     out_dim = vector_size if setting != "both" else vector_size * 2
     assert output.shape == (len(test_sentences), out_dim)
     # This tests whether it can load the model from disk
     with tempfile.NamedTemporaryFile() as fp:
         model.save(fp)
-        encoder = Word2VecEncoder(fp.name, agg=setting)
+        encoder = GensimEncoder(fp.name, agg=setting)
         encoder.transform(test_sentences)
     assert repr(encoder)