From e449a35447cd8bc0307899048ac1efc236d59271 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 9 Aug 2023 15:59:28 +0200 Subject: [PATCH 1/3] final changes --- docs/API/text.md | 20 ++++++++++++++------ embetter/text/__init__.py | 7 ++++--- embetter/text/_word2vec.py | 2 +- tests/test_text.py | 6 +++--- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/docs/API/text.md b/docs/API/text.md index 3efd451..98f9847 100644 --- a/docs/API/text.md +++ b/docs/API/text.md @@ -4,12 +4,6 @@ options: members: false -## BytePairEncoder - -::: embetter.text.BytePairEncoder - options: - members: false - ## KerasNLPEncoder ::: embetter.text.KerasNLPEncoder @@ -27,3 +21,17 @@ ::: embetter.text.Sense2VecEncoder options: members: false + +## BytePairEncoder + +::: embetter.text.BytePairEncoder + options: + members: false + + +## GensimEncoder + +::: embetter.text.GensimEncoder + options: + members: false + diff --git a/embetter/text/__init__.py b/embetter/text/__init__.py index 0c69a13..9a4fce3 100644 --- a/embetter/text/__init__.py +++ b/embetter/text/__init__.py @@ -21,10 +21,11 @@ spaCyEncoder = NotInstalled("spaCyEncoder", "spacy") try: - from embetter.text._word2vec import Word2VecEncoder + from embetter.text._word2vec import GensimEncoder except ModuleNotFoundError: - Word2VecEncoder = NotInstalled("Word2VecEncoder", "gensim") + GensimEncoder = NotInstalled("GensimEncoder", "gensim") +try: from embetter.text._keras import KerasNLPEncoder except ModuleNotFoundError: KerasNLPEncoder = NotInstalled("KerasNLPEncoder", "keras_nlp") @@ -35,6 +36,6 @@ "Sense2VecEncoder", "BytePairEncoder", "spaCyEncoder", - "Word2VecEncoder", + "GensimEncoder", "KerasNLPEncoder", ] diff --git a/embetter/text/_word2vec.py b/embetter/text/_word2vec.py index 08c1e20..75df154 100644 --- a/embetter/text/_word2vec.py +++ b/embetter/text/_word2vec.py @@ -8,7 +8,7 @@ from embetter.base import EmbetterBase -class Word2VecEncoder(EmbetterBase): +class GensimEncoder(EmbetterBase): """ Encodes text 
using a static word embedding model. The component uses gensim's default tokenizer. diff --git a/tests/test_text.py b/tests/test_text.py index e436a55..675a8bd 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -10,7 +10,7 @@ from embetter.text import ( BytePairEncoder, SentenceEncoder, - Word2VecEncoder, + GensimEncoder, spaCyEncoder, ) from embetter.utils import cached @@ -30,7 +30,7 @@ def test_word2vec(setting): model = Word2Vec( sentences=sentences, vector_size=vector_size, window=3, min_count=1 ) - encoder = Word2VecEncoder(model, agg=setting) + encoder = GensimEncoder(model, agg=setting) output = encoder.fit_transform(test_sentences) assert isinstance(output, np.ndarray) out_dim = vector_size if setting != "both" else vector_size * 2 @@ -38,7 +38,7 @@ def test_word2vec(setting): # This tests whether it can load the model from disk with tempfile.NamedTemporaryFile() as fp: model.save(fp) - encoder = Word2VecEncoder(fp.name, agg=setting) + encoder = GensimEncoder(fp.name, agg=setting) encoder.transform(test_sentences) assert repr(encoder) From 4ac67c3fca45f0434b530d462958b7fee0491e9b Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 9 Aug 2023 16:00:03 +0200 Subject: [PATCH 2/3] update landing --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index d0b7f5d..178321b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@
-Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proof of concepts using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's a also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial). +Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proof of concepts using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial) but it can also be used together with your favorite ANN solution like [weaviate](https://weaviate.io/), [chromadb](https://www.trychroma.com/) and [hnswlib](https://github.com/nmslib/hnswlib). ## Install From ec11107175df4b76550bbd7ea7c1bf4f91a386ad Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 9 Aug 2023 16:12:21 +0200 Subject: [PATCH 3/3] more lightweight --- .github/workflows/style.yml | 4 +--- README.md | 2 +- docs/index.md | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 41325ad..58aa07d 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -22,10 +22,8 @@ jobs: with: python-version: ${{ matrix.python-version }} cache: 'pip' - - name: Install Base Dependencies - run: python -m pip install -e . 
- name: Install Testing Dependencies - run: make install + run: python -m pip install black interrogate - name: Interrogate if: always() run: make interrogate diff --git a/README.md b/README.md index 3887af5..3dbccf1 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ from embetter.grab import ColumnGrabber from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder # Representations for text -from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, Word2VecEncoder +from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder # Representations from multi-modal models from embetter.multi import ClipEncoder diff --git a/docs/index.md b/docs/index.md index 178321b..151fcfc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -25,6 +25,7 @@ python -m pip install "embetter[sentence-tfm]" python -m pip install "embetter[spacy]" python -m pip install "embetter[sense2vec]" python -m pip install "embetter[bpemb]" +python -m pip install "embetter[gensim]" python -m pip install "embetter[vision]" python -m pip install "embetter[all]" ``` @@ -41,7 +42,7 @@ from embetter.grab import ColumnGrabber from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder # Representations for text -from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder +from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder # Representations from multi-modal models from embetter.multi import ClipEncoder