Skip to content

Commit

Permalink
Merge pull request #79 from koaning/version-0.5.1
Browse files Browse the repository at this point in the history
Version 0.5.1
  • Loading branch information
koaning committed Aug 9, 2023
2 parents 639710d + ec11107 commit 3ecd8dc
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 19 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/style.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install Base Dependencies
run: python -m pip install -e .
- name: Install Testing Dependencies
run: make install
run: python -m pip install black interrogate
- name: Interrogate
if: always()
run: make interrogate
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ from embetter.grab import ColumnGrabber
from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder

# Representations for text
from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, Word2VecEncoder
from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder

# Representations from multi-modal models
from embetter.multi import ClipEncoder
Expand Down
20 changes: 14 additions & 6 deletions docs/API/text.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
options:
members: false

## BytePairEncoder

::: embetter.text.BytePairEncoder
options:
members: false

## KerasNLPEncoder

::: embetter.text.KerasNLPEncoder
Expand All @@ -27,3 +21,17 @@
::: embetter.text.Sense2VecEncoder
options:
members: false

## BytePairEncoder

::: embetter.text.BytePairEncoder
options:
members: false


## GensimEncoder

::: embetter.text.GensimEncoder
options:
members: false

5 changes: 3 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<br>

Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proofs of concept using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial).
Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proofs of concept using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial), but it can also be used together with your favorite ANN solution like [weaviate](https://weaviate.io/), [chromadb](https://www.trychroma.com/) and [hnswlib](https://github.com/nmslib/hnswlib).

## Install

Expand All @@ -25,6 +25,7 @@ python -m pip install "embetter[sentence-tfm]"
python -m pip install "embetter[spacy]"
python -m pip install "embetter[sense2vec]"
python -m pip install "embetter[bpemb]"
python -m pip install "embetter[gensim]"
python -m pip install "embetter[vision]"
python -m pip install "embetter[all]"
```
Expand All @@ -41,7 +42,7 @@ from embetter.grab import ColumnGrabber
from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder

# Representations for text
from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder
from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder

# Representations from multi-modal models
from embetter.multi import ClipEncoder
Expand Down
7 changes: 4 additions & 3 deletions embetter/text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@
spaCyEncoder = NotInstalled("spaCyEncoder", "spacy")

try:
from embetter.text._word2vec import Word2VecEncoder
from embetter.text._word2vec import GensimEncoder
except ModuleNotFoundError:
Word2VecEncoder = NotInstalled("Word2VecEncoder", "gensim")
GensimEncoder = NotInstalled("GensimEncoder", "gensim")

try:
from embetter.text._keras import KerasNLPEncoder
except ModuleNotFoundError:
KerasNLPEncoder = NotInstalled("KerasNLPEncoder", "keras_nlp")
Expand All @@ -35,6 +36,6 @@
"Sense2VecEncoder",
"BytePairEncoder",
"spaCyEncoder",
"Word2VecEncoder",
"GensimEncoder",
"KerasNLPEncoder",
]
2 changes: 1 addition & 1 deletion embetter/text/_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from embetter.base import EmbetterBase


class Word2VecEncoder(EmbetterBase):
class GensimEncoder(EmbetterBase):
"""
Encodes text using a static word embedding model. The component uses gensim's default tokenizer.
Expand Down
6 changes: 3 additions & 3 deletions tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from embetter.text import (
BytePairEncoder,
SentenceEncoder,
Word2VecEncoder,
GensimEncoder,
spaCyEncoder,
)
from embetter.utils import cached
Expand All @@ -30,15 +30,15 @@ def test_word2vec(setting):
model = Word2Vec(
sentences=sentences, vector_size=vector_size, window=3, min_count=1
)
encoder = Word2VecEncoder(model, agg=setting)
encoder = GensimEncoder(model, agg=setting)
output = encoder.fit_transform(test_sentences)
assert isinstance(output, np.ndarray)
out_dim = vector_size if setting != "both" else vector_size * 2
assert output.shape == (len(test_sentences), out_dim)
# This tests whether it can load the model from disk
with tempfile.NamedTemporaryFile() as fp:
model.save(fp)
encoder = Word2VecEncoder(fp.name, agg=setting)
encoder = GensimEncoder(fp.name, agg=setting)
encoder.transform(test_sentences)
assert repr(encoder)

Expand Down

0 comments on commit 3ecd8dc

Please sign in to comment.