Merge pull request #46 from koaning/spacy

Make a spaCy component
koaning · Feb 7, 2023 · dddf41d · dddf41d
2 parents e8c6858 + 437b172
commit dddf41d
Show file tree

Hide file tree

Showing 11 changed files with 171 additions and 20 deletions.
diff --git a/Makefile b/Makefile
@@ -11,7 +11,6 @@ test:
 
 install:
 	python -m pip install -e ".[dev]"
-	pre-commit install
 
 interrogate:
 	interrogate -vv --ignore-nested-functions --ignore-semiprivate --ignore-private --ignore-magic --ignore-module --ignore-init-method --fail-under 100 tests

diff --git a/README.md b/README.md
@@ -21,8 +21,9 @@ want to nit-pick to download only the tools that you need:
 
 ```
 python -m pip install "embetter[text]"
-python -m pip install "embetter[sense2vec]"
 python -m pip install "embetter[sentence-tfm]"
+python -m pip install "embetter[spacy]"
+python -m pip install "embetter[sense2vec]"
 python -m pip install "embetter[bpemb]"
 python -m pip install "embetter[vision]"
 python -m pip install "embetter[all]"
@@ -132,6 +133,7 @@ pipelines as well.
 | `ColumnGrabber`           | [docs](https://koaning.github.io/embetter/API/grab/) | `dataframe` → `ColumnGrabber` → `list with column contents`  |
 | `SentenceEncoder`         | [docs](https://koaning.github.io/embetter/API/text/sentence-enc/) | `list of text` → `SentenceEncoder` → `embedding array`  |
 | `Sense2VecEncoder`        | [docs](https://koaning.github.io/embetter/API/text/sense2vec/)    | `list of text` → `Sense2VecEncoder` → `embedding array` |
+| `spaCyEncoder`        | [docs](https://koaning.github.io/embetter/API/text/spacy/)    | `list of text` → `spaCyEncoder` → `embedding array` |
 | `BytePairEncoder`         | [docs](https://koaning.github.io/embetter/API/text/bytepair/)    | `list of text` → `BytePairEncoder` → `embedding array` |
 | `ImageLoader`             | [docs](https://koaning.github.io/embetter/API/vision/imageload/) | `list of paths` → `ImageLoader` → `list of PIL images` |
 | `ColorHistogramEncoder`   | [docs](https://koaning.github.io/embetter/API/vision/colorhist/) | `list of PIL images` → `ColorHistogramEncoder` → `embedding array`           |

diff --git a/docs/API/text/spacy.md b/docs/API/text/spacy.md
@@ -0,0 +1,5 @@
+# spaCyEncoder
+
+## `embetter.text.spaCyEncoder`
+
+::: embetter.text.spaCyEncoder
diff --git a/docs/index.md b/docs/index.md
@@ -21,8 +21,9 @@ want to nit-pick to download only the tools that you need:
 
 ```
 python -m pip install "embetter[text]"
-python -m pip install "embetter[sense2vec]"
 python -m pip install "embetter[sentence-tfm]"
+python -m pip install "embetter[spacy]"
+python -m pip install "embetter[sense2vec]"
 python -m pip install "embetter[bpemb]"
 python -m pip install "embetter[vision]"
 python -m pip install "embetter[all]"
@@ -127,12 +128,12 @@ The goal of the library is remain small but to offer a few general tools
 that might help with bulk labelling in particular, but general scikit-learn
 pipelines as well.
 
-
 |       class               | link                                                 | What it does                                                                                          |
 |:-------------------------:|------------------------------------------------------|--------------------------------------------------------------|
 | `ColumnGrabber`           | [docs](https://koaning.github.io/embetter/API/grab/) | `dataframe` → `ColumnGrabber` → `list with column contents`  |
 | `SentenceEncoder`         | [docs](https://koaning.github.io/embetter/API/text/sentence-enc/) | `list of text` → `SentenceEncoder` → `embedding array`  |
 | `Sense2VecEncoder`        | [docs](https://koaning.github.io/embetter/API/text/sense2vec/)    | `list of text` → `Sense2VecEncoder` → `embedding array` |
+| `spaCyEncoder`            | [docs](https://koaning.github.io/embetter/API/text/spacy/)    | `list of text` → `spaCyEncoder` → `embedding array` |
 | `BytePairEncoder`         | [docs](https://koaning.github.io/embetter/API/text/bytepair/)    | `list of text` → `BytePairEncoder` → `embedding array` |
 | `ImageLoader`             | [docs](https://koaning.github.io/embetter/API/vision/imageload/) | `list of paths` → `ImageLoader` → `list of PIL images` |
 | `ColorHistogramEncoder`   | [docs](https://koaning.github.io/embetter/API/vision/colorhist/) | `list of PIL images` → `ColorHistogramEncoder` → `embedding array`           |

diff --git a/embetter/text/__init__.py b/embetter/text/__init__.py
@@ -15,5 +15,10 @@
 except ModuleNotFoundError:
     Sense2VecEncoder = NotInstalled("BytePairEncoder", "bpemb")
 
+try:
+    from embetter.text._spacy import spaCyEncoder
+except ModuleNotFoundError:
+    spaCyEncoder = NotInstalled("spaCyEncoder", "spacy")
+
 
-__all__ = ["SentenceEncoder", "Sense2VecEncoder", "BytePairEncoder"]
+__all__ = ["SentenceEncoder", "Sense2VecEncoder", "BytePairEncoder", "spaCyEncoder"]
diff --git a/embetter/text/_bpemb.py b/embetter/text/_bpemb.py
@@ -43,7 +43,7 @@ class BytePairEncoder(EmbetterBase):
     })
 
     # This pipeline grabs the `text` column from a dataframe
-    # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
+    # which then gets fed into a small English model.
     text_emb_pipeline = make_pipeline(
         ColumnGrabber("text"),
         BytePairEncoder(lang="en")

diff --git a/embetter/text/_s2v.py b/embetter/text/_s2v.py
@@ -11,6 +11,31 @@ class Sense2VecEncoder(BaseEstimator):
 
     Arguments:
         path: path to downloaded model
+
+    **Usage**
+
+    ```python
+    import pandas as pd
+    from sklearn.pipeline import make_pipeline
+    from sklearn.linear_model import LogisticRegression
+
+    from embetter.grab import ColumnGrabber
+    from embetter.text import Sense2VecEncoder
+
+    # Let's suppose this is the input dataframe
+    dataf = pd.DataFrame({
+        "text": ["positive sentiment", "super negative"],
+        "label_col": ["pos", "neg"]
+    })
+
+    # This pipeline grabs the `text` column from a dataframe
+    # which is then passed to the sense2vec model.
+    text_emb_pipeline = make_pipeline(
+        ColumnGrabber("text"),
+        Sense2VecEncoder("path/to/s2v")
+    )
+    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
+    ```
     """
 
     def __init__(self, path: str):

diff --git a/embetter/text/_spacy.py b/embetter/text/_spacy.py
@@ -0,0 +1,82 @@
+import numpy as np
+from typing import Union
+
+import spacy
+from spacy.language import Language
+
+from embetter.base import EmbetterBase
+
+
+class spaCyEncoder(EmbetterBase):
+    """
+    Encoder that turns text into embeddings by running it through a spaCy pipeline.
+
+    Arguments:
+        nlp: name of a spaCy pipeline to load, or an already loaded spaCy `Language` object
+        agg: aggregation per document, one of `"base"`, `"mean"`, `"max"` or `"both"`
+
+    With `agg="base"` the `.vector` property of every spaCy `Doc` is returned as is.
+    The `"mean"` and `"max"` settings reduce over the vectors of the individual
+    tokens instead, and `"both"` concatenates the mean and the max reduction.
+
+    **Usage**
+
+    ```python
+    import pandas as pd
+    from sklearn.pipeline import make_pipeline
+    from sklearn.linear_model import LogisticRegression
+
+    from embetter.grab import ColumnGrabber
+    from embetter.text import spaCyEncoder
+
+    # Let's suppose this is the input dataframe
+    dataf = pd.DataFrame({
+        "text": ["positive sentiment", "super negative"],
+        "label_col": ["pos", "neg"]
+    })
+
+    # This pipeline grabs the `text` column from a dataframe
+    # which is then passed to the medium spaCy model.
+    text_emb_pipeline = make_pipeline(
+        ColumnGrabber("text"),
+        spaCyEncoder("en_core_web_md")
+    )
+    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
+
+    # This pipeline can also be trained to make predictions, using
+    # the embedded features.
+    text_clf_pipeline = make_pipeline(
+        text_emb_pipeline,
+        LogisticRegression()
+    )
+
+    # Prediction example
+    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
+    ```
+    """
+
+    def __init__(self, nlp: Union[str, Language], agg: str = "base"):
+        if isinstance(nlp, str):
+            # `spacy.load` switches components off via its `disable` keyword.
+            self.nlp = spacy.load(nlp, disable=["ner", "tagger", "parser"])
+        elif isinstance(nlp, Language):
+            self.nlp = nlp
+        else:
+            raise ValueError("`nlp` must be `str` or spaCy-language object.")
+        self.agg = agg
+
+    def fit(self, X, y=None):
+        """No-op. Merely checks for object inputs per sklearn standard."""
+        # Scikit-learn also expects this in the `.fit()` command.
+        self._check_inputs(X)
+        return self
+
+    def _check_inputs(self, X):
+        """Raise a `ValueError` when `agg` is not one of the supported settings."""
+        options = ["mean", "max", "both", "base"]
+        if self.agg not in options:
+            raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")
+
+    @staticmethod
+    def _token_matrix(doc):
+        """Stack the token vectors of `doc` into a 2D array, one row per token."""
+        if len(doc) == 0:
+            # There is nothing to reduce over in a document without tokens, and
+            # `max` would raise on the empty array. Use `doc.vector` as the single
+            # row so the result matches what the "base" setting gives for this text.
+            return np.atleast_2d(doc.vector)
+        return np.array([tok.vector for tok in doc])
+
+    def transform(self, X, y=None):
+        """Transforms the phrase text into a numeric representation."""
+        self._check_inputs(X)
+        docs = self.nlp.pipe(X)
+        if self.agg == "base":
+            return np.array([doc.vector for doc in docs])
+        token_vectors = [self._token_matrix(doc) for doc in docs]
+        if self.agg == "mean":
+            return np.array([v.mean(axis=0) for v in token_vectors])
+        if self.agg == "max":
+            return np.array([v.max(axis=0) for v in token_vectors])
+        # Only "both" is left at this point: `_check_inputs` rejects anything else.
+        mean_arr = np.array([v.mean(axis=0) for v in token_vectors])
+        max_arr = np.array([v.max(axis=0) for v in token_vectors])
+        return np.concatenate([mean_arr, max_arr], axis=1)
diff --git a/setup.py b/setup.py
@@ -7,6 +7,7 @@
 sentence_encoder_pkgs = ["sentence-transformers>=2.2.2"]
 sense2vec_pkgs = ["sense2vec==2.0.0"]
 bpemb_packages = ["bpemb>=0.3.3"]
+spacy_packages = ["spacy>=3.5.0"]
 
 text_packages = sentence_encoder_pkgs + sense2vec_pkgs + bpemb_packages
 
@@ -58,6 +59,7 @@
     extras_require={
         "sense2vec": sense2vec_pkgs + base_packages,
         "sentence-tfm": sentence_encoder_pkgs + base_packages,
+        "spacy": spacy_packages + base_packages,
         "bpemb": bpemb_packages + base_packages,
         "text": text_packages + base_packages,
         "vision": vision_packages + base_packages,

diff --git a/tests/test_docs.py b/tests/test_docs.py
@@ -1,7 +1,7 @@
 import pytest
 from mktestdocs import check_md_file, check_docstring
 from embetter.vision import ColorHistogramEncoder, TimmEncoder, ImageLoader
-from embetter.text import Sense2VecEncoder, SentenceEncoder, BytePairEncoder
+from embetter.text import SentenceEncoder, BytePairEncoder
 from embetter.grab import ColumnGrabber
 
 
@@ -15,10 +15,11 @@ def test_finetune_docs():
     check_md_file(fpath="docs/finetuners.md", memory=True)
 
 
+# I'm not testing spaCy or sense2vec because those docs would require
+# us to download `en_core_web_md` on every CI run, which is too heavy.
 objects = [
     ColumnGrabber,
     SentenceEncoder,
-    Sense2VecEncoder,
     ColorHistogramEncoder,
     TimmEncoder,
     ImageLoader,

diff --git a/tests/test_text.py b/tests/test_text.py
@@ -1,19 +1,24 @@
 import pytest
 import numpy as np
 
-from embetter.text import SentenceEncoder, BytePairEncoder
+from spacy.vocab import Vocab
+from spacy.language import Language
+
+from embetter.text import SentenceEncoder, BytePairEncoder, spaCyEncoder
+
+
+test_sentences = [
+    "This is a test sentence!",
+    "And this is another one",
+    "\rUnicode stuff: ♣️,♦️,❤️,♠️\n",
+]
 
 
 def test_basic_sentence_encoder():
     """Check correct dimensions and repr for SentenceEncoder."""
     encoder = SentenceEncoder()
     # Embedding dim of underlying model
     output_dim = encoder.tfm._modules["1"].word_embedding_dimension
-    test_sentences = [
-        "This is a test sentence!",
-        "And this is another one",
-        "\rUnicode stuff: ♣️,♦️,❤️,♠️\n",
-    ]
     output = encoder.fit_transform(test_sentences)
     assert isinstance(output, np.ndarray)
     assert output.shape == (len(test_sentences), output_dim)
@@ -24,17 +29,41 @@ def test_basic_sentence_encoder():
 
 @pytest.mark.parametrize("setting", ["max", "mean", "both"])
 def test_basic_bpemb(setting):
-    """Check correct dimensions and repr for SentenceEncoder."""
+    """Check correct dimensions and repr for BytePairEncoder."""
     encoder = BytePairEncoder(lang="en", dim=50, agg=setting)
     # Embedding dim of underlying model
-    test_sentences = [
-        "This is a test sentence!",
-        "And this is another one",
-        "\rUnicode stuff: ♣️,♦️,❤️,♠️\n",
-    ]
     output = encoder.fit_transform(test_sentences)
     assert isinstance(output, np.ndarray)
     assert output.shape == (len(test_sentences), 100 if setting == "both" else 50)
     # scikit-learn configures repr dynamically from defined attributes.
     # To test correct implementation we should test if calling repr breaks.
     assert repr(encoder)
+
+
+@pytest.fixture()
+def nlp():
+    """A lightweight spaCy `Language` whose vocab holds four 2-dimensional word vectors."""
+    # Tiny hand-made vector table, so no pretrained model has to be downloaded.
+    word_vectors = {
+        "red": np.array([1.0, 0.0]),
+        "green": np.array([0.5, 0.5]),
+        "blue": np.array([0.0, 1.0]),
+        "purple": np.array([0.0, 1.0]),
+    }
+
+    vocab = Vocab(strings=list(word_vectors))
+    for word in word_vectors:
+        vocab.set_vector(word, word_vectors[word])
+    return Language(vocab=vocab)
+
+
+@pytest.mark.parametrize("setting", ["base", "max", "mean", "both"])
+def test_basic_spacy(setting, nlp):
+    """Check correct dimensions and repr for spaCyEncoder."""
+    encoder = spaCyEncoder(nlp, agg=setting)
+    output = encoder.fit_transform(test_sentences)
+    assert isinstance(output, np.ndarray)
+    # The `nlp` fixture holds 2-dimensional vectors; "both" concatenates the
+    # mean and max aggregates, which doubles the width.
+    assert output.shape == (len(test_sentences), 4 if setting == "both" else 2)
+    # scikit-learn configures repr dynamically from defined attributes.
+    # To test correct implementation we should test if calling repr breaks.
+    assert repr(encoder)