Merge pull request #60 from koaning/cache

Add support for a cache

koaning committed Jun 17, 2023
2 parents a38af79 + 90833e6 commit a86901d
Showing 8 changed files with 142 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.11"]

    steps:
      - uses: actions/checkout@v2
6 changes: 4 additions & 2 deletions docs/API/grab.md
@@ -1,7 +1,9 @@
-# ColumnGrabber
+# Grabbers
+
+## ColumnGrabber

::: embetter.grab.ColumnGrabber

-# KeyGrabber
+## KeyGrabber

::: embetter.grab.KeyGrabber
5 changes: 5 additions & 0 deletions docs/API/utils.md
@@ -0,0 +1,5 @@
# Utils

## cached

::: embetter.utils.cached
45 changes: 42 additions & 3 deletions docs/applications.md
@@ -5,15 +5,54 @@ title: Applications
This document contains some tricks, hints and demos of applications that you might want to consider
in combination with this library.

## Cache

Calculating embeddings can be computationally expensive, and when you're using an external provider it can cost you credits too.
This is why this library offers an integration with [diskcache](https://grantjenks.com/docs/diskcache/).
That way, you can infer the embeddings once and store them to disk for later.

Here's an example of how you might run that.

```python
from embetter.text import SentenceEncoder
from embetter.utils import cached

encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2'))

examples = [f"this is a pretty long text, which is more expensive {i}" for i in range(10_000)]

# This might be a bit slow ~17.2s on our machine
encoder.transform(examples)

# This should be quicker ~4.71s on our machine
encoder.transform(examples)
```

Note that you're also able to fetch the precalculated embeddings directly via:

```python
from diskcache import Cache

# Make sure that you use the same name as in `cached`
cache = Cache("sentence-enc")
# Use a string as a key, if it's precalculated you'll get an array back.
cache["this is a pretty long text, which is more expensive 0"]
```
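
The same `Cache` object can also be used to inspect or reset what has been stored so far. The snippet below is a small sketch that assumes the `sentence-enc` folder from the example above.

```python
from diskcache import Cache

cache = Cache("sentence-enc")

# Number of texts that currently have a stored embedding.
print(len(cache))

# Remove every stored embedding; the next `.transform()` call will recompute them.
cache.clear()
```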

Be mindful of what goes into the encoder that you choose. It's preferable to give it
text as opposed to numpy arrays. Also note that the first time you run this
it will take more time due to the overhead of writing into the cache.
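
If your data lives in a dataframe, one way to make sure that plain strings end up as cache keys is to put a `ColumnGrabber` in front of the cached encoder. The snippet below is only a sketch and assumes a dataframe with a `text` column.

```python
import pandas as pd
from sklearn.pipeline import make_pipeline

from embetter.grab import ColumnGrabber
from embetter.text import SentenceEncoder
from embetter.utils import cached

# The ColumnGrabber turns the dataframe into a list of strings, so the cache
# is keyed on text rather than on numpy arrays.
pipe = make_pipeline(
    ColumnGrabber("text"),
    cached("sentence-enc", SentenceEncoder("all-MiniLM-L6-v2")),
)

df = pd.DataFrame({"text": ["hello there", "this is a pretty long text"]})
embeddings = pipe.fit_transform(df)
```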

## Speedup with Modal

Embedding text can be slow, especially when you're running on a CPU. If you wish
to speed up your embedding calculations you may enjoy using [modal](https://modal.com/).
Modal allows you to add a GPU to a Python function simply by adding a decorator.

-Not every encoder in embetter will get a speedup by using a GPU but the
-`SentenceEncoder` as well as `ClipEncoder` should both automatically detect
-when the GPU is available automatically.
+Not every encoder in embetter will get a speedup by using a GPU. But we've done some
+benchmarks and noticed that `SentenceEncoder` as well as `ClipEncoder` should both
+benefit. These components will also automatically detect when a GPU is available.

The code below gives an example.

67 changes: 67 additions & 0 deletions embetter/utils.py
@@ -0,0 +1,67 @@
import numpy as np
from typing import Callable
from diskcache import Cache
from sklearn.base import BaseEstimator


def cached(name: str, pipeline: BaseEstimator):
    """
    Uses a [diskcache](https://grantjenks.com/docs/diskcache/tutorial.html) in
    an attempt to fetch precalculated embeddings from disk instead of inferring them.
    This can save on compute, but also cloud credits, depending on the backend
    that you're using to generate embeddings.

    Be mindful of what goes into the encoder that you choose. It's preferable to give it
    text as opposed to numpy arrays. Also note that the first time that you'll run this
    it will take more time due to the overhead of writing into the cache.

    Arguments:
        name: the name of the local folder to represent the disk cache
        pipeline: the pipeline that you want to cache

    Usage:

    ```python
    from embetter.text import SentenceEncoder
    from embetter.utils import cached

    encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2'))

    examples = [f"this is a pretty long text, which is more expensive {i}" for i in range(10_000)]

    # This might be a bit slow ~17.2s on our machine
    encoder.transform(examples)

    # This should be quicker ~4.71s on our machine
    encoder.transform(examples)
    ```

    Note that you're also able to fetch the precalculated embeddings directly via:

    ```python
    from diskcache import Cache

    # Make sure that you use the same name as in `cached`
    cache = Cache("sentence-enc")
    # Use a string as a key, if it's precalculated you'll get an array back.
    cache["this is a pretty long text, which is more expensive 0"]
    ```
    """
    cache = Cache(name)

    def run_cached(method: Callable):
        def wrapped(X, y=None):
            # Grab whatever is already in the cache and mark the rest as "TODO".
            results = {i: cache[x] if x in cache else "TODO" for i, x in enumerate(X)}
            text_todo = [X[i] for i, x in results.items() if x == "TODO"]
            i_todo = [i for i, x in results.items() if x == "TODO"]
            # Only run the wrapped transform on the items that were not cached yet.
            out = method(text_todo)
            # Store the freshly calculated embeddings so the next call can reuse them.
            with Cache(cache.directory) as reference:
                for i, text, x_tfm in zip(i_todo, text_todo, out):
                    results[i] = x_tfm
                    reference.set(text, x_tfm)
            return np.array([arr for i, arr in results.items()])

        return wrapped

    pipeline.transform = run_cached(pipeline.transform)

    return pipeline
5 changes: 5 additions & 0 deletions mkdocs.yml
@@ -5,9 +5,14 @@ plugins:
      custom_templates: templates
theme:
  name: material
  font:
    text: Inter
    code: Jetbrains Mono
  logo: images/icon.png
  palette:
    primary: white
  features:
    - toc.integrate
markdown_extensions:
  - pymdownx.highlight:
      use_pygments: true
4 changes: 2 additions & 2 deletions setup.py
@@ -2,7 +2,7 @@
from setuptools import setup, find_packages


-base_packages = ["scikit-learn>=1.0.0", "pandas>=1.0.0"]
+base_packages = ["scikit-learn>=1.0.0", "pandas>=1.0.0", "diskcache>=5.6.1"]

sentence_encoder_pkgs = ["sentence-transformers>=2.2.2"]
sense2vec_pkgs = ["sense2vec==2.0.0"]
@@ -42,7 +42,7 @@

setup(
    name="embetter",
-    version="0.3.8",
+    version="0.4.0",
    author="Vincent D. Warmerdam",
    packages=find_packages(exclude=["notebooks", "docs"]),
    description="Just a bunch of useful embeddings to get started quickly.",
16 changes: 16 additions & 0 deletions tests/test_text.py
@@ -5,6 +5,7 @@
from spacy.language import Language

from embetter.text import SentenceEncoder, BytePairEncoder, spaCyEncoder
from embetter.utils import cached


test_sentences = [
@@ -67,3 +68,18 @@ def test_basic_spacy(setting, nlp):
    # scikit-learn configures repr dynamically from defined attributes.
    # To test correct implementation we should test if calling repr breaks.
    assert repr(encoder)


def test_basic_spacy_cached(nlp, tmpdir):
    """Just an e2e test for the cache."""
    encoder = spaCyEncoder(nlp)
    output_before = encoder.transform(test_sentences)

    # Now we cache it
    encoder = cached(tmpdir, encoder)
    output_during = encoder.transform(test_sentences)

    encoder = cached(tmpdir, encoder)
    output_after = encoder.transform(test_sentences)
    assert (output_before == output_during).all()
    assert (output_during == output_after).all()
