Skip to content

Commit

Permalink
Merge pull request #228 from OpenFreeEnergy/npy_dtype_codec
Browse files Browse the repository at this point in the history
Add NPY dtype codec
  • Loading branch information
dwhswenson committed Oct 10, 2023
2 parents 1b7c832 + 4040056 commit f87e236
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 2 deletions.
21 changes: 21 additions & 0 deletions docs/guide/serialization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,27 @@ Details of how the object is recognized for encoding or how the dictionary
is recognized for decoding can be changed by passing functions to the
``is_my_obj`` or ``is_my_dict`` parameters of :class:`.JSONCodec`.

.. warning::
   Custom encoders & decoders only override the default JSON
   encoder/decoder when the default is not able to natively handle the
   object. This leads to some odd / lossy behaviour for some objects,
   such as ``np.float64``, which is natively converted to a ``float``
   type by the default encoder, whilst other numpy generic types are
   appropriately roundtripped.

On the use of NumPy generic types
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Due to their inconsistent behaviour in how they are handled by the default
JSON encoder/decoder routines (see warning above), it is our suggestion
that Python types should be used preferentially instead of NumPy generic
types. For example, if one were looking to store a single float value,
a ``float`` would be preferred to a ``np.float32`` or ``np.float64``.

Please note that this only applies to generic types being used **outside of
numpy arrays**. NumPy arrays are, as far as we know, always handled
in a consistent manner.

Dumping arbitrary objects to JSON
---------------------------------

Expand Down
30 changes: 30 additions & 0 deletions gufe/custom_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ def inherited_is_my_dict(dct, cls):
return cls in stored.mro()


def is_npy_dtype_dict(dct):
    """Recognize a dict produced by the NPY dtype (scalar) codec.

    Such dicts carry both a ``dtype`` and a ``bytes`` entry but no
    ``shape`` entry — the presence of ``shape`` marks an ndarray dict
    handled by the array codec instead.
    """
    has_scalar_keys = "dtype" in dct and "bytes" in dct
    return has_scalar_keys and "shape" not in dct


def is_openff_unit_dict(dct):
expected = ["pint_unit_registry", "unit_name", ":is_custom:"]
is_custom = all(exp in dct for exp in expected)
Expand All @@ -52,18 +58,39 @@ def is_openff_quantity_dict(dct):
from_dict=lambda dct: pathlib.PosixPath(dct["path"]),
)


def _bytes_to_dict(obj):
    # latin-1 maps every byte value 0-255 to a single character, so the
    # detour through a text string is lossless
    return {'latin-1': obj.decode('latin-1')}


def _bytes_from_dict(dct):
    return dct['latin-1'].encode('latin-1')


# Codec that stores raw ``bytes`` as a latin-1 string for JSON transport.
BYTES_CODEC = JSONCodec(
    cls=bytes,
    to_dict=_bytes_to_dict,
    from_dict=_bytes_from_dict,
)


def _datetime_to_dict(obj):
    return {'isotime': obj.isoformat()}


def _datetime_from_dict(dct):
    return datetime.datetime.fromisoformat(dct['isotime'])


# Codec that round-trips datetimes through their ISO-8601 string form.
DATETIME_CODEC = JSONCodec(
    cls=datetime.datetime,
    to_dict=_datetime_to_dict,
    from_dict=_datetime_from_dict,
)


# Note that this has inconsistent behaviour for some generic types
# which end up being handled by the default JSON encoder/decoder.
# The main example of this is np.float64 which will be turned into
# a float type on serialization.
def _npy_scalar_to_dict(obj):
    return {
        'dtype': str(obj.dtype),
        'bytes': obj.tobytes(),
    }


def _npy_scalar_from_dict(dct):
    # frombuffer yields a one-element array; index out the scalar
    arr = np.frombuffer(dct['bytes'], dtype=np.dtype(dct['dtype']))
    return arr[0]


# Codec for NumPy scalar (generic) types, stored as dtype + raw bytes.
NPY_DTYPE_CODEC = JSONCodec(
    cls=np.generic,
    to_dict=_npy_scalar_to_dict,
    from_dict=_npy_scalar_from_dict,
    is_my_obj=lambda obj: isinstance(obj, np.generic),
    is_my_dict=is_npy_dtype_dict,
)


NUMPY_CODEC = JSONCodec(
cls=np.ndarray,
to_dict=lambda obj: {
Expand All @@ -76,13 +103,15 @@ def is_openff_quantity_dict(dct):
).reshape(dct['shape'])
)


# Codec for settings objects: serializes every declared field of the
# model (presumably pydantic-style, given ``__fields__`` — verify against
# SettingsBaseModel).  Dict recognition is inheritance-aware via
# inherited_is_my_dict, so subclasses of SettingsBaseModel round-trip too.
SETTINGS_CODEC = JSONCodec(
    cls=SettingsBaseModel,
    to_dict=lambda obj: {field: getattr(obj, field) for field in obj.__fields__},
    from_dict=default_from_dict,
    is_my_dict=functools.partial(inherited_is_my_dict, cls=SettingsBaseModel),
)


OPENFF_QUANTITY_CODEC = JSONCodec(
cls=None,
to_dict=lambda obj: {
Expand All @@ -96,6 +125,7 @@ def is_openff_quantity_dict(dct):
is_my_dict=is_openff_quantity_dict,
)


OPENFF_UNIT_CODEC = JSONCodec(
cls=None,
to_dict=lambda unit: {
Expand Down
50 changes: 48 additions & 2 deletions gufe/tests/test_custom_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from gufe.custom_codecs import (
BYTES_CODEC,
NUMPY_CODEC,
NPY_DTYPE_CODEC,
OPENFF_QUANTITY_CODEC,
OPENFF_UNIT_CODEC,
PATH_CODEC,
Expand Down Expand Up @@ -46,6 +47,22 @@ def test_add_existing_codec(self):
assert len(serialization.codecs) == 1


@pytest.mark.parametrize('obj', [
    np.array([[1.0, 0.0], [2.0, 3.2]]),
    np.float32(1.1)
])
@pytest.mark.parametrize('codecs', [
    [BYTES_CODEC, NUMPY_CODEC, NPY_DTYPE_CODEC],
    [NPY_DTYPE_CODEC, BYTES_CODEC, NUMPY_CODEC],
])
def test_numpy_codec_order_roundtrip(obj, codecs):
    """Arrays and scalars must round-trip regardless of codec order."""
    serdes = JSONSerializerDeserializer(codecs)
    round_tripped = serdes.deserializer(serdes.serializer(obj))
    npt.assert_equal(obj, round_tripped)
    assert obj.dtype == round_tripped.dtype


class CustomJSONCodingTest:
"""Base class for testing codecs.
Expand Down Expand Up @@ -89,8 +106,9 @@ def test_not_mine(self):
class TestNumpyCoding(CustomJSONCodingTest):
def setup_method(self):
self.codec = NUMPY_CODEC
self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0])]
shapes = [[2, 2], [2,]]
self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0]),
np.array([1.0, 2.0, 3.0], dtype=np.float32)]
shapes = [[2, 2], [2,], [3,]]
dtypes = [str(arr.dtype) for arr in self.objs] # may change by system?
byte_reps = [arr.tobytes() for arr in self.objs]
self.dcts = [
Expand All @@ -117,10 +135,38 @@ def test_round_trip(self):
json_str = json.dumps(obj, cls=encoder)
reconstructed = json.loads(json_str, cls=decoder)
npt.assert_array_equal(reconstructed, obj)
assert reconstructed.dtype == obj.dtype
json_str_2 = json.dumps(obj, cls=encoder)
assert json_str == json_str_2


class TestNumpyGenericCodec(TestNumpyCoding):
    """Codec tests for NumPy scalar (generic) types via NPY_DTYPE_CODEC.

    Reuses the round-trip machinery of TestNumpyCoding; only the codec,
    the objects, and their expected dict forms differ.
    """

    def setup_method(self):
        self.codec = NPY_DTYPE_CODEC
        # Note that np.float64 is treated as a float by the
        # default json encode (and so returns a float not a numpy
        # object) — it is deliberately absent from this list.
        self.objs = [np.bool_(True), np.float16(1.0), np.float32(1.0),
                     np.complex128(1.0),
                     np.clongdouble(1.0), np.uint64(1)]
        dtypes = [str(a.dtype) for a in self.objs]
        byte_reps = [a.tobytes() for a in self.objs]
        # type(a).__name__ is the scalar class name, which can differ
        # from the dtype name (e.g. class "bool_" vs dtype "bool")
        classes = [type(a).__name__ for a in self.objs]
        self.dcts = [
            {
                ":is_custom:": True,
                "__class__": classname,
                "__module__": "numpy",
                "dtype": dtype,
                "bytes": byte_rep,
            }
            for dtype, byte_rep, classname in zip(dtypes, byte_reps, classes)
        ]


class TestPathCodec(CustomJSONCodingTest):
def setup_method(self):
self.codec = PATH_CODEC
Expand Down
2 changes: 2 additions & 0 deletions gufe/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from gufe.custom_codecs import (
BYTES_CODEC,
DATETIME_CODEC,
NPY_DTYPE_CODEC,
NUMPY_CODEC,
OPENFF_QUANTITY_CODEC,
OPENFF_UNIT_CODEC,
Expand All @@ -26,6 +27,7 @@
_default_json_codecs = [
PATH_CODEC,
NUMPY_CODEC,
NPY_DTYPE_CODEC,
BYTES_CODEC,
DATETIME_CODEC,
SETTINGS_CODEC,
Expand Down

0 comments on commit f87e236

Please sign in to comment.