Skip to content

Commit

Permalink
Merge pull request #228 from OpenFreeEnergy/npy_dtype_codec
Browse files Browse the repository at this point in the history
Add NPY dtype codec
  • Loading branch information
dwhswenson committed Oct 10, 2023
2 parents 1b7c832 + 4040056 commit f87e236
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 2 deletions.
21 changes: 21 additions & 0 deletions docs/guide/serialization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,27 @@ Details of how the object is recognized for encoding or how the dictionary
is recognized for decoding can be changed by passing functions to the
``is_my_obj`` or ``is_my_dict`` parameters of :class:`.JSONCodec`.

.. warning::
   Custom encoders & decoders only override the default JSON
   encoder/decoder when the default is not able to natively handle the
   object. This leads to some odd / lossy behaviour for some objects,
   such as ``np.float64``, which is natively converted to a ``float``
   type by the default encoder, whilst other numpy generic types are
   appropriately roundtripped.

On the use of NumPy generic types
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Due to their inconsistent behaviour in how they are handled by the default
JSON encoder/decoder routines (see warning above), it is our suggestion
that Python types should be used preferentially instead of NumPy generic
types. For example, if one were looking to store a single float value,
a ``float`` would be preferred to a ``np.float32`` or ``np.float64``.

Please note that this only applies to generic types being used **outside of
numpy arrays**. NumPy arrays are, as far as we know, always handled
in a consistent manner.

Dumping arbitrary objects to JSON
---------------------------------

Expand Down
30 changes: 30 additions & 0 deletions gufe/custom_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ def inherited_is_my_dict(dct, cls):
return cls in stored.mro()


def is_npy_dtype_dict(dct):
    """Recognize a dict produced by the NPY dtype (scalar) codec.

    Such dicts carry both a ``dtype`` and a ``bytes`` entry but no
    ``shape`` entry — the presence of ``shape`` marks an ndarray dict
    handled by the array codec instead.
    """
    has_scalar_keys = "dtype" in dct and "bytes" in dct
    return has_scalar_keys and "shape" not in dct


def is_openff_unit_dict(dct):
expected = ["pint_unit_registry", "unit_name", ":is_custom:"]
is_custom = all(exp in dct for exp in expected)
Expand All @@ -52,18 +58,39 @@ def is_openff_quantity_dict(dct):
from_dict=lambda dct: pathlib.PosixPath(dct["path"]),
)


def _bytes_to_dict(obj):
    # latin-1 maps every byte value 0-255 to a single character, so the
    # detour through a text string is lossless
    return {'latin-1': obj.decode('latin-1')}


def _bytes_from_dict(dct):
    return dct['latin-1'].encode('latin-1')


# Codec that stores raw ``bytes`` as a latin-1 string for JSON transport.
BYTES_CODEC = JSONCodec(
    cls=bytes,
    to_dict=_bytes_to_dict,
    from_dict=_bytes_from_dict,
)


def _datetime_to_dict(obj):
    return {'isotime': obj.isoformat()}


def _datetime_from_dict(dct):
    return datetime.datetime.fromisoformat(dct['isotime'])


# Codec that round-trips datetimes through their ISO-8601 string form.
DATETIME_CODEC = JSONCodec(
    cls=datetime.datetime,
    to_dict=_datetime_to_dict,
    from_dict=_datetime_from_dict,
)


# Note that this has inconsistent behaviour for some generic types
# which end up being handled by the default JSON encoder/decoder.
# The main example of this is np.float64 which will be turned into
# a float type on serialization.
def _npy_scalar_to_dict(obj):
    return {
        'dtype': str(obj.dtype),
        'bytes': obj.tobytes(),
    }


def _npy_scalar_from_dict(dct):
    # frombuffer yields a one-element array; index out the scalar
    arr = np.frombuffer(dct['bytes'], dtype=np.dtype(dct['dtype']))
    return arr[0]


# Codec for NumPy scalar (generic) types, stored as dtype + raw bytes.
NPY_DTYPE_CODEC = JSONCodec(
    cls=np.generic,
    to_dict=_npy_scalar_to_dict,
    from_dict=_npy_scalar_from_dict,
    is_my_obj=lambda obj: isinstance(obj, np.generic),
    is_my_dict=is_npy_dtype_dict,
)


NUMPY_CODEC = JSONCodec(
cls=np.ndarray,
to_dict=lambda obj: {
Expand All @@ -76,13 +103,15 @@ def is_openff_quantity_dict(dct):
).reshape(dct['shape'])
)


# Codec for settings objects: serializes every declared field of the
# model (presumably pydantic-style, given ``__fields__`` — verify against
# SettingsBaseModel).  Dict recognition is inheritance-aware via
# inherited_is_my_dict, so subclasses of SettingsBaseModel round-trip too.
SETTINGS_CODEC = JSONCodec(
    cls=SettingsBaseModel,
    to_dict=lambda obj: {field: getattr(obj, field) for field in obj.__fields__},
    from_dict=default_from_dict,
    is_my_dict=functools.partial(inherited_is_my_dict, cls=SettingsBaseModel),
)


OPENFF_QUANTITY_CODEC = JSONCodec(
cls=None,
to_dict=lambda obj: {
Expand All @@ -96,6 +125,7 @@ def is_openff_quantity_dict(dct):
is_my_dict=is_openff_quantity_dict,
)


OPENFF_UNIT_CODEC = JSONCodec(
cls=None,
to_dict=lambda unit: {
Expand Down
50 changes: 48 additions & 2 deletions gufe/tests/test_custom_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from gufe.custom_codecs import (
BYTES_CODEC,
NUMPY_CODEC,
NPY_DTYPE_CODEC,
OPENFF_QUANTITY_CODEC,
OPENFF_UNIT_CODEC,
PATH_CODEC,
Expand Down Expand Up @@ -46,6 +47,22 @@ def test_add_existing_codec(self):
assert len(serialization.codecs) == 1


@pytest.mark.parametrize('obj', [
    np.array([[1.0, 0.0], [2.0, 3.2]]),
    np.float32(1.1)
])
@pytest.mark.parametrize('codecs', [
    [BYTES_CODEC, NUMPY_CODEC, NPY_DTYPE_CODEC],
    [NPY_DTYPE_CODEC, BYTES_CODEC, NUMPY_CODEC],
])
def test_numpy_codec_order_roundtrip(obj, codecs):
    """Arrays and scalars must round-trip regardless of codec order."""
    serdes = JSONSerializerDeserializer(codecs)
    round_tripped = serdes.deserializer(serdes.serializer(obj))
    npt.assert_equal(obj, round_tripped)
    assert obj.dtype == round_tripped.dtype


class CustomJSONCodingTest:
"""Base class for testing codecs.
Expand Down Expand Up @@ -89,8 +106,9 @@ def test_not_mine(self):
class TestNumpyCoding(CustomJSONCodingTest):
def setup_method(self):
self.codec = NUMPY_CODEC
self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0])]
shapes = [[2, 2], [2,]]
self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0]),
np.array([1.0, 2.0, 3.0], dtype=np.float32)]
shapes = [[2, 2], [2,], [3,]]
dtypes = [str(arr.dtype) for arr in self.objs] # may change by system?
byte_reps = [arr.tobytes() for arr in self.objs]
self.dcts = [
Expand All @@ -117,10 +135,38 @@ def test_round_trip(self):
json_str = json.dumps(obj, cls=encoder)
reconstructed = json.loads(json_str, cls=decoder)
npt.assert_array_equal(reconstructed, obj)
assert reconstructed.dtype == obj.dtype
json_str_2 = json.dumps(obj, cls=encoder)
assert json_str == json_str_2


class TestNumpyGenericCodec(TestNumpyCoding):
    """Codec tests for NumPy scalar (generic) types via NPY_DTYPE_CODEC.

    Reuses the round-trip machinery of TestNumpyCoding; only the codec,
    the objects, and their expected dict forms differ.
    """

    def setup_method(self):
        self.codec = NPY_DTYPE_CODEC
        # Note that np.float64 is treated as a float by the
        # default json encode (and so returns a float not a numpy
        # object) — it is deliberately absent from this list.
        self.objs = [np.bool_(True), np.float16(1.0), np.float32(1.0),
                     np.complex128(1.0),
                     np.clongdouble(1.0), np.uint64(1)]
        dtypes = [str(a.dtype) for a in self.objs]
        byte_reps = [a.tobytes() for a in self.objs]
        # type(a).__name__ is the scalar class name, which can differ
        # from the dtype name (e.g. class "bool_" vs dtype "bool")
        classes = [type(a).__name__ for a in self.objs]
        self.dcts = [
            {
                ":is_custom:": True,
                "__class__": classname,
                "__module__": "numpy",
                "dtype": dtype,
                "bytes": byte_rep,
            }
            for dtype, byte_rep, classname in zip(dtypes, byte_reps, classes)
        ]


class TestPathCodec(CustomJSONCodingTest):
def setup_method(self):
self.codec = PATH_CODEC
Expand Down
2 changes: 2 additions & 0 deletions gufe/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from gufe.custom_codecs import (
BYTES_CODEC,
DATETIME_CODEC,
NPY_DTYPE_CODEC,
NUMPY_CODEC,
OPENFF_QUANTITY_CODEC,
OPENFF_UNIT_CODEC,
Expand All @@ -26,6 +27,7 @@
_default_json_codecs = [
PATH_CODEC,
NUMPY_CODEC,
NPY_DTYPE_CODEC,
BYTES_CODEC,
DATETIME_CODEC,
SETTINGS_CODEC,
Expand Down

0 comments on commit f87e236

Please sign in to comment.