From a0e25efc9e40d71e61b404b4f48ed14b379b5f6b Mon Sep 17 00:00:00 2001 From: IAlibay Date: Mon, 11 Sep 2023 09:28:13 +0100 Subject: [PATCH 1/9] Start adding npy dtype codec --- gufe/custom_codecs.py | 23 +++++++++++++++++++++++ gufe/tests/test_custom_json.py | 20 ++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/gufe/custom_codecs.py b/gufe/custom_codecs.py index 87e3828e..33c99634 100644 --- a/gufe/custom_codecs.py +++ b/gufe/custom_codecs.py @@ -33,6 +33,10 @@ def inherited_is_my_dict(dct, cls): stored = gufe.tokenization.get_class(module, classname) return cls in stored.mro() +def is_npy_dtype_dict(dct): + expected = ["dtype", "bytes"] + is_custom = all(exp in dct for exp in expected) + return is_custom and ("shape" not in dct) def is_openff_unit_dict(dct): expected = ["pint_unit_registry", "unit_name", ":is_custom:"] @@ -52,18 +56,34 @@ def is_openff_quantity_dict(dct): from_dict=lambda dct: pathlib.PosixPath(dct["path"]), ) + BYTES_CODEC = JSONCodec( cls=bytes, to_dict=lambda obj: {'latin-1': obj.decode('latin-1')}, from_dict=lambda dct: dct['latin-1'].encode('latin-1'), ) + DATETIME_CODEC = JSONCodec( cls=datetime.datetime, to_dict=lambda obj: {'isotime': obj.isoformat()}, from_dict=lambda dct: datetime.datetime.fromisoformat(dct['isotime']), ) + +NPY_DTYPE_CODEC = JSONCodec( + cls=np.generic, + to_dict=lambda obj: { + 'dtype': str(obj.dtype), + 'bytes': obj.tobytes(), + }, + from_dict=lambda dct: np.frombuffer( + dct['bytes'], dtype=np.dtype(dct['dtype']) + )[0], + is_my_dict=is_npy_dtype_dict, +) + + NUMPY_CODEC = JSONCodec( cls=np.ndarray, to_dict=lambda obj: { @@ -76,6 +96,7 @@ def is_openff_quantity_dict(dct): ).reshape(dct['shape']) ) + SETTINGS_CODEC = JSONCodec( cls=SettingsBaseModel, to_dict=lambda obj: {field: getattr(obj, field) for field in obj.__fields__}, @@ -83,6 +104,7 @@ def is_openff_quantity_dict(dct): is_my_dict=functools.partial(inherited_is_my_dict, cls=SettingsBaseModel), ) + OPENFF_QUANTITY_CODEC = JSONCodec( 
cls=None, to_dict=lambda obj: { @@ -98,6 +120,7 @@ def is_openff_quantity_dict(dct): is_my_dict=is_openff_quantity_dict, ) + OPENFF_UNIT_CODEC = JSONCodec( cls=None, to_dict=lambda unit: { diff --git a/gufe/tests/test_custom_json.py b/gufe/tests/test_custom_json.py index 1efcb2f5..2d4b92f4 100644 --- a/gufe/tests/test_custom_json.py +++ b/gufe/tests/test_custom_json.py @@ -14,6 +14,7 @@ from gufe.custom_codecs import ( BYTES_CODEC, NUMPY_CODEC, + NPY_DTYPE_CODEC, OPENFF_QUANTITY_CODEC, OPENFF_UNIT_CODEC, PATH_CODEC, @@ -115,9 +116,28 @@ def test_round_trip(self): json_str = json.dumps(obj, cls=encoder) reconstructed = json.loads(json_str, cls=decoder) npt.assert_array_equal(reconstructed, obj) + assert reconstructed.dtype == obj.dtype json_str_2 = json.dumps(obj, cls=encoder) assert json_str == json_str_2 +class TestNumpyGenericCodec(TestNumpyCoding): + def setup_method(self): + self.codec = NPY_DTYPE_CODEC + self.objs = [np.float16(1.0), np.float32(1.0), np.float64(1.0), + np.complex128(1.0), np.clongdouble(1.0), np.uint64(1),] + dtypes = [str(a.dtype) for a in self.objs] + byte_reps = [a.tobytes() for a in self.objs] + classes = [str(a.dtype) for a in self.objs] + self.dcts = [ + { + ":is_custom:": True, + "__class__": classname, + "__module__": "numpy", + "dtype": dtype, + "bytes": byte_rep, + } + for dtype, byte_rep, classname in zip(dtypes, byte_reps, classes) + ] class TestPathCodec(CustomJSONCodingTest): def setup_method(self): From 4ea0388a36272cc7943c00fd4469a87c130b901a Mon Sep 17 00:00:00 2001 From: IAlibay Date: Mon, 11 Sep 2023 10:49:59 +0100 Subject: [PATCH 2/9] fixing the bool problem --- gufe/custom_codecs.py | 3 +++ gufe/tests/test_custom_json.py | 29 ++++++++++++++++++++++++----- gufe/tokenization.py | 2 ++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/gufe/custom_codecs.py b/gufe/custom_codecs.py index 33c99634..dd09a043 100644 --- a/gufe/custom_codecs.py +++ b/gufe/custom_codecs.py @@ -33,11 +33,13 @@ def 
inherited_is_my_dict(dct, cls): stored = gufe.tokenization.get_class(module, classname) return cls in stored.mro() + def is_npy_dtype_dict(dct): expected = ["dtype", "bytes"] is_custom = all(exp in dct for exp in expected) return is_custom and ("shape" not in dct) + def is_openff_unit_dict(dct): expected = ["pint_unit_registry", "unit_name", ":is_custom:"] is_custom = all(exp in dct for exp in expected) @@ -80,6 +82,7 @@ def is_openff_quantity_dict(dct): from_dict=lambda dct: np.frombuffer( dct['bytes'], dtype=np.dtype(dct['dtype']) )[0], + is_my_obj=lambda obj: isinstance(obj, np.generic), is_my_dict=is_npy_dtype_dict, ) diff --git a/gufe/tests/test_custom_json.py b/gufe/tests/test_custom_json.py index 2d4b92f4..9c52deb4 100644 --- a/gufe/tests/test_custom_json.py +++ b/gufe/tests/test_custom_json.py @@ -88,8 +88,9 @@ def test_not_mine(self): class TestNumpyCoding(CustomJSONCodingTest): def setup_method(self): self.codec = NUMPY_CODEC - self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0])] - shapes = [[2, 2], [2,]] + self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0]), + np.array([1.0, 2.0, 3.0], dtype=np.float32),] + shapes = [[2, 2], [2,], [3,]] dtypes = [str(arr.dtype) for arr in self.objs] # may change by system? 
byte_reps = [arr.tobytes() for arr in self.objs] self.dcts = [ @@ -123,11 +124,15 @@ def test_round_trip(self): class TestNumpyGenericCodec(TestNumpyCoding): def setup_method(self): self.codec = NPY_DTYPE_CODEC - self.objs = [np.float16(1.0), np.float32(1.0), np.float64(1.0), - np.complex128(1.0), np.clongdouble(1.0), np.uint64(1),] + self.objs = [np.bool_(True), np.float16(1.0), np.float32(1.0), + np.float64(1.0), np.complex128(1.0), + np.clongdouble(1.0), np.uint64(1),] dtypes = [str(a.dtype) for a in self.objs] byte_reps = [a.tobytes() for a in self.objs] - classes = [str(a.dtype) for a in self.objs] + # Overly complicated extraction of the class name + # to deal with the bool_ -> bool dtype class name problem + classes = [str(a.__class__).split("'")[1].split('.')[1] + for a in self.objs] self.dcts = [ { ":is_custom:": True, @@ -139,6 +144,20 @@ def setup_method(self): for dtype, byte_rep, classname in zip(dtypes, byte_reps, classes) ] + def test_round_trip(self): + encoder, decoder = custom_json_factory([self.codec, BYTES_CODEC]) + for (obj, dct) in zip(self.objs, self.dcts): + print(dct) + print(encoder) + json_str = json.dumps(obj, cls=encoder) + print(json_str) + reconstructed = json.loads(json_str, cls=decoder) + print(type(reconstructed)) + npt.assert_array_equal(reconstructed, obj) + assert reconstructed.dtype == obj.dtype + json_str_2 = json.dumps(obj, cls=encoder) + assert json_str == json_str_2 + class TestPathCodec(CustomJSONCodingTest): def setup_method(self): self.codec = PATH_CODEC diff --git a/gufe/tokenization.py b/gufe/tokenization.py index 25364891..66e8accd 100644 --- a/gufe/tokenization.py +++ b/gufe/tokenization.py @@ -15,6 +15,7 @@ from gufe.custom_codecs import ( BYTES_CODEC, DATETIME_CODEC, + NPY_DTYPE_CODEC, NUMPY_CODEC, OPENFF_QUANTITY_CODEC, OPENFF_UNIT_CODEC, @@ -26,6 +27,7 @@ _default_json_codecs = [ PATH_CODEC, NUMPY_CODEC, + NPY_DTYPE_CODEC, BYTES_CODEC, DATETIME_CODEC, SETTINGS_CODEC, From 1796f2ccc661d50e7d627f84ca5a3e10432a484b 
Mon Sep 17 00:00:00 2001 From: IAlibay Date: Mon, 11 Sep 2023 10:51:30 +0100 Subject: [PATCH 3/9] pep8 --- gufe/tests/test_custom_json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gufe/tests/test_custom_json.py b/gufe/tests/test_custom_json.py index 9c52deb4..18961c42 100644 --- a/gufe/tests/test_custom_json.py +++ b/gufe/tests/test_custom_json.py @@ -121,6 +121,7 @@ def test_round_trip(self): json_str_2 = json.dumps(obj, cls=encoder) assert json_str == json_str_2 + class TestNumpyGenericCodec(TestNumpyCoding): def setup_method(self): self.codec = NPY_DTYPE_CODEC @@ -158,6 +159,7 @@ def test_round_trip(self): json_str_2 = json.dumps(obj, cls=encoder) assert json_str == json_str_2 + class TestPathCodec(CustomJSONCodingTest): def setup_method(self): self.codec = PATH_CODEC From de84d46036a8e490ea483f0ea571c68d02e3eea3 Mon Sep 17 00:00:00 2001 From: IAlibay Date: Mon, 11 Sep 2023 11:20:08 +0100 Subject: [PATCH 4/9] cleanup ahead of review --- gufe/tests/test_custom_json.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/gufe/tests/test_custom_json.py b/gufe/tests/test_custom_json.py index 18961c42..f63a2e53 100644 --- a/gufe/tests/test_custom_json.py +++ b/gufe/tests/test_custom_json.py @@ -145,20 +145,6 @@ def setup_method(self): for dtype, byte_rep, classname in zip(dtypes, byte_reps, classes) ] - def test_round_trip(self): - encoder, decoder = custom_json_factory([self.codec, BYTES_CODEC]) - for (obj, dct) in zip(self.objs, self.dcts): - print(dct) - print(encoder) - json_str = json.dumps(obj, cls=encoder) - print(json_str) - reconstructed = json.loads(json_str, cls=decoder) - print(type(reconstructed)) - npt.assert_array_equal(reconstructed, obj) - assert reconstructed.dtype == obj.dtype - json_str_2 = json.dumps(obj, cls=encoder) - assert json_str == json_str_2 - class TestPathCodec(CustomJSONCodingTest): def setup_method(self): From 50040d780c2444406b7efa673f043bf851a13919 Mon Sep 17 00:00:00 2001 From: IAlibay Date: Tue, 19 Sep 2023 
08:55:56 +0100 Subject: [PATCH 5/9] switch to cls=None --- gufe/custom_codecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gufe/custom_codecs.py b/gufe/custom_codecs.py index dd09a043..ff5f7901 100644 --- a/gufe/custom_codecs.py +++ b/gufe/custom_codecs.py @@ -74,7 +74,7 @@ def is_openff_quantity_dict(dct): NPY_DTYPE_CODEC = JSONCodec( - cls=np.generic, + cls=None, to_dict=lambda obj: { 'dtype': str(obj.dtype), 'bytes': obj.tobytes(), From b509d86f22ac453991b8e55327b35c0abd8279cb Mon Sep 17 00:00:00 2001 From: IAlibay Date: Tue, 19 Sep 2023 09:13:13 +0100 Subject: [PATCH 6/9] Add codec roundtrip test --- gufe/tests/test_custom_json.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/gufe/tests/test_custom_json.py b/gufe/tests/test_custom_json.py index f63a2e53..f82a54c3 100644 --- a/gufe/tests/test_custom_json.py +++ b/gufe/tests/test_custom_json.py @@ -45,6 +45,22 @@ def test_add_existing_codec(self): assert len(serialization.codecs) == 1 +@pytest.mark.parametrize('obj', + [np.array([[1.0, 0.0], [2.0, 3.2]]), + np.float32(1.1)], +) +@pytest.mark.parametrize('codecs', [ + [BYTES_CODEC, NUMPY_CODEC, NPY_DTYPE_CODEC], + [NPY_DTYPE_CODEC, BYTES_CODEC, NUMPY_CODEC], +]) +def test_numpy_codec_order_roundtrip(obj, codecs): + serialization = JSONSerializerDeserializer(codecs) + serialized = serialization.serializer(obj) + reconstructed = serialization.deserializer(serialized) + npt.assert_equal(obj, reconstructed) + assert obj.dtype == reconstructed.dtype + + class CustomJSONCodingTest: """Base class for testing codecs. @@ -125,8 +141,11 @@ def test_round_trip(self): class TestNumpyGenericCodec(TestNumpyCoding): def setup_method(self): self.codec = NPY_DTYPE_CODEC + # Note that np.float64 is treated as a float by the + # default json encode (and so returns a float not a numpy + # object). 
self.objs = [np.bool_(True), np.float16(1.0), np.float32(1.0), - np.float64(1.0), np.complex128(1.0), + np.complex128(1.0), np.clongdouble(1.0), np.uint64(1),] dtypes = [str(a.dtype) for a in self.objs] byte_reps = [a.tobytes() for a in self.objs] From eb98c1dfbcb92b407b96cd2d8b33a53b612fecfa Mon Sep 17 00:00:00 2001 From: IAlibay Date: Tue, 19 Sep 2023 09:16:18 +0100 Subject: [PATCH 7/9] Go back to using np.generic --- gufe/custom_codecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gufe/custom_codecs.py b/gufe/custom_codecs.py index ff5f7901..dd09a043 100644 --- a/gufe/custom_codecs.py +++ b/gufe/custom_codecs.py @@ -74,7 +74,7 @@ def is_openff_quantity_dict(dct): NPY_DTYPE_CODEC = JSONCodec( - cls=None, + cls=np.generic, to_dict=lambda obj: { 'dtype': str(obj.dtype), 'bytes': obj.tobytes(), From 1fb1e5c1c03f58accabc6e0f82d78d8e9df2f60a Mon Sep 17 00:00:00 2001 From: IAlibay Date: Wed, 20 Sep 2023 08:33:27 +0100 Subject: [PATCH 8/9] Add some documentation --- docs/guide/serialization.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/guide/serialization.rst b/docs/guide/serialization.rst index 0ed596d5..6557a3e4 100644 --- a/docs/guide/serialization.rst +++ b/docs/guide/serialization.rst @@ -153,6 +153,27 @@ Details of how the object is recognized for encoding or how the dictionary is recognized for decoding can be changed by passing functions to the ``is_my_obj`` or ``is_my_dict`` parameters of :class:`.JSONCodec`. +.. warning:: + The custom encoders & decoders only override the default JSON + encoder/decoder when the default ones cannot natively handle the object. + This leads to some odd / lossy behaviour for some objects such + as ``np.float64``, which is natively converted to a ``float`` type + by the default encoder, whilst other NumPy generic types are + appropriately roundtripped. 
+ +On the use of NumPy generic types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Due to their inconsistent behaviour in how they are handled by the default +JSON encoder/decoder routines (see warning above), it is our suggestion +that Python types should be used preferentially instead of NumPy generic +types. For example, if one is looking to store a single float value, +a ``float`` would be preferred to a ``np.float32`` or ``np.float64``. + +Please note that this only applies to generic types being used **outside of +numpy arrays**. NumPy arrays are, as far as we know, always handled +in a consistent manner. + Dumping arbitrary objects to JSON --------------------------------- From a4af42247cff6e1c6e90b62435c2c5ebd4e61a83 Mon Sep 17 00:00:00 2001 From: IAlibay Date: Wed, 20 Sep 2023 08:33:43 +0100 Subject: [PATCH 9/9] more docs --- gufe/custom_codecs.py | 4 ++++ gufe/tests/test_custom_json.py | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/gufe/custom_codecs.py b/gufe/custom_codecs.py index dd09a043..f334e266 100644 --- a/gufe/custom_codecs.py +++ b/gufe/custom_codecs.py @@ -73,6 +73,10 @@ def is_openff_quantity_dict(dct): ) +# Note that this has inconsistent behaviour for some generic types +# which end up being handled by the default JSON encoder/decoder. +# The main example of this is np.float64 which will be turned into +# a float type on serialization. 
NPY_DTYPE_CODEC = JSONCodec( cls=np.generic, to_dict=lambda obj: { diff --git a/gufe/tests/test_custom_json.py b/gufe/tests/test_custom_json.py index f82a54c3..8222e51a 100644 --- a/gufe/tests/test_custom_json.py +++ b/gufe/tests/test_custom_json.py @@ -45,10 +45,10 @@ def test_add_existing_codec(self): assert len(serialization.codecs) == 1 -@pytest.mark.parametrize('obj', - [np.array([[1.0, 0.0], [2.0, 3.2]]), - np.float32(1.1)], -) +@pytest.mark.parametrize('obj', [ + np.array([[1.0, 0.0], [2.0, 3.2]]), + np.float32(1.1) +]) @pytest.mark.parametrize('codecs', [ [BYTES_CODEC, NUMPY_CODEC, NPY_DTYPE_CODEC], [NPY_DTYPE_CODEC, BYTES_CODEC, NUMPY_CODEC], @@ -105,7 +105,7 @@ class TestNumpyCoding(CustomJSONCodingTest): def setup_method(self): self.codec = NUMPY_CODEC self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0]), - np.array([1.0, 2.0, 3.0], dtype=np.float32),] + np.array([1.0, 2.0, 3.0], dtype=np.float32)] shapes = [[2, 2], [2,], [3,]] dtypes = [str(arr.dtype) for arr in self.objs] # may change by system? byte_reps = [arr.tobytes() for arr in self.objs] @@ -146,7 +146,7 @@ def setup_method(self): # object). self.objs = [np.bool_(True), np.float16(1.0), np.float32(1.0), np.complex128(1.0), - np.clongdouble(1.0), np.uint64(1),] + np.clongdouble(1.0), np.uint64(1)] dtypes = [str(a.dtype) for a in self.objs] byte_reps = [a.tobytes() for a in self.objs] # Overly complicated extraction of the class name