From 6dffc270369d4c5051d16fd429ee78b84d1f991c Mon Sep 17 00:00:00 2001
From: Anderson Banihirwe
Date: Thu, 27 Feb 2020 08:34:14 -0700
Subject: [PATCH 1/3] Add more tests

---
 setup.cfg                 |   2 +-
 tests/test_zarr_compat.py |  94 +++++++++++++++++++++++++++++++---
 tests/utils.py            | 104 +++++++++++++++++++++++++++++++++-
 xpublish/rest.py          |  86 +++++++++++++++----------------
 4 files changed, 233 insertions(+), 53 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 0b88016..0f03252 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,7 +7,7 @@ select = B,C,E,F,W,T4,B9
 
 [isort]
 known_first_party=xpublish
-known_third_party=fastapi,numcodecs,numpy,pkg_resources,pytest,setuptools,starlette,uvicorn,xarray,zarr
+known_third_party=dask,fastapi,numcodecs,numpy,pandas,pkg_resources,pytest,setuptools,starlette,uvicorn,xarray,zarr
 multi_line_output=3
 include_trailing_comma=True
 force_grid_wrap=0
diff --git a/tests/test_zarr_compat.py b/tests/test_zarr_compat.py
index 8466c45..aadf517 100644
--- a/tests/test_zarr_compat.py
+++ b/tests/test_zarr_compat.py
@@ -5,12 +5,12 @@
 
 import xpublish  # noqa: F401
 
-from .utils import TestMapper
+from .utils import TestMapper, create_dataset
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
 def airtemp_ds():
-    ds = xr.tutorial.open_dataset("air_temperature")
+    ds = xr.tutorial.open_dataset('air_temperature')
     return ds.chunk(dict(ds.dims))
 
 
@@ -18,13 +18,91 @@ def test_zmetadata_identical(airtemp_ds):
     zarr_dict = {}
     airtemp_ds.to_zarr(zarr_dict, consolidated=True)
     mapper = TestMapper(airtemp_ds.rest.app)
-    actual = json.loads(mapper[".zmetadata"].decode())
-    expected = json.loads(zarr_dict[".zmetadata"].decode())
+    actual = json.loads(mapper['.zmetadata'].decode())
+    expected = json.loads(zarr_dict['.zmetadata'].decode())
     assert actual == expected
 
 
-def test_roundtrip(airtemp_ds):
-    mapper = TestMapper(airtemp_ds.rest.app)
+@pytest.mark.parametrize(
+    'start, end, freq, nlats, nlons, var_const, calendar, use_cftime',
+    [
+        ('2018-01-01', '2021-01-01', 'MS', 180, 360, True, 'standard', False),
+        ('2018-01-01', '2021-01-01', 'D', 180, 360, False, 'noleap', True),
+        ('2018-01-01', '2021-01-01', '6H', 180, 360, True, 'gregorian', False),
+        ('2018-01-01', '2050-01-01', 'A', 180, 360, None, '360_day', True),
+    ],
+)
+def test_roundtrip(start, end, freq, nlats, nlons, var_const, calendar, use_cftime):
+    ds = create_dataset(
+        start=start,
+        end=end,
+        freq=freq,
+        nlats=nlats,
+        nlons=nlons,
+        var_const=var_const,
+        use_cftime=use_cftime,
+        calendar=calendar,
+    )
+    ds = ds.chunk(ds.dims)
+    mapper = TestMapper(ds.rest.app)
+    actual = xr.open_zarr(mapper, consolidated=True)
+
+    xr.testing.assert_identical(actual, ds)
+
+
+@pytest.mark.xfail(reason='Custom chunking not working properly')
+@pytest.mark.parametrize(
+    'start, end, freq, nlats, nlons, var_const, calendar, use_cftime, chunks',
+    [
+        ('2018-01-01', '2021-01-01', 'MS', 180, 360, True, 'standard', False, {'time': 10}),
+        (
+            '2018-01-01',
+            '2021-01-01',
+            'D',
+            300,
+            600,
+            False,
+            'noleap',
+            True,
+            {'time': 10, 'lat': 300, 'lon': 300},
+        ),
+        (
+            '2018-01-01',
+            '2021-01-01',
+            '12H',
+            180,
+            360,
+            True,
+            'gregorian',
+            False,
+            {'time': 36, 'lat': 10},
+        ),
+        (
+            '2018-01-01',
+            '2050-01-01',
+            'A',
+            300,
+            600,
+            None,
+            '360_day',
+            True,
+            {'time': 10, 'lat': 30, 'lon': 30},
+        ),
+    ],
+)
+def test_roundtrip_custom_chunks(
+    start, end, freq, nlats, nlons, var_const, calendar, use_cftime, chunks
+):
+    ds = create_dataset(
+        start=start,
+        end=end,
+        freq=freq,
+        nlats=nlats,
+        nlons=nlons,
+        var_const=var_const,
+        use_cftime=use_cftime,
+        calendar=calendar,
+    )
+    ds = ds.chunk(chunks)
+    mapper = TestMapper(ds.rest.app)
     actual = xr.open_zarr(mapper, consolidated=True)
 
-    xr.testing.assert_identical(actual, airtemp_ds)
+    xr.testing.assert_identical(actual, ds)
diff --git a/tests/utils.py b/tests/utils.py
index 3f13cf9..63c9394 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,5 +1,13 @@
+from functools import reduce
+from operator import mul
+
+import numpy as np
+import pandas as pd
+import xarray as xr
 from starlette.testclient import TestClient
 
+rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(123456789)))
+
 
 class TestMapper(TestClient):
     """
@@ -9,5 +17,99 @@ class TestMapper(TestClient):
     def __getitem__(self, key):
         response = self.get(key)
         if response.status_code != 200:
-            raise KeyError("{} not found. status_code = {}".format(key, response.status_code))
+            raise KeyError('{} not found. status_code = {}'.format(key, response.status_code))
         return response.content
+
+
+def create_dataset(
+    start='2018-01',
+    end='2020-12',
+    freq='MS',
+    calendar='standard',
+    units='days since 1980-01-01',
+    use_cftime=True,
+    decode_times=True,
+    nlats=1,
+    nlons=1,
+    var_const=None,
+):
+    """ Utility function for creating test data """
+
+    if use_cftime:
+        end = xr.coding.cftime_offsets.to_cftime_datetime(end, calendar=calendar)
+        dates = xr.cftime_range(start=start, end=end, freq=freq, calendar=calendar)
+
+    else:
+        dates = pd.date_range(start=pd.to_datetime(start), end=pd.to_datetime(end), freq=freq)
+
+    decoded_time_bounds = np.vstack((dates[:-1], dates[1:])).T
+
+    encoded_time_bounds = xr.coding.times.encode_cf_datetime(
+        decoded_time_bounds, units=units, calendar=calendar
+    )[0]
+
+    encoded_times = xr.DataArray(
+        encoded_time_bounds.mean(axis=1),
+        dims=('time'),
+        name='time',
+        attrs={'units': units, 'calendar': calendar},
+    )
+
+    decoded_times = xr.DataArray(
+        xr.coding.times.decode_cf_datetime(
+            encoded_times, units=units, calendar=calendar, use_cftime=use_cftime
+        ),
+        dims=['time'],
+    )
+    decoded_time_bounds = xr.DataArray(
+        decoded_time_bounds, name='time_bounds', dims=('time', 'd2'), coords={'time': decoded_times}
+    )
+
+    if decode_times:
+        times = decoded_times
+        time_bounds = decoded_time_bounds
+
+    else:
+        times = encoded_times
+        time_bounds = xr.DataArray(
+            encoded_time_bounds, name='time_bounds', dims=('time', 'd2'), coords={'time': times}
+        )
+
+    lats = np.linspace(start=-90, stop=90, num=nlats, dtype='float32')
+    lons = np.linspace(start=-180, stop=180, num=nlons, dtype='float32')
+
+    shape = (times.size, lats.size, lons.size)
+    num = reduce(mul, shape)
+
+    if var_const is None:
+        annual_cycle = np.sin(2 * np.pi * (decoded_times.dt.dayofyear.values / 365.25 - 0.28))
+        base = 10 + 15 * annual_cycle.reshape(-1, 1)
+        tmin_values = base + 3 * rs.randn(annual_cycle.size, nlats * nlons)
+        tmax_values = base + 10 + 3 * rs.randn(annual_cycle.size, nlats * nlons)
+        tmin_values = tmin_values.reshape(shape)
+        tmax_values = tmax_values.reshape(shape)
+
+    elif var_const:
+        tmin_values = np.ones(shape=shape)
+        tmax_values = np.ones(shape=shape) + 2
+
+    else:
+        tmin_values = np.arange(1, num + 1).reshape(shape)
+        tmax_values = np.arange(1, num + 1).reshape(shape)
+
+    ds = xr.Dataset(
+        {
+            'tmin': (('time', 'lat', 'lon'), tmin_values.astype('float32')),
+            'tmax': (('time', 'lat', 'lon'), tmax_values.astype('float32')),
+            'time_bounds': (('time', 'd2'), time_bounds),
+        },
+        {'time': times, 'lat': lats, 'lon': lons},
+    )
+
+    ds.tmin.encoding['_FillValue'] = 
np.float32(-9999999) + ds.tmax.encoding['_FillValue'] = np.float32(-9999999) + ds.time.encoding['bounds'] = 'time_bounds' + ds.time.encoding['units'] = units + ds.time.encoding['calendar'] = calendar + + return ds diff --git a/xpublish/rest.py b/xpublish/rest.py index a556a21..4abaf65 100644 --- a/xpublish/rest.py +++ b/xpublish/rest.py @@ -20,12 +20,12 @@ zarr_format = 2 zarr_consolidated_format = 1 -zarr_metadata_key = ".zmetadata" +zarr_metadata_key = '.zmetadata' -logger = logging.getLogger("api") +logger = logging.getLogger('api') -@xr.register_dataset_accessor("rest") +@xr.register_dataset_accessor('rest') class RestAccessor: """ REST API Accessor @@ -48,18 +48,18 @@ def __init__(self, xarray_obj): def _get_zmetadata(self): """ helper method to create consolidated zmetadata dictionary """ - zmeta = {"zarr_consolidated_format": zarr_consolidated_format, "metadata": {}} - zmeta["metadata"][group_meta_key] = {"zarr_format": zarr_format} - zmeta["metadata"][attrs_key] = self._get_zattrs() + zmeta = {'zarr_consolidated_format': zarr_consolidated_format, 'metadata': {}} + zmeta['metadata'][group_meta_key] = {'zarr_format': zarr_format} + zmeta['metadata'][attrs_key] = self._get_zattrs() for key, da in self._obj.variables.items(): # encode variable encoded_da = encode_zarr_variable(da) self._variables[key] = encoded_da self._encoding[key] = _extract_zarr_variable_encoding(da) - zmeta["metadata"][f"{key}/{attrs_key}"] = extract_zattrs(encoded_da) - zmeta["metadata"][f"{key}/{array_meta_key}"] = extract_zarray( - encoded_da, self._encoding.get(key, {}), da.encoding["dtype"] + zmeta['metadata'][f'{key}/{attrs_key}'] = extract_zattrs(encoded_da) + zmeta['metadata'][f'{key}/{array_meta_key}'] = extract_zarray( + encoded_da, self._encoding.get(key, {}), da.encoding.get('dtype', encoded_da.dtype) ) return zmeta @@ -83,26 +83,26 @@ def zmetadata_json(self): zjson = copy.deepcopy(self.zmetadata) for key in list(self._obj.variables): # convert compressor to dict - compressor_config = zjson["metadata"][f"{key}/{array_meta_key}"][ - "compressor" + compressor_config = zjson['metadata'][f'{key}/{array_meta_key}'][ + 'compressor' ].get_config() - zjson["metadata"][f"{key}/{array_meta_key}"]["compressor"] = compressor_config + zjson['metadata'][f'{key}/{array_meta_key}']['compressor'] = compressor_config return zjson async def get_key(self, var, chunk): - logger.debug("var is %s", var) - logger.debug("chunk is %s", chunk) + logger.debug('var is %s', var) + logger.debug('chunk is %s', chunk) da = self._variables[var].data - arr_meta = self.zmetadata["metadata"][f"{var}/{array_meta_key}"] + arr_meta = self.zmetadata['metadata'][f'{var}/{array_meta_key}'] - data_chunk = get_data_chunk(da, chunk, out_shape=arr_meta["chunks"]) + data_chunk = get_data_chunk(da, chunk, out_shape=arr_meta['chunks']) echunk = _encode_chunk( - data_chunk.tobytes(), filters=arr_meta["filters"], compressor=arr_meta["compressor"], + data_chunk.tobytes(), filters=arr_meta['filters'], compressor=arr_meta['compressor'], ) - return Response(echunk, media_type="application/octet-stream") + return Response(echunk, media_type='application/octet-stream') def init_app( self, @@ -150,20 +150,20 @@ def init_app( **kwargs, ) - @self._app.get(f"/{zarr_metadata_key}") + @self._app.get(f'/{zarr_metadata_key}') def get_zmetadata(): return self.zmetadata_json() - @self._app.get("/keys") + @self._app.get('/keys') def list_keys(): return list(self._obj.variables) - @self._app.get("/") + @self._app.get('/') def repr(): - with 
xr.set_options(display_style="html"): + with xr.set_options(display_style='html'): return HTMLResponse(self._obj._repr_html_()) - @self._app.get("/info") + @self._app.get('/info') def info(): import io @@ -172,16 +172,16 @@ def info(): info = buffer.getvalue() return info - @self._app.get("/dict") + @self._app.get('/dict') def to_dict(data: bool = False): return self._obj.to_dict(data=data) - @self._app.get("/{var}/{chunk}") + @self._app.get('/{var}/{chunk}') async def get_key(var, chunk): result = await self.get_key(var, chunk) return result - @self._app.get("/versions") + @self._app.get('/versions') def versions(): import io @@ -199,7 +199,7 @@ def app(self): self.init_app() return self._app - def serve(self, host="0.0.0.0", port=9000, log_level="debug", **kwargs): + def serve(self, host='0.0.0.0', port=9000, log_level='debug', **kwargs): """ Serve this app via ``uvicorn.run``. Parameters @@ -230,34 +230,34 @@ def extract_zattrs(da): # We don't want `_FillValue` in `.zattrs` # It should go in `fill_value` section of `.zarray` - _ = zattrs.pop("_FillValue", None) + _ = zattrs.pop('_FillValue', None) return zattrs def _extract_fill_value(da, dtype): """ helper function to extract fill value from DataArray. """ - fill_value = da.attrs.pop("_FillValue", None) + fill_value = da.attrs.pop('_FillValue', None) return encode_fill_value(fill_value, dtype) def extract_zarray(da, encoding, dtype): """ helper function to extract zarr array metadata. """ meta = { - "compressor": encoding.get("compressor", da.encoding.get("compressor", default_compressor)), - "filters": encoding.get("filters", da.encoding.get("filters", None)), - "chunks": encoding.get("chunks", None), - "dtype": dtype.str, - "fill_value": _extract_fill_value(da, dtype), - "order": "C", - "shape": list(normalize_shape(da.shape)), - "zarr_format": zarr_format, + 'compressor': encoding.get('compressor', da.encoding.get('compressor', default_compressor)), + 'filters': encoding.get('filters', da.encoding.get('filters', None)), + 'chunks': encoding.get('chunks', None), + 'dtype': dtype.str, + 'fill_value': _extract_fill_value(da, dtype), + 'order': 'C', + 'shape': list(normalize_shape(da.shape)), + 'zarr_format': zarr_format, } - if meta["chunks"] is None: + if meta['chunks'] is None: if da.chunks is not None: - meta["chunks"] = list([c[0] for c in da.chunks]) + meta['chunks'] = list([c[0] for c in da.chunks]) else: - meta["chunks"] = list(da.shape) + meta['chunks'] = list(da.shape) return meta @@ -271,7 +271,7 @@ def _encode_chunk(chunk, filters=None, compressor=None): # check object encoding if ensure_ndarray(chunk).dtype == object: - raise RuntimeError("cannot write object array without object codec") + raise RuntimeError('cannot write object array without object codec') # compress if compressor: @@ -288,13 +288,13 @@ def get_data_chunk(da, chunk_id, out_shape): If this is an incomplete edge chunk, pad the returned array to match out_shape. 
""" - ikeys = tuple(map(int, chunk_id.split("."))) + ikeys = tuple(map(int, chunk_id.split('.'))) try: chunk_data = da.blocks[ikeys] except: chunk_data = np.asarray(da) - logger.debug("checking chunk output size, %s == %s" % (chunk_data.shape, out_shape)) + logger.debug('checking chunk output size, %s == %s' % (chunk_data.shape, out_shape)) if isinstance(chunk_data, dask_array_type): chunk_data = chunk_data.compute() From c7019de4627ed6cb731e3a2adf3811563ec6a3a4 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Wed, 4 Mar 2020 17:53:00 -0700 Subject: [PATCH 2/3] parametrize test_zmetadata_identical() --- tests/test_zarr_compat.py | 28 ++++++++++++++++++++++++---- xpublish/rest.py | 2 +- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/tests/test_zarr_compat.py b/tests/test_zarr_compat.py index aadf517..123ae96 100644 --- a/tests/test_zarr_compat.py +++ b/tests/test_zarr_compat.py @@ -14,10 +14,30 @@ def airtemp_ds(): return ds.chunk(dict(ds.dims)) -def test_zmetadata_identical(airtemp_ds): +@pytest.mark.parametrize( + 'start, end, freq, nlats, nlons, var_const, calendar, use_cftime', + [ + ('2018-01-01', '2021-01-01', 'MS', 180, 360, True, 'standard', False), + ('2018-01-01', '2021-01-01', 'D', 180, 360, False, 'noleap', True), + ('2018-01-01', '2021-01-01', '6H', 180, 360, True, 'gregorian', False), + ('2018-01-01', '2050-01-01', 'A', 180, 360, None, '360_day', True), + ], +) +def test_zmetadata_identical(start, end, freq, nlats, nlons, var_const, calendar, use_cftime): + ds = create_dataset( + start=start, + end=end, + nlats=nlats, + nlons=nlons, + var_const=var_const, + use_cftime=use_cftime, + calendar=calendar, + ) + + ds = ds.chunk(ds.dims) zarr_dict = {} - airtemp_ds.to_zarr(zarr_dict, consolidated=True) - mapper = TestMapper(airtemp_ds.rest.app) + ds.to_zarr(zarr_dict, consolidated=True) + mapper = TestMapper(ds.rest.app) actual = json.loads(mapper['.zmetadata'].decode()) expected = json.loads(zarr_dict['.zmetadata'].decode()) assert actual == expected @@ -43,13 +63,13 @@ def test_roundtrip(start, end, freq, nlats, nlons, var_const, calendar, use_cfti calendar=calendar, ) ds = ds.chunk(ds.dims) + mapper = TestMapper(ds.rest.app) actual = xr.open_zarr(mapper, consolidated=True) xr.testing.assert_identical(actual, ds) -@pytest.mark.xfail(reason='Custom chunking not working properly') @pytest.mark.parametrize( 'start, end, freq, nlats, nlons, var_const, calendar, use_cftime, chunks', [ diff --git a/xpublish/rest.py b/xpublish/rest.py index f8138a1..202dc70 100644 --- a/xpublish/rest.py +++ b/xpublish/rest.py @@ -63,7 +63,7 @@ def _get_zmetadata(self): self._encoding[key] = _extract_zarr_variable_encoding(da) zmeta['metadata'][f'{key}/{attrs_key}'] = extract_zattrs(encoded_da) zmeta['metadata'][f'{key}/{array_meta_key}'] = extract_zarray( - encoded_da, self._encoding[key], da.encoding.get('dtype', da.dtype) + encoded_da, self._encoding[key], encoded_da.dtype ) return zmeta From 9f7149c3062751f960ad522c07b7c153da2c8443 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Thu, 5 Mar 2020 12:39:37 -0700 Subject: [PATCH 3/3] xfail datasets with datetime variables --- tests/test_zarr_compat.py | 49 ++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/tests/test_zarr_compat.py b/tests/test_zarr_compat.py index 123ae96..b8e42e4 100644 --- a/tests/test_zarr_compat.py +++ b/tests/test_zarr_compat.py @@ -70,10 +70,17 @@ def test_roundtrip(start, end, freq, nlats, nlons, var_const, calendar, use_cfti 
     xr.testing.assert_identical(actual, ds)
 
 
+xfail_reason = """Currently, xarray casts datetime arrays to NumPy-compatible arrays.
+This ends up producing unexpected behavior when calling encode_zarr_variable()
+on datasets with variables containing datetime-like dtypes.
+
+See: https://github.com/jhamman/xpublish/pull/10#discussion_r388028417"""
+
+
 @pytest.mark.parametrize(
-    'start, end, freq, nlats, nlons, var_const, calendar, use_cftime, chunks',
+    'start, end, freq, nlats, nlons, var_const, calendar, use_cftime, chunks, decode_times',
     [
-        ('2018-01-01', '2021-01-01', 'MS', 180, 360, True, 'standard', False, {'time': 10}),
+        ('2018-01-01', '2021-01-01', 'MS', 180, 360, True, 'standard', False, {'time': 10}, False),
         (
             '2018-01-01',
             '2021-01-01',
             'D',
             300,
             600,
             False,
             'noleap',
             True,
             {'time': 10, 'lat': 300, 'lon': 300},
+            False,
         ),
+        pytest.param(
+            '2018-01-01',
+            '2021-01-01',
+            'D',
+            300,
+            600,
+            False,
+            'noleap',
+            True,
+            {'time': 10, 'lat': 300, 'lon': 300},
+            True,
+            marks=pytest.mark.xfail(reason=xfail_reason),
+        ),
         (
             '2018-01-01',
             '2021-01-01',
             '12H',
             180,
             360,
             True,
             'gregorian',
             False,
             {'time': 36, 'lat': 10},
+            False,
         ),
         (
             '2018-01-01',
-            '2050-01-01',
+            '2038-01-01',
             'A',
             300,
             600,
             None,
             '360_day',
             True,
-            {'time': 10, 'lat': 30, 'lon': 30},
+            {'time': 10, 'lat': 75, 'lon': 120},
+            False,
         ),
+        pytest.param(
+            '2018-01-01',
+            '2038-01-01',
+            'A',
+            300,
+            600,
+            None,
+            '360_day',
+            True,
+            {'time': 10, 'lat': 75, 'lon': 120},
+            True,
+            marks=pytest.mark.xfail(reason=xfail_reason),
+        ),
     ],
 )
 def test_roundtrip_custom_chunks(
-    start, end, freq, nlats, nlons, var_const, calendar, use_cftime, chunks
+    start, end, freq, nlats, nlons, var_const, calendar, use_cftime, chunks, decode_times
 ):
     ds = create_dataset(
         start=start,
@@ -120,9 +156,10 @@ def test_roundtrip_custom_chunks(
         var_const=var_const,
         use_cftime=use_cftime,
         calendar=calendar,
+        decode_times=decode_times,
     )
     ds = ds.chunk(chunks)
     mapper = TestMapper(ds.rest.app)
-    actual = xr.open_zarr(mapper, consolidated=True)
+    actual = xr.open_zarr(mapper, consolidated=True, decode_times=decode_times)
 
     xr.testing.assert_identical(actual, ds)
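
Note: the round-trip these patches test can also be reproduced by hand outside
pytest. Below is a minimal sketch, not part of the patches themselves. It
assumes xpublish is installed (importing it registers the 'rest' dataset
accessor), that the repository root is on sys.path so tests.utils is
importable, and the dataset parameters are arbitrary picks from the
parametrized cases above:

    import xarray as xr
    import xpublish  # noqa: F401  (import registers the `rest` accessor)

    from tests.utils import TestMapper, create_dataset

    # Build a small chunked dataset and expose it through the accessor's
    # FastAPI app; TestMapper wraps that ASGI app in a zarr-style
    # key/value mapper, so '.zmetadata' and chunk keys like 'tmin/0.0.0'
    # are fetched through the test client instead of a live server.
    ds = create_dataset(start='2018-01-01', end='2019-01-01', freq='MS')
    ds = ds.chunk(ds.dims)
    mapper = TestMapper(ds.rest.app)

    # open_zarr reads the consolidated metadata and lazily pulls chunks;
    # the reopened dataset should round-trip exactly.
    actual = xr.open_zarr(mapper, consolidated=True)
    xr.testing.assert_identical(actual, ds)

This is the same pattern test_roundtrip exercises; the custom-chunks and
decode_times variants only change how `ds` is chunked and decoded.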