From e028634b885356b95fc5705cb4444223b14be2e5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 31 Jul 2024 10:50:55 +0200 Subject: [PATCH] create v3/metadata module refactor metadata into v2 and v3 modules remove version signifiers from function names fix tests --- src/zarr/array.py | 10 +- src/zarr/codecs/sharding.py | 2 +- src/zarr/metadata/__init__.py | 4 + src/zarr/metadata/common.py | 70 ++++ src/zarr/metadata/v2.py | 221 +++++++++++ src/zarr/{metadata.py => metadata/v3.py} | 456 ++++++----------------- tests/v3/test_metadata/test_v2.py | 7 +- tests/v3/test_metadata/test_v3.py | 5 +- 8 files changed, 414 insertions(+), 361 deletions(-) create mode 100644 src/zarr/metadata/__init__.py create mode 100644 src/zarr/metadata/common.py create mode 100644 src/zarr/metadata/v2.py rename src/zarr/{metadata.py => metadata/v3.py} (60%) diff --git a/src/zarr/array.py b/src/zarr/array.py index e41118805..e40ec397a 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -59,7 +59,7 @@ is_scalar, pop_fields, ) -from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata +from zarr.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.registry import get_pipeline_class from zarr.store import StoreLike, StorePath, make_store_path from zarr.store.core import ( @@ -92,14 +92,14 @@ def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecP @dataclass(frozen=True) class AsyncArray: - metadata: ArrayMetadata + metadata: ArrayV3Metadata | ArrayV2Metadata store_path: StorePath codec_pipeline: CodecPipeline = field(init=False) order: Literal["C", "F"] def __init__( self, - metadata: ArrayMetadata, + metadata: ArrayV2Metadata | ArrayV3Metadata, store_path: StorePath, order: Literal["C", "F"] | None = None, ): @@ -497,7 +497,7 @@ async def getitem( ) return await self._get_selection(indexer, prototype=prototype) - async def _save_metadata(self, metadata: ArrayMetadata) -> None: + async def _save_metadata(self, metadata: ArrayV2Metadata | ArrayV3Metadata) -> None: to_save = metadata.to_buffer_dict(default_buffer_prototype()) awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] await gather(*awaitables) @@ -716,7 +716,7 @@ def basename(self) -> str | None: return self._async_array.basename @property - def metadata(self) -> ArrayMetadata: + def metadata(self) -> ArrayV2Metadata | ArrayV3Metadata: return self._async_array.metadata @property diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index ef8b80c02..f4da64f19 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -38,7 +38,7 @@ product, ) from zarr.indexing import BasicIndexer, SelectorTuple, c_order_iter, get_indexer, morton_order_iter -from zarr.metadata import parse_codecs +from zarr.metadata.v3 import parse_codecs from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec if TYPE_CHECKING: diff --git a/src/zarr/metadata/__init__.py b/src/zarr/metadata/__init__.py new file mode 100644 index 000000000..0c3122d7f --- /dev/null +++ b/src/zarr/metadata/__init__.py @@ -0,0 +1,4 @@ +from .v2 import ArrayV2Metadata +from .v3 import ArrayV3Metadata + +__all__ = ["ArrayV3Metadata", "ArrayV2Metadata"] diff --git a/src/zarr/metadata/common.py b/src/zarr/metadata/common.py new file mode 100644 index 000000000..9637bd8ef --- /dev/null +++ b/src/zarr/metadata/common.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import Any, Literal + + from typing_extensions import Self + +from abc import ABC, abstractmethod +from dataclasses import dataclass + +import numpy as np + +from zarr.abc.metadata import Metadata +from zarr.array_spec import ArraySpec +from zarr.buffer import Buffer, BufferPrototype +from zarr.chunk_grids import ChunkGrid +from zarr.common import JSON, ChunkCoords, ZarrFormat + + +@dataclass(frozen=True, kw_only=True) +class ArrayMetadata(Metadata, ABC): + shape: ChunkCoords + fill_value: Any + chunk_grid: ChunkGrid + attributes: dict[str, JSON] + zarr_format: ZarrFormat + + @property + @abstractmethod + def dtype(self) -> np.dtype[Any]: + pass + + @property + @abstractmethod + def ndim(self) -> int: + pass + + @abstractmethod + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype + ) -> ArraySpec: + pass + + @abstractmethod + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + pass + + @abstractmethod + def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + pass + + @abstractmethod + def update_shape(self, shape: ChunkCoords) -> Self: + pass + + @abstractmethod + def update_attributes(self, attributes: dict[str, JSON]) -> Self: + pass + + +def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]: + """ + Normalize `None` to an empty dict. All other values pass through. + """ + if data is None: + return {} + + return data diff --git a/src/zarr/metadata/v2.py b/src/zarr/metadata/v2.py new file mode 100644 index 000000000..9bc80e86e --- /dev/null +++ b/src/zarr/metadata/v2.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import numpy.typing as npt + from typing_extensions import Self + +import json +from dataclasses import dataclass, field, replace +from typing import Any, Literal + +import numpy as np + +from zarr.array_spec import ArraySpec +from zarr.buffer import Buffer, BufferPrototype +from zarr.chunk_grids import RegularChunkGrid +from zarr.chunk_key_encodings import parse_separator +from zarr.common import JSON, ZARRAY_JSON, ZATTRS_JSON, ChunkCoords, parse_dtype, parse_shapelike +from zarr.config import config, parse_indexing_order +from zarr.metadata.common import ArrayMetadata, parse_attributes + + +@dataclass(frozen=True, kw_only=True) +class ArrayV2Metadata(ArrayMetadata): + shape: ChunkCoords + chunk_grid: RegularChunkGrid + data_type: np.dtype[Any] + fill_value: None | int | float = 0 + order: Literal["C", "F"] = "C" + filters: list[dict[str, JSON]] | None = None + dimension_separator: Literal[".", "/"] = "." + compressor: dict[str, JSON] | None = None + attributes: dict[str, JSON] = field(default_factory=dict) + zarr_format: Literal[2] = field(init=False, default=2) + + def __init__( + self, + *, + shape: ChunkCoords, + dtype: npt.DTypeLike, + chunks: ChunkCoords, + fill_value: Any, + order: Literal["C", "F"], + dimension_separator: Literal[".", "/"] = ".", + compressor: dict[str, JSON] | None = None, + filters: list[dict[str, JSON]] | None = None, + attributes: dict[str, JSON] | None = None, + ): + """ + Metadata for a Zarr version 2 array. + """ + shape_parsed = parse_shapelike(shape) + data_type_parsed = parse_dtype(dtype) + chunks_parsed = parse_shapelike(chunks) + compressor_parsed = parse_compressor(compressor) + order_parsed = parse_indexing_order(order) + dimension_separator_parsed = parse_separator(dimension_separator) + filters_parsed = parse_filters(filters) + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) + attributes_parsed = parse_attributes(attributes) + + object.__setattr__(self, "shape", shape_parsed) + object.__setattr__(self, "data_type", data_type_parsed) + object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed)) + object.__setattr__(self, "compressor", compressor_parsed) + object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "dimension_separator", dimension_separator_parsed) + object.__setattr__(self, "filters", filters_parsed) + object.__setattr__(self, "fill_value", fill_value_parsed) + object.__setattr__(self, "attributes", attributes_parsed) + + # ensure that the metadata document is consistent + _ = parse_metadata(self) + + @property + def ndim(self) -> int: + return len(self.shape) + + @property + def dtype(self) -> np.dtype[Any]: + return self.data_type + + @property + def chunks(self) -> ChunkCoords: + return self.chunk_grid.chunk_shape + + def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + def _json_convert( + o: Any, + ) -> Any: + if isinstance(o, np.dtype): + if o.fields is None: + return o.str + else: + return o.descr + if np.isscalar(o): + # convert numpy scalar to python type, and pass + # python types through + return getattr(o, "item", lambda: o)() + raise TypeError + + zarray_dict = self.to_dict() + assert isinstance(zarray_dict, dict) + zattrs_dict = zarray_dict.pop("attributes", {}) + assert isinstance(zattrs_dict, dict) + json_indent = config.get("json_indent") + return { + ZARRAY_JSON: prototype.buffer.from_bytes( + json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode() + ), + ZATTRS_JSON: prototype.buffer.from_bytes( + json.dumps(zattrs_dict, indent=json_indent).encode() + ), + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: + # make a copy to protect the original from modification + _data = data.copy() + # check that the zarr_format attribute is correct + _ = parse_zarr_format(_data.pop("zarr_format")) + return cls(**_data) + + def to_dict(self) -> JSON: + zarray_dict = super().to_dict() + + assert isinstance(zarray_dict, dict) + + _ = zarray_dict.pop("chunk_grid") + zarray_dict["chunks"] = self.chunk_grid.chunk_shape + + _ = zarray_dict.pop("data_type") + zarray_dict["dtype"] = self.data_type.str + + return zarray_dict + + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype + ) -> ArraySpec: + return ArraySpec( + shape=self.chunk_grid.chunk_shape, + dtype=self.dtype, + fill_value=self.fill_value, + order=order, + prototype=prototype, + ) + + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + chunk_identifier = self.dimension_separator.join(map(str, chunk_coords)) + return "0" if chunk_identifier == "" else chunk_identifier + + def update_shape(self, shape: ChunkCoords) -> Self: + return replace(self, shape=shape) + + def update_attributes(self, attributes: dict[str, JSON]) -> Self: + return replace(self, attributes=attributes) + + +def parse_zarr_format(data: Literal[2]) -> Literal[2]: + if data == 2: + return data + raise ValueError(f"Invalid value. Expected 2. Got {data}.") + + +def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None: + return data + + +def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None: + return data + + +def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: + if (l_chunks := len(data.chunks)) != (l_shape := len(data.shape)): + msg = ( + f"The `shape` and `chunks` attributes must have the same length. " + f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}." + ) + raise ValueError(msg) + return data + + +def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: + """ + Parse a fill value, given a dtype. + + This is copied from the `normalize_fill_value` function from zarr-python 2.x. + """ + if fill_value is None or dtype.hasobject: + # no fill value + pass + elif not isinstance(fill_value, np.void) and fill_value == 0: + # this should be compatible across numpy versions for any array type, including + # structured arrays + fill_value = np.zeros((), dtype=dtype)[()] + + elif dtype.kind == "U": + # special case unicode because of encoding issues on Windows if passed through numpy + # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 + + if not isinstance(fill_value, str): + msg = ( + f"fill_value {fill_value!r} is not valid for dtype {dtype}." + f"Expected a unicode string, got {type(fill_value)}." + ) + raise ValueError(msg) + + else: + try: + if isinstance(fill_value, bytes) and dtype.kind == "V": + # special case for numpy 1.14 compatibility + fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()] + else: + fill_value = np.array(fill_value, dtype=dtype)[()] + + except Exception as e: + msg = f"fill_value {fill_value!r} is not valid for dtype {dtype}" + raise ValueError(msg) from e + + return fill_value diff --git a/src/zarr/metadata.py b/src/zarr/metadata/v3.py similarity index 60% rename from src/zarr/metadata.py rename to src/zarr/metadata/v3.py index e801a6f96..828ac6c41 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata/v3.py @@ -1,157 +1,40 @@ from __future__ import annotations +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + from typing import TypeAlias + + import numpy.typing as npt + from typing_extensions import Self + + _bool: TypeAlias = bool + import json -from abc import ABC, abstractmethod from collections.abc import Iterable, Sequence from dataclasses import dataclass, field, replace from enum import Enum -from typing import TYPE_CHECKING, Any, Literal, cast, overload +from typing import Any, Literal, overload +import numcodecs.abc import numpy as np -import numpy.typing as npt from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, CodecPipeline -from zarr.abc.metadata import Metadata +from zarr.array_spec import ArraySpec from zarr.buffer import Buffer, BufferPrototype, default_buffer_prototype from zarr.chunk_grids import ChunkGrid, RegularChunkGrid -from zarr.chunk_key_encodings import ChunkKeyEncoding, parse_separator -from zarr.config import config -from zarr.registry import get_codec_class, get_pipeline_class - -if TYPE_CHECKING: - from typing_extensions import Self - -import numcodecs.abc - -from zarr.array_spec import ArraySpec +from zarr.chunk_key_encodings import ChunkKeyEncoding from zarr.common import ( JSON, ZARR_JSON, - ZARRAY_JSON, - ZATTRS_JSON, ChunkCoords, - ZarrFormat, parse_dtype, parse_named_configuration, parse_shapelike, ) -from zarr.config import parse_indexing_order - -# For type checking -_bool = bool - -__all__ = ["ArrayMetadata"] - - -class DataType(Enum): - bool = "bool" - int8 = "int8" - int16 = "int16" - int32 = "int32" - int64 = "int64" - uint8 = "uint8" - uint16 = "uint16" - uint32 = "uint32" - uint64 = "uint64" - float32 = "float32" - float64 = "float64" - - @property - def byte_count(self) -> int: - data_type_byte_counts = { - DataType.bool: 1, - DataType.int8: 1, - DataType.int16: 2, - DataType.int32: 4, - DataType.int64: 8, - DataType.uint8: 1, - DataType.uint16: 2, - DataType.uint32: 4, - DataType.uint64: 8, - DataType.float32: 4, - DataType.float64: 8, - } - return data_type_byte_counts[self] - - @property - def has_endianness(self) -> _bool: - # This might change in the future, e.g. for a complex with 2 8-bit floats - return self.byte_count != 1 - - def to_numpy_shortname(self) -> str: - data_type_to_numpy = { - DataType.bool: "bool", - DataType.int8: "i1", - DataType.int16: "i2", - DataType.int32: "i4", - DataType.int64: "i8", - DataType.uint8: "u1", - DataType.uint16: "u2", - DataType.uint32: "u4", - DataType.uint64: "u8", - DataType.float32: "f4", - DataType.float64: "f8", - } - return data_type_to_numpy[self] - - @classmethod - def from_dtype(cls, dtype: np.dtype[Any]) -> DataType: - dtype_to_data_type = { - "|b1": "bool", - "bool": "bool", - "|i1": "int8", - " np.dtype[Any]: - pass - - @property - @abstractmethod - def ndim(self) -> int: - pass - - @abstractmethod - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype - ) -> ArraySpec: - pass - - @abstractmethod - def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: - pass - - @abstractmethod - def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - pass - - @abstractmethod - def update_shape(self, shape: ChunkCoords) -> Self: - pass - - @abstractmethod - def update_attributes(self, attributes: dict[str, JSON]) -> Self: - pass +from zarr.config import config +from zarr.metadata.common import ArrayMetadata, parse_attributes +from zarr.registry import get_codec_class, get_pipeline_class @dataclass(frozen=True, kw_only=True) @@ -187,7 +70,7 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - fill_value_parsed = parse_fill_value_v3(fill_value, dtype=data_type_parsed) + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) @@ -287,7 +170,7 @@ def from_dict(cls, data: dict[str, JSON]) -> ArrayV3Metadata: _data = data.copy() # TODO: Remove the type: ignores[] comments below and use a TypedDict to type `data` # check that the zarr_format attribute is correct - _ = parse_zarr_format_v3(_data.pop("zarr_format")) # type: ignore[arg-type] + _ = parse_zarr_format(_data.pop("zarr_format")) # type: ignore[arg-type] # check that the node_type attribute is correct _ = parse_node_type_array(_data.pop("node_type")) # type: ignore[arg-type] @@ -316,202 +199,18 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) -@dataclass(frozen=True, kw_only=True) -class ArrayV2Metadata(ArrayMetadata): - shape: ChunkCoords - chunk_grid: RegularChunkGrid - data_type: np.dtype[Any] - fill_value: None | int | float = 0 - order: Literal["C", "F"] = "C" - filters: list[dict[str, JSON]] | None = None - dimension_separator: Literal[".", "/"] = "." - compressor: dict[str, JSON] | None = None - attributes: dict[str, JSON] = field(default_factory=dict) - zarr_format: Literal[2] = field(init=False, default=2) - - def __init__( - self, - *, - shape: ChunkCoords, - dtype: npt.DTypeLike, - chunks: ChunkCoords, - fill_value: Any, - order: Literal["C", "F"], - dimension_separator: Literal[".", "/"] = ".", - compressor: dict[str, JSON] | None = None, - filters: list[dict[str, JSON]] | None = None, - attributes: dict[str, JSON] | None = None, - ): - """ - Metadata for a Zarr version 2 array. - """ - shape_parsed = parse_shapelike(shape) - data_type_parsed = parse_dtype(dtype) - chunks_parsed = parse_shapelike(chunks) - compressor_parsed = parse_compressor(compressor) - order_parsed = parse_indexing_order(order) - dimension_separator_parsed = parse_separator(dimension_separator) - filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value_v2(fill_value, dtype=data_type_parsed) - attributes_parsed = parse_attributes(attributes) - - object.__setattr__(self, "shape", shape_parsed) - object.__setattr__(self, "data_type", data_type_parsed) - object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed)) - object.__setattr__(self, "compressor", compressor_parsed) - object.__setattr__(self, "order", order_parsed) - object.__setattr__(self, "dimension_separator", dimension_separator_parsed) - object.__setattr__(self, "filters", filters_parsed) - object.__setattr__(self, "fill_value", fill_value_parsed) - object.__setattr__(self, "attributes", attributes_parsed) - - # ensure that the metadata document is consistent - _ = parse_v2_metadata(self) - - @property - def ndim(self) -> int: - return len(self.shape) - - @property - def dtype(self) -> np.dtype[Any]: - return self.data_type - - @property - def chunks(self) -> ChunkCoords: - return self.chunk_grid.chunk_shape - - def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - def _json_convert( - o: Any, - ) -> Any: - if isinstance(o, np.dtype): - if o.fields is None: - return o.str - else: - return o.descr - if np.isscalar(o): - # convert numpy scalar to python type, and pass - # python types through - return getattr(o, "item", lambda: o)() - raise TypeError - - zarray_dict = self.to_dict() - assert isinstance(zarray_dict, dict) - zattrs_dict = zarray_dict.pop("attributes", {}) - assert isinstance(zattrs_dict, dict) - json_indent = config.get("json_indent") - return { - ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode() - ), - ZATTRS_JSON: prototype.buffer.from_bytes( - json.dumps(zattrs_dict, indent=json_indent).encode() - ), - } - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: - # make a copy to protect the original from modification - _data = data.copy() - # check that the zarr_format attribute is correct - _ = parse_zarr_format_v2(_data.pop("zarr_format")) - return cls(**_data) - - def to_dict(self) -> JSON: - zarray_dict = super().to_dict() - - assert isinstance(zarray_dict, dict) - - _ = zarray_dict.pop("chunk_grid") - zarray_dict["chunks"] = self.chunk_grid.chunk_shape - - _ = zarray_dict.pop("data_type") - zarray_dict["dtype"] = self.data_type.str - - return zarray_dict - - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype - ) -> ArraySpec: - return ArraySpec( - shape=self.chunk_grid.chunk_shape, - dtype=self.dtype, - fill_value=self.fill_value, - order=order, - prototype=prototype, - ) - - def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: - chunk_identifier = self.dimension_separator.join(map(str, chunk_coords)) - return "0" if chunk_identifier == "" else chunk_identifier - - def update_shape(self, shape: ChunkCoords) -> Self: - return replace(self, shape=shape) - - def update_attributes(self, attributes: dict[str, JSON]) -> Self: - return replace(self, attributes=attributes) - - -def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None: - if data is None: - return data - elif all(isinstance(x, type(None) | str) for x in data): - return tuple(data) - else: - msg = f"Expected either None or a iterable of str, got {type(data)}" - raise TypeError(msg) - - -# todo: real validation -def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]: - if data is None: - return {} - - return data - - -# todo: move to its own module and drop _v3 suffix -# todo: consider folding all the literal parsing into a single function -# that takes 2 arguments -def parse_zarr_format_v3(data: Literal[3]) -> Literal[3]: +def parse_zarr_format(data: Literal[3]) -> Literal[3]: if data == 3: - return data + return 3 raise ValueError(f"Invalid value. Expected 3. Got {data}.") -# todo: move to its own module and drop _v2 suffix -def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]: - if data == 2: - return data - raise ValueError(f"Invalid value. Expected 2. Got {data}.") - - def parse_node_type_array(data: Literal["array"]) -> Literal["array"]: if data == "array": - return data + return "array" raise ValueError(f"Invalid value. Expected 'array'. Got {data}.") -# todo: real validation -def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None: - return data - - -# todo: real validation -def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None: - return data - - -def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: - if (l_chunks := len(data.chunks)) != (l_shape := len(data.shape)): - msg = ( - f"The `shape` and `chunks` attributes must have the same length. " - f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}." - ) - raise ValueError(msg) - return data - - def create_pipeline(data: Iterable[Codec | JSON]) -> CodecPipeline: if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") @@ -536,27 +235,6 @@ def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]: return out -def parse_fill_value_v2(fill_value: Any, dtype: np.dtype[Any]) -> Any: - """ - Parse a potential fill value into a value that is compatible with the provided dtype. - - This is a light wrapper around zarr.v2.util.normalize_fill_value. - - Parameters - ---------- - fill_value: Any - A potential fill value. - dtype: np.dtype[Any] - A numpy dtype. - - Returns - An instance of `dtype`, or `None`, or any python object (in the case of an object dtype) - """ - from zarr.v2.util import normalize_fill_value - - return normalize_fill_value(fill_value=fill_value, dtype=dtype) - - BOOL = np.bool_ BOOL_DTYPE = np.dtypes.BoolDType @@ -580,22 +258,22 @@ def parse_fill_value_v2(fill_value: Any, dtype: np.dtype[Any]) -> Any: @overload -def parse_fill_value_v3(fill_value: Any, dtype: BOOL_DTYPE) -> BOOL: ... +def parse_fill_value(fill_value: Any, dtype: BOOL_DTYPE) -> BOOL: ... @overload -def parse_fill_value_v3(fill_value: Any, dtype: INTEGER_DTYPE) -> INTEGER: ... +def parse_fill_value(fill_value: Any, dtype: INTEGER_DTYPE) -> INTEGER: ... @overload -def parse_fill_value_v3(fill_value: Any, dtype: FLOAT_DTYPE) -> FLOAT: ... +def parse_fill_value(fill_value: Any, dtype: FLOAT_DTYPE) -> FLOAT: ... @overload -def parse_fill_value_v3(fill_value: Any, dtype: COMPLEX_DTYPE) -> COMPLEX: ... +def parse_fill_value(fill_value: Any, dtype: COMPLEX_DTYPE) -> COMPLEX: ... -def parse_fill_value_v3( +def parse_fill_value( fill_value: Any, dtype: BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE ) -> BOOL | INTEGER | FLOAT | COMPLEX: """ @@ -636,3 +314,83 @@ def parse_fill_value_v3( msg = f"Cannot parse non-string sequence {fill_value} as a scalar with type {dtype}." raise TypeError(msg) return dtype.type(fill_value) + + +def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None: + if data is None: + return data + elif all(isinstance(x, type(None) | str) for x in data): + return tuple(data) + else: + msg = f"Expected either None or a iterable of str, got {type(data)}" + raise TypeError(msg) + + +class DataType(Enum): + bool = "bool" + int8 = "int8" + int16 = "int16" + int32 = "int32" + int64 = "int64" + uint8 = "uint8" + uint16 = "uint16" + uint32 = "uint32" + uint64 = "uint64" + float32 = "float32" + float64 = "float64" + + @property + def byte_count(self) -> int: + data_type_byte_counts = { + DataType.bool: 1, + DataType.int8: 1, + DataType.int16: 2, + DataType.int32: 4, + DataType.int64: 8, + DataType.uint8: 1, + DataType.uint16: 2, + DataType.uint32: 4, + DataType.uint64: 8, + DataType.float32: 4, + DataType.float64: 8, + } + return data_type_byte_counts[self] + + @property + def has_endianness(self) -> _bool: + # This might change in the future, e.g. for a complex with 2 8-bit floats + return self.byte_count != 1 + + def to_numpy_shortname(self) -> str: + data_type_to_numpy = { + DataType.bool: "bool", + DataType.int8: "i1", + DataType.int16: "i2", + DataType.int32: "i4", + DataType.int64: "i8", + DataType.uint8: "u1", + DataType.uint16: "u2", + DataType.uint32: "u4", + DataType.uint64: "u8", + DataType.float32: "f4", + DataType.float64: "f8", + } + return data_type_to_numpy[self] + + @classmethod + def from_dtype(cls, dtype: np.dtype[Any]) -> DataType: + dtype_to_data_type = { + "|b1": "bool", + "bool": "bool", + "|i1": "int8", + " None: - assert parse_zarr_format_v2(2) == 2 + assert parse_zarr_format(2) == 2 @pytest.mark.parametrize("data", [None, 1, 3, 4, 5, "3"]) def test_parse_zarr_format_invalid(data: Any) -> None: with pytest.raises(ValueError, match=f"Invalid value. Expected 2. Got {data}"): - parse_zarr_format_v2(data) + parse_zarr_format(data) @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 2b25c776e..ed568462d 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -15,9 +15,8 @@ import numpy as np import pytest -from zarr.metadata import ArrayV3Metadata, parse_dimension_names -from zarr.metadata import parse_fill_value_v3 as parse_fill_value -from zarr.metadata import parse_zarr_format_v3 as parse_zarr_format +from zarr.metadata import ArrayV3Metadata +from zarr.metadata.v3 import parse_dimension_names, parse_fill_value, parse_zarr_format bool_dtypes = ("bool",)