From 1c8d97959c3ed929508547754b8dfc217ddfd510 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 9 Sep 2024 21:54:00 +0200 Subject: [PATCH 1/5] refactor: split metadata into v2 and v3 modules --- src/zarr/api/asynchronous.py | 3 +- src/zarr/codecs/sharding.py | 2 +- src/zarr/core/array.py | 4 +- src/zarr/core/metadata/__init__.py | 4 + src/zarr/core/metadata/common.py | 67 +++ src/zarr/core/metadata/v2.py | 205 +++++++ src/zarr/core/{metadata.py => metadata/v3.py} | 505 +++++------------- tests/v3/test_metadata/test_v2.py | 4 +- tests/v3/test_metadata/test_v3.py | 5 +- 9 files changed, 416 insertions(+), 383 deletions(-) create mode 100644 src/zarr/core/metadata/__init__.py create mode 100644 src/zarr/core/metadata/common.py create mode 100644 src/zarr/core/metadata/v2.py rename src/zarr/core/{metadata.py => metadata/v3.py} (61%) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 7f59517f3..a1c821696 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -10,7 +10,8 @@ from zarr.core.array import Array, AsyncArray from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup -from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.store import ( StoreLike, make_store_path, diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 528110472..df7f5978a 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -44,7 +44,7 @@ get_indexer, morton_order_iter, ) -from zarr.core.metadata import parse_codecs +from zarr.core.metadata.v3 import parse_codecs from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec if TYPE_CHECKING: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index dcd7217d7..7311b6eec 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -55,7 +55,8 @@ is_scalar, pop_fields, ) -from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.registry import get_pipeline_class from zarr.store import StoreLike, StorePath, make_store_path @@ -67,6 +68,7 @@ from collections.abc import Iterable from zarr.abc.codec import Codec, CodecPipeline + from zarr.core.metadata.common import ArrayMetadata # Array and AsyncArray are defined in the base ``zarr`` namespace __all__ = ["parse_array_metadata", "create_codec_pipeline"] diff --git a/src/zarr/core/metadata/__init__.py b/src/zarr/core/metadata/__init__.py new file mode 100644 index 000000000..addf47339 --- /dev/null +++ b/src/zarr/core/metadata/__init__.py @@ -0,0 +1,4 @@ +from .v2 import ArrayV2Metadata +from .v3 import ArrayV3Metadata + +__all__ = ["ArrayV2Metadata", "ArrayV3Metadata"] diff --git a/src/zarr/core/metadata/common.py b/src/zarr/core/metadata/common.py new file mode 100644 index 000000000..583375b4b --- /dev/null +++ b/src/zarr/core/metadata/common.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import Any, Literal + + import numpy as np + from typing_extensions import Self + + from zarr.core.array_spec import ArraySpec + from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.chunk_grids import ChunkGrid + from zarr.core.common import JSON, ChunkCoords, ZarrFormat + +from abc import ABC, abstractmethod +from dataclasses import dataclass + +from zarr.abc.metadata import Metadata + + +@dataclass(frozen=True, kw_only=True) +class ArrayMetadata(Metadata, ABC): + shape: ChunkCoords + fill_value: Any + chunk_grid: ChunkGrid + attributes: dict[str, JSON] + zarr_format: ZarrFormat + + @property + @abstractmethod + def dtype(self) -> np.dtype[Any]: + pass + + @property + @abstractmethod + def ndim(self) -> int: + pass + + @abstractmethod + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype + ) -> ArraySpec: + pass + + @abstractmethod + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + pass + + @abstractmethod + def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + pass + + @abstractmethod + def update_shape(self, shape: ChunkCoords) -> Self: + pass + + @abstractmethod + def update_attributes(self, attributes: dict[str, JSON]) -> Self: + pass + + +def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]: + if data is None: + return {} + + return data diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py new file mode 100644 index 000000000..813adeff5 --- /dev/null +++ b/src/zarr/core/metadata/v2.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import Any, Literal + + import numpy.typing as npt + from typing_extensions import Self + + from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.common import JSON, ChunkCoords + +import json +from dataclasses import dataclass, field, replace + +import numpy as np + +from zarr.core.array_spec import ArraySpec +from zarr.core.chunk_grids import RegularChunkGrid +from zarr.core.chunk_key_encodings import parse_separator +from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike +from zarr.core.config import config, parse_indexing_order +from zarr.core.metadata.common import ArrayMetadata, parse_attributes + + +@dataclass(frozen=True, kw_only=True) +class ArrayV2Metadata(ArrayMetadata): + shape: ChunkCoords + chunk_grid: RegularChunkGrid + data_type: np.dtype[Any] + fill_value: None | int | float = 0 + order: Literal["C", "F"] = "C" + filters: list[dict[str, JSON]] | None = None + dimension_separator: Literal[".", "/"] = "." + compressor: dict[str, JSON] | None = None + attributes: dict[str, JSON] = field(default_factory=dict) + zarr_format: Literal[2] = field(init=False, default=2) + + def __init__( + self, + *, + shape: ChunkCoords, + dtype: npt.DTypeLike, + chunks: ChunkCoords, + fill_value: Any, + order: Literal["C", "F"], + dimension_separator: Literal[".", "/"] = ".", + compressor: dict[str, JSON] | None = None, + filters: list[dict[str, JSON]] | None = None, + attributes: dict[str, JSON] | None = None, + ): + """ + Metadata for a Zarr version 2 array. + """ + shape_parsed = parse_shapelike(shape) + data_type_parsed = parse_dtype(dtype) + chunks_parsed = parse_shapelike(chunks) + compressor_parsed = parse_compressor(compressor) + order_parsed = parse_indexing_order(order) + dimension_separator_parsed = parse_separator(dimension_separator) + filters_parsed = parse_filters(filters) + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) + attributes_parsed = parse_attributes(attributes) + + object.__setattr__(self, "shape", shape_parsed) + object.__setattr__(self, "data_type", data_type_parsed) + object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed)) + object.__setattr__(self, "compressor", compressor_parsed) + object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "dimension_separator", dimension_separator_parsed) + object.__setattr__(self, "filters", filters_parsed) + object.__setattr__(self, "fill_value", fill_value_parsed) + object.__setattr__(self, "attributes", attributes_parsed) + + # ensure that the metadata document is consistent + _ = parse_metadata(self) + + @property + def ndim(self) -> int: + return len(self.shape) + + @property + def dtype(self) -> np.dtype[Any]: + return self.data_type + + @property + def chunks(self) -> ChunkCoords: + return self.chunk_grid.chunk_shape + + def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + def _json_convert( + o: Any, + ) -> Any: + if isinstance(o, np.dtype): + if o.fields is None: + return o.str + else: + return o.descr + if np.isscalar(o): + # convert numpy scalar to python type, and pass + # python types through + return getattr(o, "item", lambda: o)() + raise TypeError + + zarray_dict = self.to_dict() + assert isinstance(zarray_dict, dict) + zattrs_dict = zarray_dict.pop("attributes", {}) + assert isinstance(zattrs_dict, dict) + json_indent = config.get("json_indent") + return { + ZARRAY_JSON: prototype.buffer.from_bytes( + json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode() + ), + ZATTRS_JSON: prototype.buffer.from_bytes( + json.dumps(zattrs_dict, indent=json_indent).encode() + ), + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: + # make a copy to protect the original from modification + _data = data.copy() + # check that the zarr_format attribute is correct + _ = parse_zarr_format_v2(_data.pop("zarr_format")) + return cls(**_data) + + def to_dict(self) -> JSON: + zarray_dict = super().to_dict() + + assert isinstance(zarray_dict, dict) + + _ = zarray_dict.pop("chunk_grid") + zarray_dict["chunks"] = self.chunk_grid.chunk_shape + + _ = zarray_dict.pop("data_type") + zarray_dict["dtype"] = self.data_type.str + + return zarray_dict + + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype + ) -> ArraySpec: + return ArraySpec( + shape=self.chunk_grid.chunk_shape, + dtype=self.dtype, + fill_value=self.fill_value, + order=order, + prototype=prototype, + ) + + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + chunk_identifier = self.dimension_separator.join(map(str, chunk_coords)) + return "0" if chunk_identifier == "" else chunk_identifier + + def update_shape(self, shape: ChunkCoords) -> Self: + return replace(self, shape=shape) + + def update_attributes(self, attributes: dict[str, JSON]) -> Self: + return replace(self, attributes=attributes) + + +def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]: + if data == 2: + return data + raise ValueError(f"Invalid value. Expected 2. Got {data}.") + + +def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None: + return data + + +def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None: + return data + + +def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: + if (l_chunks := len(data.chunks)) != (l_shape := len(data.shape)): + msg = ( + f"The `shape` and `chunks` attributes must have the same length. " + f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}." + ) + raise ValueError(msg) + return data + + +def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: + """ + Parse a potential fill value into a value that is compatible with the provided dtype. + + This is a light wrapper around zarr.v2.util.normalize_fill_value. + + Parameters + ---------- + fill_value: Any + A potential fill value. + dtype: np.dtype[Any] + A numpy dtype. + + Returns + An instance of `dtype`, or `None`, or any python object (in the case of an object dtype) + """ + from zarr.v2.util import normalize_fill_value + + return normalize_fill_value(fill_value=fill_value, dtype=dtype) diff --git a/src/zarr/core/metadata.py b/src/zarr/core/metadata/v3.py similarity index 61% rename from src/zarr/core/metadata.py rename to src/zarr/core/metadata/v3.py index d25559cd5..195c3bd0a 100644 --- a/src/zarr/core/metadata.py +++ b/src/zarr/core/metadata/v3.py @@ -1,156 +1,73 @@ from __future__ import annotations +from typing import TYPE_CHECKING, cast, overload + +if TYPE_CHECKING: + import numpy.typing as npt + from typing_extensions import Self + + from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.chunk_grids import ChunkGrid + from zarr.core.common import JSON, ChunkCoords + import json -from abc import ABC, abstractmethod from collections.abc import Iterable, Sequence from dataclasses import dataclass, field, replace from enum import Enum -from typing import TYPE_CHECKING, Any, Literal, cast, overload +from typing import Any, Literal +import numcodecs.abc import numpy as np -import numpy.typing as npt from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, CodecPipeline -from zarr.abc.metadata import Metadata -from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype +from zarr.core.array_spec import ArraySpec +from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid -from zarr.core.chunk_key_encodings import ChunkKeyEncoding, parse_separator +from zarr.core.chunk_key_encodings import ChunkKeyEncoding +from zarr.core.common import ZARR_JSON, parse_dtype, parse_named_configuration, parse_shapelike +from zarr.core.config import config +from zarr.core.metadata.common import ArrayMetadata, parse_attributes from zarr.registry import get_codec_class, get_pipeline_class -if TYPE_CHECKING: - from typing_extensions import Self - -import numcodecs.abc - -from zarr.core.array_spec import ArraySpec -from zarr.core.common import ( - JSON, - ZARR_JSON, - ZARRAY_JSON, - ZATTRS_JSON, - ChunkCoords, - ZarrFormat, - parse_dtype, - parse_named_configuration, - parse_shapelike, -) -from zarr.core.config import config, parse_indexing_order - -# For type checking -_bool = bool -__all__ = ["ArrayMetadata"] - - -class DataType(Enum): - bool = "bool" - int8 = "int8" - int16 = "int16" - int32 = "int32" - int64 = "int64" - uint8 = "uint8" - uint16 = "uint16" - uint32 = "uint32" - uint64 = "uint64" - float32 = "float32" - float64 = "float64" - - @property - def byte_count(self) -> int: - data_type_byte_counts = { - DataType.bool: 1, - DataType.int8: 1, - DataType.int16: 2, - DataType.int32: 4, - DataType.int64: 8, - DataType.uint8: 1, - DataType.uint16: 2, - DataType.uint32: 4, - DataType.uint64: 8, - DataType.float32: 4, - DataType.float64: 8, - } - return data_type_byte_counts[self] - - @property - def has_endianness(self) -> _bool: - # This might change in the future, e.g. for a complex with 2 8-bit floats - return self.byte_count != 1 - - def to_numpy_shortname(self) -> str: - data_type_to_numpy = { - DataType.bool: "bool", - DataType.int8: "i1", - DataType.int16: "i2", - DataType.int32: "i4", - DataType.int64: "i8", - DataType.uint8: "u1", - DataType.uint16: "u2", - DataType.uint32: "u4", - DataType.uint64: "u8", - DataType.float32: "f4", - DataType.float64: "f8", - } - return data_type_to_numpy[self] - - @classmethod - def from_dtype(cls, dtype: np.dtype[Any]) -> DataType: - dtype_to_data_type = { - "|b1": "bool", - "bool": "bool", - "|i1": "int8", - " Literal[3]: + if data == 3: + return data + raise ValueError(f"Invalid value. Expected 3. Got {data}.") -@dataclass(frozen=True, kw_only=True) -class ArrayMetadata(Metadata, ABC): - shape: ChunkCoords - fill_value: Any - chunk_grid: ChunkGrid - attributes: dict[str, JSON] - zarr_format: ZarrFormat +def parse_node_type_array(data: Literal["array"]) -> Literal["array"]: + if data == "array": + return data + raise ValueError(f"Invalid value. Expected 'array'. Got {data}.") - @property - @abstractmethod - def dtype(self) -> np.dtype[Any]: - pass - @property - @abstractmethod - def ndim(self) -> int: - pass +def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]: + out: tuple[Codec, ...] = () - @abstractmethod - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype - ) -> ArraySpec: - pass + if not isinstance(data, Iterable): + raise TypeError(f"Expected iterable, got {type(data)}") - @abstractmethod - def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: - pass + for c in data: + if isinstance( + c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec + ): # Can't use Codec here because of mypy limitation + out += (c,) + else: + name_parsed, _ = parse_named_configuration(c, require_configuration=False) + out += (get_codec_class(name_parsed).from_dict(c),) - @abstractmethod - def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - pass + return out - @abstractmethod - def update_shape(self, shape: ChunkCoords) -> Self: - pass - @abstractmethod - def update_attributes(self, attributes: dict[str, JSON]) -> Self: - pass +def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None: + if data is None: + return data + elif all(isinstance(x, type(None) | str) for x in data): + return tuple(data) + else: + msg = f"Expected either None or a iterable of str, got {type(data)}" + raise TypeError(msg) @dataclass(frozen=True, kw_only=True) @@ -186,7 +103,7 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - fill_value_parsed = parse_fill_value_v3(fill_value, dtype=data_type_parsed) + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) @@ -294,7 +211,7 @@ def from_dict(cls, data: dict[str, JSON]) -> ArrayV3Metadata: _data = data.copy() # TODO: Remove the type: ignores[] comments below and use a TypedDict to type `data` # check that the zarr_format attribute is correct - _ = parse_zarr_format_v3(_data.pop("zarr_format")) # type: ignore[arg-type] + _ = parse_zarr_format(_data.pop("zarr_format")) # type: ignore[arg-type] # check that the node_type attribute is correct _ = parse_node_type_array(_data.pop("node_type")) # type: ignore[arg-type] @@ -323,250 +240,14 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) -@dataclass(frozen=True, kw_only=True) -class ArrayV2Metadata(ArrayMetadata): - shape: ChunkCoords - chunk_grid: RegularChunkGrid - data_type: np.dtype[Any] - fill_value: None | int | float = 0 - order: Literal["C", "F"] = "C" - filters: list[dict[str, JSON]] | None = None - dimension_separator: Literal[".", "/"] = "." - compressor: dict[str, JSON] | None = None - attributes: dict[str, JSON] = field(default_factory=dict) - zarr_format: Literal[2] = field(init=False, default=2) - - def __init__( - self, - *, - shape: ChunkCoords, - dtype: npt.DTypeLike, - chunks: ChunkCoords, - fill_value: Any, - order: Literal["C", "F"], - dimension_separator: Literal[".", "/"] = ".", - compressor: dict[str, JSON] | None = None, - filters: list[dict[str, JSON]] | None = None, - attributes: dict[str, JSON] | None = None, - ): - """ - Metadata for a Zarr version 2 array. - """ - shape_parsed = parse_shapelike(shape) - data_type_parsed = parse_dtype(dtype) - chunks_parsed = parse_shapelike(chunks) - compressor_parsed = parse_compressor(compressor) - order_parsed = parse_indexing_order(order) - dimension_separator_parsed = parse_separator(dimension_separator) - filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value_v2(fill_value, dtype=data_type_parsed) - attributes_parsed = parse_attributes(attributes) - - object.__setattr__(self, "shape", shape_parsed) - object.__setattr__(self, "data_type", data_type_parsed) - object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed)) - object.__setattr__(self, "compressor", compressor_parsed) - object.__setattr__(self, "order", order_parsed) - object.__setattr__(self, "dimension_separator", dimension_separator_parsed) - object.__setattr__(self, "filters", filters_parsed) - object.__setattr__(self, "fill_value", fill_value_parsed) - object.__setattr__(self, "attributes", attributes_parsed) - - # ensure that the metadata document is consistent - _ = parse_v2_metadata(self) - - @property - def ndim(self) -> int: - return len(self.shape) - - @property - def dtype(self) -> np.dtype[Any]: - return self.data_type - - @property - def chunks(self) -> ChunkCoords: - return self.chunk_grid.chunk_shape - - def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - def _json_convert( - o: Any, - ) -> Any: - if isinstance(o, np.dtype): - if o.fields is None: - return o.str - else: - return o.descr - if np.isscalar(o): - # convert numpy scalar to python type, and pass - # python types through - return getattr(o, "item", lambda: o)() - raise TypeError - - zarray_dict = self.to_dict() - assert isinstance(zarray_dict, dict) - zattrs_dict = zarray_dict.pop("attributes", {}) - assert isinstance(zattrs_dict, dict) - json_indent = config.get("json_indent") - return { - ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode() - ), - ZATTRS_JSON: prototype.buffer.from_bytes( - json.dumps(zattrs_dict, indent=json_indent).encode() - ), - } - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: - # make a copy to protect the original from modification - _data = data.copy() - # check that the zarr_format attribute is correct - _ = parse_zarr_format_v2(_data.pop("zarr_format")) - return cls(**_data) - - def to_dict(self) -> JSON: - zarray_dict = super().to_dict() - - assert isinstance(zarray_dict, dict) - - _ = zarray_dict.pop("chunk_grid") - zarray_dict["chunks"] = self.chunk_grid.chunk_shape - - _ = zarray_dict.pop("data_type") - zarray_dict["dtype"] = self.data_type.str - - return zarray_dict - - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype - ) -> ArraySpec: - return ArraySpec( - shape=self.chunk_grid.chunk_shape, - dtype=self.dtype, - fill_value=self.fill_value, - order=order, - prototype=prototype, - ) - - def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: - chunk_identifier = self.dimension_separator.join(map(str, chunk_coords)) - return "0" if chunk_identifier == "" else chunk_identifier - - def update_shape(self, shape: ChunkCoords) -> Self: - return replace(self, shape=shape) - - def update_attributes(self, attributes: dict[str, JSON]) -> Self: - return replace(self, attributes=attributes) - - -def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None: - if data is None: - return data - elif all(isinstance(x, type(None) | str) for x in data): - return tuple(data) - else: - msg = f"Expected either None or a iterable of str, got {type(data)}" - raise TypeError(msg) - - -# todo: real validation -def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]: - if data is None: - return {} - - return data - - -# todo: move to its own module and drop _v3 suffix -# todo: consider folding all the literal parsing into a single function -# that takes 2 arguments -def parse_zarr_format_v3(data: Literal[3]) -> Literal[3]: - if data == 3: - return data - raise ValueError(f"Invalid value. Expected 3. Got {data}.") - - -# todo: move to its own module and drop _v2 suffix -def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]: - if data == 2: - return data - raise ValueError(f"Invalid value. Expected 2. Got {data}.") - - -def parse_node_type_array(data: Literal["array"]) -> Literal["array"]: - if data == "array": - return data - raise ValueError(f"Invalid value. Expected 'array'. Got {data}.") - - -# todo: real validation -def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None: - return data - - -# todo: real validation -def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None: - return data - - -def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: - if (l_chunks := len(data.chunks)) != (l_shape := len(data.shape)): - msg = ( - f"The `shape` and `chunks` attributes must have the same length. " - f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}." - ) - raise ValueError(msg) - return data - - def create_pipeline(data: Iterable[Codec | JSON]) -> CodecPipeline: if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") return get_pipeline_class().from_dict(data) -def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]: - out: tuple[Codec, ...] = () - - if not isinstance(data, Iterable): - raise TypeError(f"Expected iterable, got {type(data)}") - - for c in data: - if isinstance( - c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec - ): # Can't use Codec here because of mypy limitation - out += (c,) - else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - out += (get_codec_class(name_parsed).from_dict(c),) - - return out - - -def parse_fill_value_v2(fill_value: Any, dtype: np.dtype[Any]) -> Any: - """ - Parse a potential fill value into a value that is compatible with the provided dtype. - - This is a light wrapper around zarr.v2.util.normalize_fill_value. - - Parameters - ---------- - fill_value: Any - A potential fill value. - dtype: np.dtype[Any] - A numpy dtype. - - Returns - An instance of `dtype`, or `None`, or any python object (in the case of an object dtype) - """ - from zarr.v2.util import normalize_fill_value - - return normalize_fill_value(fill_value=fill_value, dtype=dtype) - - BOOL = np.bool_ BOOL_DTYPE = np.dtypes.BoolDType - INTEGER_DTYPE = ( np.dtypes.Int8DType | np.dtypes.Int16DType @@ -577,33 +258,31 @@ def parse_fill_value_v2(fill_value: Any, dtype: np.dtype[Any]) -> Any: | np.dtypes.UInt32DType | np.dtypes.UInt64DType ) - INTEGER = np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 FLOAT_DTYPE = np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType FLOAT = np.float16 | np.float32 | np.float64 COMPLEX_DTYPE = np.dtypes.Complex64DType | np.dtypes.Complex128DType COMPLEX = np.complex64 | np.complex128 -# todo: r* dtypes @overload -def parse_fill_value_v3(fill_value: Any, dtype: BOOL_DTYPE) -> BOOL: ... +def parse_fill_value(fill_value: Any, dtype: BOOL_DTYPE) -> BOOL: ... @overload -def parse_fill_value_v3(fill_value: Any, dtype: INTEGER_DTYPE) -> INTEGER: ... +def parse_fill_value(fill_value: Any, dtype: INTEGER_DTYPE) -> INTEGER: ... @overload -def parse_fill_value_v3(fill_value: Any, dtype: FLOAT_DTYPE) -> FLOAT: ... +def parse_fill_value(fill_value: Any, dtype: FLOAT_DTYPE) -> FLOAT: ... @overload -def parse_fill_value_v3(fill_value: Any, dtype: COMPLEX_DTYPE) -> COMPLEX: ... +def parse_fill_value(fill_value: Any, dtype: COMPLEX_DTYPE) -> COMPLEX: ... @overload -def parse_fill_value_v3(fill_value: Any, dtype: np.dtype[Any]) -> Any: +def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: # This dtype[Any] is unfortunately necessary right now. # See https://github.com/zarr-developers/zarr-python/issues/2131#issuecomment-2318010899 # for more details, but `dtype` here (which comes from `parse_dtype`) @@ -614,7 +293,7 @@ def parse_fill_value_v3(fill_value: Any, dtype: np.dtype[Any]) -> Any: ... -def parse_fill_value_v3( +def parse_fill_value( fill_value: Any, dtype: BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | np.dtype[Any] ) -> BOOL | INTEGER | FLOAT | COMPLEX | Any: """ @@ -655,3 +334,77 @@ def parse_fill_value_v3( msg = f"Cannot parse non-string sequence {fill_value} as a scalar with type {dtype}." raise TypeError(msg) return dtype.type(fill_value) + + +# For type checking +_bool = bool + + +class DataType(Enum): + bool = "bool" + int8 = "int8" + int16 = "int16" + int32 = "int32" + int64 = "int64" + uint8 = "uint8" + uint16 = "uint16" + uint32 = "uint32" + uint64 = "uint64" + float32 = "float32" + float64 = "float64" + + @property + def byte_count(self) -> int: + data_type_byte_counts = { + DataType.bool: 1, + DataType.int8: 1, + DataType.int16: 2, + DataType.int32: 4, + DataType.int64: 8, + DataType.uint8: 1, + DataType.uint16: 2, + DataType.uint32: 4, + DataType.uint64: 8, + DataType.float32: 4, + DataType.float64: 8, + } + return data_type_byte_counts[self] + + @property + def has_endianness(self) -> _bool: + # This might change in the future, e.g. for a complex with 2 8-bit floats + return self.byte_count != 1 + + def to_numpy_shortname(self) -> str: + data_type_to_numpy = { + DataType.bool: "bool", + DataType.int8: "i1", + DataType.int16: "i2", + DataType.int32: "i4", + DataType.int64: "i8", + DataType.uint8: "u1", + DataType.uint16: "u2", + DataType.uint32: "u4", + DataType.uint64: "u8", + DataType.float32: "f4", + DataType.float64: "f8", + } + return data_type_to_numpy[self] + + @classmethod + def from_dtype(cls, dtype: np.dtype[Any]) -> DataType: + dtype_to_data_type = { + "|b1": "bool", + "bool": "bool", + "|i1": "int8", + " None: diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index bc43154a5..50f3356b3 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -7,6 +7,7 @@ from zarr.codecs.bytes import BytesCodec from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding +from zarr.core.metadata.v3 import ArrayV3Metadata if TYPE_CHECKING: from collections.abc import Sequence @@ -18,9 +19,7 @@ import numpy as np import pytest -from zarr.core.metadata import ArrayV3Metadata, parse_dimension_names -from zarr.core.metadata import parse_fill_value_v3 as parse_fill_value -from zarr.core.metadata import parse_zarr_format_v3 as parse_zarr_format +from zarr.core.metadata.v3 import parse_dimension_names, parse_fill_value, parse_zarr_format bool_dtypes = ("bool",) From c445593837c9cabbaf8beb2c512c33b1d38eb53b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 12 Sep 2024 20:14:46 +0200 Subject: [PATCH 2/5] add more explicit typeguards --- src/zarr/core/metadata/v2.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 813adeff5..fe6e050c8 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -104,9 +104,12 @@ def _json_convert( raise TypeError zarray_dict = self.to_dict() - assert isinstance(zarray_dict, dict) + + # todo: remove this check when we can ensure that to_dict always returns dicts. + if not isinstance(zarray_dict, dict): + raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.") + zattrs_dict = zarray_dict.pop("attributes", {}) - assert isinstance(zattrs_dict, dict) json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( @@ -128,7 +131,9 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> JSON: zarray_dict = super().to_dict() - assert isinstance(zarray_dict, dict) + # todo: remove this check when we can ensure that to_dict always returns dicts. + if not isinstance(zarray_dict, dict): + raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.") _ = zarray_dict.pop("chunk_grid") zarray_dict["chunks"] = self.chunk_grid.chunk_shape From a179c1eb9ea5ccb06e1b78f9b3699c0c2add7115 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 12 Sep 2024 20:21:25 +0200 Subject: [PATCH 3/5] port fill value normalization from v2 --- src/zarr/core/metadata/v2.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index fe6e050c8..0dc63f1ff 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -193,8 +193,6 @@ def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: """ Parse a potential fill value into a value that is compatible with the provided dtype. - This is a light wrapper around zarr.v2.util.normalize_fill_value. - Parameters ---------- fill_value: Any @@ -205,6 +203,33 @@ def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: Returns An instance of `dtype`, or `None`, or any python object (in the case of an object dtype) """ - from zarr.v2.util import normalize_fill_value - return normalize_fill_value(fill_value=fill_value, dtype=dtype) + if fill_value is None or dtype.hasobject: + # no fill value + pass + elif not isinstance(fill_value, np.void) and fill_value == 0: + # this should be compatible across numpy versions for any array type, including + # structured arrays + fill_value = np.zeros((), dtype=dtype)[()] + + elif dtype.kind == "U": + # special case unicode because of encoding issues on Windows if passed through numpy + # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 + + if not isinstance(fill_value, str): + raise ValueError( + f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string" + ) + else: + try: + if isinstance(fill_value, bytes) and dtype.kind == "V": + # special case for numpy 1.14 compatibility + fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()] + else: + fill_value = np.array(fill_value, dtype=dtype)[()] + + except Exception as e: + msg = f"Fill_value {fill_value} is not valid for dtype {dtype}." + raise ValueError(msg) from e + + return fill_value From 700d9b820ffec96420471052678516e02479254d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 12 Sep 2024 20:22:01 +0200 Subject: [PATCH 4/5] remove v2 suffix from zarr format parsing --- src/zarr/core/metadata/v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 0dc63f1ff..6d5ecd7e8 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -125,7 +125,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # make a copy to protect the original from modification _data = data.copy() # check that the zarr_format attribute is correct - _ = parse_zarr_format_v2(_data.pop("zarr_format")) + _ = parse_zarr_format(_data.pop("zarr_format")) return cls(**_data) def to_dict(self) -> JSON: @@ -165,7 +165,7 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) -def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]: +def parse_zarr_format(data: Literal[2]) -> Literal[2]: if data == 2: return data raise ValueError(f"Invalid value. Expected 2. Got {data}.") From 5994652c26ae7743d5a57c7d3642ff1bf837d4c7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 12 Sep 2024 20:22:16 +0200 Subject: [PATCH 5/5] remove v2 suffix from zarr format parsing --- tests/v3/test_metadata/test_v2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index e06e345a6..4465a8647 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -12,17 +12,17 @@ import pytest from zarr.codecs import GzipCodec -from zarr.core.metadata.v2 import parse_zarr_format_v2 +from zarr.core.metadata.v2 import parse_zarr_format def test_parse_zarr_format_valid() -> None: - assert parse_zarr_format_v2(2) == 2 + assert parse_zarr_format(2) == 2 @pytest.mark.parametrize("data", [None, 1, 3, 4, 5, "3"]) def test_parse_zarr_format_invalid(data: Any) -> None: with pytest.raises(ValueError, match=f"Invalid value. Expected 2. Got {data}"): - parse_zarr_format_v2(data) + parse_zarr_format(data) @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}])