-
-
Notifications
You must be signed in to change notification settings - Fork 273
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: split metadata into v2 and v3 modules (#2163)
* refactor: split metadata into v2 and v3 modules * add more explicit typeguards * port fill value normalization from v2 * remove v2 suffix from zarr format parsing
- Loading branch information
Showing
9 changed files
with
448 additions
and
385 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from .v2 import ArrayV2Metadata | ||
from .v3 import ArrayV3Metadata | ||
|
||
__all__ = ["ArrayV2Metadata", "ArrayV3Metadata"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
if TYPE_CHECKING: | ||
from typing import Any, Literal | ||
|
||
import numpy as np | ||
from typing_extensions import Self | ||
|
||
from zarr.core.array_spec import ArraySpec | ||
from zarr.core.buffer import Buffer, BufferPrototype | ||
from zarr.core.chunk_grids import ChunkGrid | ||
from zarr.core.common import JSON, ChunkCoords, ZarrFormat | ||
|
||
from abc import ABC, abstractmethod | ||
from dataclasses import dataclass | ||
|
||
from zarr.abc.metadata import Metadata | ||
|
||
|
||
@dataclass(frozen=True, kw_only=True)
class ArrayMetadata(Metadata, ABC):
    """Abstract base for Zarr array metadata, shared by the v2 and v3 formats."""

    # Fields common to both metadata versions.
    shape: ChunkCoords
    fill_value: Any
    chunk_grid: ChunkGrid
    attributes: dict[str, JSON]
    zarr_format: ZarrFormat

    @property
    @abstractmethod
    def dtype(self) -> np.dtype[Any]:
        """Numpy dtype of the array's elements."""
        ...

    @property
    @abstractmethod
    def ndim(self) -> int:
        """Number of dimensions of the array."""
        ...

    @abstractmethod
    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """Return the ArraySpec describing a single chunk of this array."""
        ...

    @abstractmethod
    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """Map chunk grid coordinates to the storage key for that chunk."""
        ...

    @abstractmethod
    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """Serialize this metadata to a mapping of document name -> Buffer."""
        ...

    @abstractmethod
    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with a new shape."""
        ...

    @abstractmethod
    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with new attributes."""
        ...
|
||
|
||
def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
    """Normalize an optional attributes mapping, treating ``None`` as an empty dict."""
    return {} if data is None else data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,235 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
if TYPE_CHECKING: | ||
from typing import Any, Literal | ||
|
||
import numpy.typing as npt | ||
from typing_extensions import Self | ||
|
||
from zarr.core.buffer import Buffer, BufferPrototype | ||
from zarr.core.common import JSON, ChunkCoords | ||
|
||
import json | ||
from dataclasses import dataclass, field, replace | ||
|
||
import numpy as np | ||
|
||
from zarr.core.array_spec import ArraySpec | ||
from zarr.core.chunk_grids import RegularChunkGrid | ||
from zarr.core.chunk_key_encodings import parse_separator | ||
from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike | ||
from zarr.core.config import config, parse_indexing_order | ||
from zarr.core.metadata.common import ArrayMetadata, parse_attributes | ||
|
||
|
||
@dataclass(frozen=True, kw_only=True)
class ArrayV2Metadata(ArrayMetadata):
    """
    Metadata for a Zarr format-2 array, i.e. the contents of the ``.zarray``
    and ``.zattrs`` documents.
    """

    shape: ChunkCoords
    # v2 arrays always use a regular (uniform) chunk grid.
    chunk_grid: RegularChunkGrid
    data_type: np.dtype[Any]
    fill_value: None | int | float = 0
    order: Literal["C", "F"] = "C"
    filters: list[dict[str, JSON]] | None = None
    dimension_separator: Literal[".", "/"] = "."
    compressor: dict[str, JSON] | None = None
    attributes: dict[str, JSON] = field(default_factory=dict)
    # Fixed at 2; excluded from the generated constructor (init=False).
    zarr_format: Literal[2] = field(init=False, default=2)

    def __init__(
        self,
        *,
        shape: ChunkCoords,
        dtype: npt.DTypeLike,
        chunks: ChunkCoords,
        fill_value: Any,
        order: Literal["C", "F"],
        dimension_separator: Literal[".", "/"] = ".",
        compressor: dict[str, JSON] | None = None,
        filters: list[dict[str, JSON]] | None = None,
        attributes: dict[str, JSON] | None = None,
    ):
        """
        Metadata for a Zarr version 2 array.

        Every argument is normalized through one of the module's ``parse_*``
        helpers before being stored; ``chunks`` is wrapped in a
        ``RegularChunkGrid``. Raises ``ValueError`` if validation fails,
        including when ``chunks`` and ``shape`` have different lengths
        (see ``parse_metadata``).
        """
        shape_parsed = parse_shapelike(shape)
        data_type_parsed = parse_dtype(dtype)
        chunks_parsed = parse_shapelike(chunks)
        compressor_parsed = parse_compressor(compressor)
        order_parsed = parse_indexing_order(order)
        dimension_separator_parsed = parse_separator(dimension_separator)
        filters_parsed = parse_filters(filters)
        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
        attributes_parsed = parse_attributes(attributes)

        # The dataclass is frozen, so fields must be set via object.__setattr__.
        object.__setattr__(self, "shape", shape_parsed)
        object.__setattr__(self, "data_type", data_type_parsed)
        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
        object.__setattr__(self, "compressor", compressor_parsed)
        object.__setattr__(self, "order", order_parsed)
        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
        object.__setattr__(self, "filters", filters_parsed)
        object.__setattr__(self, "fill_value", fill_value_parsed)
        object.__setattr__(self, "attributes", attributes_parsed)

        # ensure that the metadata document is consistent
        _ = parse_metadata(self)

    @property
    def ndim(self) -> int:
        """Number of dimensions of the array."""
        return len(self.shape)

    @property
    def dtype(self) -> np.dtype[Any]:
        """Numpy dtype of the array (alias for ``data_type``)."""
        return self.data_type

    @property
    def chunks(self) -> ChunkCoords:
        """Shape of a single chunk, taken from the regular chunk grid."""
        return self.chunk_grid.chunk_shape

    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """
        Serialize this metadata to ``{".zarray": ..., ".zattrs": ...}`` buffers.

        Attributes are split out of the metadata dict into the separate
        ``.zattrs`` document, matching the v2 on-disk layout.
        """

        def _json_convert(
            o: Any,
        ) -> Any:
            # JSON fallback encoder: dtypes serialize to their string form
            # (or field descr for structured dtypes), numpy scalars to
            # plain Python values.
            if isinstance(o, np.dtype):
                if o.fields is None:
                    return o.str
                else:
                    return o.descr
            if np.isscalar(o):
                # convert numpy scalar to python type, and pass
                # python types through
                return getattr(o, "item", lambda: o)()
            raise TypeError

        zarray_dict = self.to_dict()

        # todo: remove this check when we can ensure that to_dict always returns dicts.
        if not isinstance(zarray_dict, dict):
            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")

        zattrs_dict = zarray_dict.pop("attributes", {})
        json_indent = config.get("json_indent")
        return {
            ZARRAY_JSON: prototype.buffer.from_bytes(
                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
            ),
            ZATTRS_JSON: prototype.buffer.from_bytes(
                json.dumps(zattrs_dict, indent=json_indent).encode()
            ),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
        """Build an instance from a parsed ``.zarray`` dict; ``zarr_format`` must be 2."""
        # make a copy to protect the original from modification
        _data = data.copy()
        # check that the zarr_format attribute is correct
        _ = parse_zarr_format(_data.pop("zarr_format"))
        return cls(**_data)

    def to_dict(self) -> JSON:
        """Convert to a v2-style dict: ``chunk_grid`` becomes ``chunks``, ``data_type`` becomes ``dtype``."""
        zarray_dict = super().to_dict()

        # todo: remove this check when we can ensure that to_dict always returns dicts.
        if not isinstance(zarray_dict, dict):
            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")

        _ = zarray_dict.pop("chunk_grid")
        zarray_dict["chunks"] = self.chunk_grid.chunk_shape

        _ = zarray_dict.pop("data_type")
        zarray_dict["dtype"] = self.data_type.str

        return zarray_dict

    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """
        Return the ArraySpec for one chunk. The chunk coordinates are unused
        here because every chunk of a regular grid has the same shape.
        """
        return ArraySpec(
            shape=self.chunk_grid.chunk_shape,
            dtype=self.dtype,
            fill_value=self.fill_value,
            order=order,
            prototype=prototype,
        )

    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """Join chunk coordinates with the dimension separator; "0" for the 0-d case."""
        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
        return "0" if chunk_identifier == "" else chunk_identifier

    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with ``shape`` replaced."""
        return replace(self, shape=shape)

    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with ``attributes`` replaced."""
        return replace(self, attributes=attributes)
|
||
|
||
def parse_zarr_format(data: Literal[2]) -> Literal[2]:
    """Validate that ``data`` is the zarr format version 2; raise ``ValueError`` otherwise."""
    if data != 2:
        raise ValueError(f"Invalid value. Expected 2. Got {data}.")
    return data
|
||
|
||
def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
    """Normalize the v2 ``filters`` field.

    Currently a pass-through placeholder: no validation is performed and the
    input is returned unchanged.
    """
    return data
|
||
|
||
def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
    """Normalize the v2 ``compressor`` field.

    Currently a pass-through placeholder: no validation is performed and the
    input is returned unchanged.
    """
    return data
|
||
|
||
def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
    """Check cross-field consistency: ``chunks`` and ``shape`` must have equal rank."""
    n_chunks = len(data.chunks)
    n_shape = len(data.shape)
    if n_chunks != n_shape:
        raise ValueError(
            f"The `shape` and `chunks` attributes must have the same length. "
            f"`chunks` has length {n_chunks}, but `shape` has length {n_shape}."
        )
    return data
|
||
|
||
def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
    """
    Parse a potential fill value into a value that is compatible with the provided dtype.

    Parameters
    ----------
    fill_value : Any
        A potential fill value.
    dtype : np.dtype[Any]
        A numpy dtype.

    Returns
    -------
    An instance of `dtype`, or `None`, or any python object (in the case of an object dtype).

    Raises
    ------
    ValueError
        If the fill value cannot be represented in `dtype`.
    """

    # NOTE: branch order matters — the zero check must run before the
    # unicode check so that 0 is accepted for every numeric dtype.
    if fill_value is None or dtype.hasobject:
        # no fill value
        pass
    elif not isinstance(fill_value, np.void) and fill_value == 0:
        # this should be compatible across numpy versions for any array type, including
        # structured arrays
        fill_value = np.zeros((), dtype=dtype)[()]

    elif dtype.kind == "U":
        # special case unicode because of encoding issues on Windows if passed through numpy
        # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
        # (valid str fill values are returned unchanged, not coerced via numpy)

        if not isinstance(fill_value, str):
            raise ValueError(
                f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
            )
    else:
        try:
            if isinstance(fill_value, bytes) and dtype.kind == "V":
                # special case for numpy 1.14 compatibility
                fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()]
            else:
                # generic path: coerce through a 0-d array to a numpy scalar
                fill_value = np.array(fill_value, dtype=dtype)[()]

        except Exception as e:
            msg = f"Fill_value {fill_value} is not valid for dtype {dtype}."
            raise ValueError(msg) from e

    return fill_value
Oops, something went wrong.