-
-
Notifications
You must be signed in to change notification settings - Fork 273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor metadata into metadata.v2 and metadata.v3 modules
#2059
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from .v2 import ArrayV2Metadata | ||
from .v3 import ArrayV3Metadata | ||
|
||
__all__ = ["ArrayV3Metadata", "ArrayV2Metadata"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
if TYPE_CHECKING: | ||
from typing import Any, Literal | ||
|
||
from typing_extensions import Self | ||
|
||
from abc import ABC, abstractmethod | ||
from dataclasses import dataclass | ||
|
||
import numpy as np | ||
|
||
from zarr.abc.metadata import Metadata | ||
from zarr.array_spec import ArraySpec | ||
from zarr.buffer import Buffer, BufferPrototype | ||
from zarr.chunk_grids import ChunkGrid | ||
from zarr.common import JSON, ChunkCoords, ZarrFormat | ||
|
||
|
||
@dataclass(frozen=True, kw_only=True)
class ArrayMetadata(Metadata, ABC):
    """
    Abstract base class for array metadata documents.

    Declares the fields and the operations that every concrete array
    metadata class must provide. All fields are keyword-only and instances
    are frozen.
    """

    shape: ChunkCoords
    fill_value: Any
    chunk_grid: ChunkGrid
    attributes: dict[str, JSON]
    zarr_format: ZarrFormat

    @property
    @abstractmethod
    def dtype(self) -> np.dtype[Any]:
        """Return the numpy dtype described by this metadata."""
        ...

    @property
    @abstractmethod
    def ndim(self) -> int:
        """Return the number of dimensions."""
        ...

    @abstractmethod
    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """Return the `ArraySpec` for the chunk at `_chunk_coords`."""
        ...

    @abstractmethod
    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """Return the string key for the chunk at `chunk_coords`."""
        ...

    @abstractmethod
    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """Serialize this metadata to a mapping from string keys to buffers."""
        ...

    @abstractmethod
    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return metadata of the same type with `shape` substituted."""
        ...

    @abstractmethod
    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return metadata of the same type with `attributes` substituted."""
        ...
|
||
|
||
def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
    """
    Return an attributes mapping, substituting a new empty dict for `None`.

    Any value other than `None` is returned unchanged (the same object, not a copy).
    """
    return {} if data is None else data
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this file also get an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm open to this, but I think it would only affect cases where people did |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
if TYPE_CHECKING: | ||
import numpy.typing as npt | ||
from typing_extensions import Self | ||
|
||
import json | ||
from dataclasses import dataclass, field, replace | ||
from typing import Any, Literal | ||
|
||
import numpy as np | ||
|
||
from zarr.array_spec import ArraySpec | ||
from zarr.buffer import Buffer, BufferPrototype | ||
from zarr.chunk_grids import RegularChunkGrid | ||
from zarr.chunk_key_encodings import parse_separator | ||
from zarr.common import JSON, ZARRAY_JSON, ZATTRS_JSON, ChunkCoords, parse_dtype, parse_shapelike | ||
from zarr.config import config, parse_indexing_order | ||
from zarr.metadata.common import ArrayMetadata, parse_attributes | ||
|
||
|
||
@dataclass(frozen=True, kw_only=True)
class ArrayV2Metadata(ArrayMetadata):
    """
    Metadata for a Zarr version 2 array.

    The dataclass fields store the parsed values (`data_type`, `chunk_grid`),
    while the hand-written `__init__` accepts the Zarr v2 document names
    (`dtype`, `chunks`). Because the two sets of names differ, copies must be
    built through `__init__` rather than `dataclasses.replace`.
    """

    shape: ChunkCoords
    chunk_grid: RegularChunkGrid
    data_type: np.dtype[Any]
    fill_value: None | int | float = 0
    order: Literal["C", "F"] = "C"
    filters: list[dict[str, JSON]] | None = None
    dimension_separator: Literal[".", "/"] = "."
    compressor: dict[str, JSON] | None = None
    attributes: dict[str, JSON] = field(default_factory=dict)
    zarr_format: Literal[2] = field(init=False, default=2)

    def __init__(
        self,
        *,
        shape: ChunkCoords,
        dtype: npt.DTypeLike,
        chunks: ChunkCoords,
        fill_value: Any,
        order: Literal["C", "F"],
        dimension_separator: Literal[".", "/"] = ".",
        compressor: dict[str, JSON] | None = None,
        filters: list[dict[str, JSON]] | None = None,
        attributes: dict[str, JSON] | None = None,
    ):
        """
        Metadata for a Zarr version 2 array.

        Every argument is passed through its `parse_*` helper before being
        stored. `chunks` is wrapped in a `RegularChunkGrid` and stored as
        `chunk_grid`; `dtype` is stored as `data_type`.

        Raises
        ------
        ValueError
            If `shape` and `chunks` do not have the same length (raised by
            `parse_metadata`), or if `fill_value` is rejected by
            `parse_fill_value`.
        """
        shape_parsed = parse_shapelike(shape)
        data_type_parsed = parse_dtype(dtype)
        chunks_parsed = parse_shapelike(chunks)
        compressor_parsed = parse_compressor(compressor)
        order_parsed = parse_indexing_order(order)
        dimension_separator_parsed = parse_separator(dimension_separator)
        filters_parsed = parse_filters(filters)
        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
        attributes_parsed = parse_attributes(attributes)

        # The dataclass is frozen, so fields are set via object.__setattr__.
        object.__setattr__(self, "shape", shape_parsed)
        object.__setattr__(self, "data_type", data_type_parsed)
        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
        object.__setattr__(self, "compressor", compressor_parsed)
        object.__setattr__(self, "order", order_parsed)
        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
        object.__setattr__(self, "filters", filters_parsed)
        object.__setattr__(self, "fill_value", fill_value_parsed)
        object.__setattr__(self, "attributes", attributes_parsed)

        # ensure that the metadata document is consistent
        _ = parse_metadata(self)

    @property
    def ndim(self) -> int:
        """Number of dimensions, i.e. the length of `shape`."""
        return len(self.shape)

    @property
    def dtype(self) -> np.dtype[Any]:
        """The parsed numpy dtype (alias of the `data_type` field)."""
        return self.data_type

    @property
    def chunks(self) -> ChunkCoords:
        """The chunk shape held by `chunk_grid`."""
        return self.chunk_grid.chunk_shape

    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """
        Serialize this metadata to JSON buffers.

        Returns a dict with two entries: `ZARRAY_JSON` holds every key of
        `to_dict()` except `"attributes"`, and `ZATTRS_JSON` holds the
        attributes. Indentation is read from the `json_indent` config value.
        """

        def _json_convert(
            o: Any,
        ) -> Any:
            # Fallback encoder for values the json module cannot serialize itself.
            if isinstance(o, np.dtype):
                if o.fields is None:
                    return o.str
                else:
                    return o.descr
            if np.isscalar(o):
                # convert numpy scalar to python type, and pass
                # python types through
                return getattr(o, "item", lambda: o)()
            raise TypeError

        zarray_dict = self.to_dict()
        assert isinstance(zarray_dict, dict)
        zattrs_dict = zarray_dict.pop("attributes", {})
        assert isinstance(zattrs_dict, dict)
        json_indent = config.get("json_indent")
        return {
            ZARRAY_JSON: prototype.buffer.from_bytes(
                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
            ),
            ZATTRS_JSON: prototype.buffer.from_bytes(
                json.dumps(zattrs_dict, indent=json_indent).encode()
            ),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
        """
        Build an instance from a metadata dict.

        `data` must contain a `"zarr_format"` key equal to 2; the remaining
        keys are forwarded to `__init__` as keyword arguments. `data` itself
        is not modified.
        """
        # make a copy to protect the original from modification
        _data = data.copy()
        # check that the zarr_format attribute is correct
        _ = parse_zarr_format(_data.pop("zarr_format"))
        return cls(**_data)

    def to_dict(self) -> JSON:
        """
        Return the metadata as a dict using the Zarr v2 key names.

        The `chunk_grid` entry is replaced by `"chunks"` (the chunk shape) and
        the `data_type` entry by `"dtype"` (the dtype's `.str`).
        """
        zarray_dict = super().to_dict()

        assert isinstance(zarray_dict, dict)

        _ = zarray_dict.pop("chunk_grid")
        zarray_dict["chunks"] = self.chunk_grid.chunk_shape

        _ = zarray_dict.pop("data_type")
        # NOTE(review): `.str` carries no field information for structured
        # dtypes, whereas `_json_convert` above would emit `.descr` -- confirm
        # whether structured dtypes need to round-trip here.
        zarray_dict["dtype"] = self.data_type.str

        return zarray_dict

    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """
        Return the `ArraySpec` for a chunk.

        `_chunk_coords` is unused: every chunk gets the same spec, built from
        the chunk shape, dtype and fill value of this metadata plus the given
        `order` and `prototype`.
        """
        return ArraySpec(
            shape=self.chunk_grid.chunk_shape,
            dtype=self.dtype,
            fill_value=self.fill_value,
            order=order,
            prototype=prototype,
        )

    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """
        Join `chunk_coords` with `dimension_separator` to form a chunk key.

        Empty coordinates produce the key `"0"`.
        """
        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
        return "0" if chunk_identifier == "" else chunk_identifier

    def _with_changes(self, **changes: Any) -> Self:
        """
        Return a new instance with the given `__init__` arguments overridden.

        `dataclasses.replace` cannot be used here: it calls the constructor
        with the dataclass field names (`chunk_grid`, `data_type`), which the
        custom keyword-only `__init__` does not accept, so it raises
        `TypeError`. This helper maps the stored fields back onto the
        `__init__` parameter names instead.
        """
        init_kwargs: dict[str, Any] = {
            "shape": self.shape,
            "dtype": self.data_type,
            "chunks": self.chunks,
            "fill_value": self.fill_value,
            "order": self.order,
            "dimension_separator": self.dimension_separator,
            "compressor": self.compressor,
            "filters": self.filters,
            "attributes": self.attributes,
        }
        init_kwargs.update(changes)
        return type(self)(**init_kwargs)

    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with `shape` replaced."""
        return self._with_changes(shape=shape)

    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with `attributes` replaced."""
        return self._with_changes(attributes=attributes)
|
||
|
||
def parse_zarr_format(data: Literal[2]) -> Literal[2]:
    """
    Validate a `zarr_format` value for a version 2 metadata document.

    Returns `data` unchanged when it equals 2, otherwise raises `ValueError`.
    """
    if data != 2:
        raise ValueError(f"Invalid value. Expected 2. Got {data}.")
    return data
|
||
|
||
def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
    """Return the filters specification unchanged; no validation is performed."""
    return data
|
||
|
||
def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
    """Return the compressor specification unchanged; no validation is performed."""
    return data
|
||
|
||
def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
    """
    Check that `chunks` and `shape` of a metadata object have the same length.

    Returns `data` unchanged when the lengths match, otherwise raises
    `ValueError` reporting both lengths.
    """
    n_chunk_dims = len(data.chunks)
    n_shape_dims = len(data.shape)
    if n_chunk_dims == n_shape_dims:
        return data

    raise ValueError(
        "The `shape` and `chunks` attributes must have the same length. "
        f"`chunks` has length {n_chunk_dims}, but `shape` has length {n_shape_dims}."
    )
|
||
|
||
def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
    """
    Parse a fill value, given a dtype.

    This is copied from the `normalize_fill_value` function from zarr-python 2.x.

    Parameters
    ----------
    fill_value : Any
        The requested fill value.
    dtype : np.dtype
        The dtype the fill value must be compatible with.

    Returns
    -------
    Any
        `fill_value` unchanged when it is `None`, when `dtype` contains
        objects, or when `dtype` is a unicode dtype and `fill_value` is a
        `str`; the zero scalar of `dtype` when `fill_value` compares equal to
        0; otherwise `fill_value` converted to a scalar of `dtype`.

    Raises
    ------
    ValueError
        If `dtype` is a unicode dtype and `fill_value` is not a `str`, or if
        `fill_value` cannot be converted to `dtype`.
    """
    if fill_value is None or dtype.hasobject:
        # no fill value
        pass
    elif not isinstance(fill_value, np.void) and fill_value == 0:
        # this should be compatible across numpy versions for any array type, including
        # structured arrays
        fill_value = np.zeros((), dtype=dtype)[()]

    elif dtype.kind == "U":
        # special case unicode because of encoding issues on Windows if passed through numpy
        # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713

        if not isinstance(fill_value, str):
            # The first fragment ends with a space so the two sentences do not
            # run together in the final message.
            msg = (
                f"fill_value {fill_value!r} is not valid for dtype {dtype}. "
                f"Expected a unicode string, got {type(fill_value)}."
            )
            raise ValueError(msg)

    else:
        try:
            if isinstance(fill_value, bytes) and dtype.kind == "V":
                # special case for numpy 1.14 compatibility
                fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()]
            else:
                fill_value = np.array(fill_value, dtype=dtype)[()]

        except Exception as e:
            msg = f"fill_value {fill_value!r} is not valid for dtype {dtype}"
            raise ValueError(msg) from e

    return fill_value
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I slightly favor reverting this change as a way to future proof this a bit.