Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor metadata into metadata.v2 and metadata.v3 modules #2059

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
get_indexer,
morton_order_iter,
)
from zarr.core.metadata import parse_codecs
from zarr.core.metadata.v3 import parse_codecs
from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec

if TYPE_CHECKING:
Expand Down
3 changes: 2 additions & 1 deletion src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
is_scalar,
pop_fields,
)
from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
from zarr.core.sync import sync
from zarr.registry import get_pipeline_class
from zarr.store import StoreLike, StorePath, make_store_path
Expand All @@ -67,6 +67,7 @@
from collections.abc import Iterable

from zarr.abc.codec import Codec, CodecPipeline
from zarr.core.metadata.common import ArrayMetadata

# Array and AsyncArray are defined in the base ``zarr`` namespace
__all__ = ["parse_array_metadata", "create_codec_pipeline"]
Expand Down
4 changes: 4 additions & 0 deletions src/zarr/core/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Re-export the per-format metadata classes so callers can write
# ``from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata``
# without knowing the submodule layout.
from .v2 import ArrayV2Metadata
from .v3 import ArrayV3Metadata

__all__ = ["ArrayV3Metadata", "ArrayV2Metadata"]
70 changes: 70 additions & 0 deletions src/zarr/core/metadata/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any, Literal

import numpy as np
from typing_extensions import Self

from zarr.core.array_spec import ArraySpec
from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.chunk_grids import ChunkGrid
from zarr.core.common import JSON, ChunkCoords, ZarrFormat

from abc import ABC, abstractmethod
from dataclasses import dataclass

from zarr.abc.metadata import Metadata


@dataclass(frozen=True, kw_only=True)
class ArrayMetadata(Metadata, ABC):
    """
    Abstract base class shared by the Zarr v2 and v3 array metadata classes.

    Declares the fields common to both formats plus the abstract hooks the
    storage/codec machinery needs: chunk specs, chunk-key encoding, and
    serialization of the metadata document(s) to store buffers.
    """

    # Overall array shape.
    shape: ChunkCoords
    # Fill value for the array; concrete subclasses constrain its type.
    fill_value: Any
    # Describes how the array is partitioned into chunks.
    chunk_grid: ChunkGrid
    # Arbitrary user attributes attached to the array.
    attributes: dict[str, JSON]
    # Which Zarr format version this metadata document uses (2 or 3).
    zarr_format: ZarrFormat

    @property
    @abstractmethod
    def dtype(self) -> np.dtype[Any]:
        """The numpy dtype of the array."""
        pass

    @property
    @abstractmethod
    def ndim(self) -> int:
        """Number of dimensions of the array."""
        pass

    @abstractmethod
    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """Return the ArraySpec for the chunk at ``_chunk_coords``."""
        pass

    @abstractmethod
    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """Encode chunk coordinates into the store key for that chunk."""
        pass

    @abstractmethod
    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """Serialize this metadata to a mapping of store keys to buffers."""
        pass

    @abstractmethod
    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with ``shape`` replaced."""
        pass

    @abstractmethod
    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with ``attributes`` replaced."""
        pass


def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
    """
    Coerce ``None`` into an empty attributes dict; any other value is
    returned as-is (no validation is performed).
    """
    return {} if data is None else data
229 changes: 229 additions & 0 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
import numpy.typing as npt
from typing_extensions import Self

from zarr.core.buffer import Buffer, BufferPrototype

import json
from dataclasses import dataclass, field, replace
from typing import Any, Literal

import numpy as np

from zarr.core.array_spec import ArraySpec
from zarr.core.chunk_grids import RegularChunkGrid
from zarr.core.chunk_key_encodings import parse_separator
from zarr.core.common import (
JSON,
ZARRAY_JSON,
ZATTRS_JSON,
ChunkCoords,
parse_dtype,
parse_shapelike,
)
from zarr.core.config import config, parse_indexing_order
from zarr.core.metadata.common import ArrayMetadata, parse_attributes


@dataclass(frozen=True, kw_only=True)
class ArrayV2Metadata(ArrayMetadata):
    """
    Metadata document for a Zarr format-2 array: the ``.zarray`` fields plus
    the user attributes that v2 stores separately in ``.zattrs``.
    """

    shape: ChunkCoords
    chunk_grid: RegularChunkGrid
    data_type: np.dtype[Any]
    fill_value: None | int | float = 0
    order: Literal["C", "F"] = "C"
    filters: list[dict[str, JSON]] | None = None
    dimension_separator: Literal[".", "/"] = "."
    compressor: dict[str, JSON] | None = None
    attributes: dict[str, JSON] = field(default_factory=dict)
    # Format version is fixed: not accepted by __init__, always 2.
    zarr_format: Literal[2] = field(init=False, default=2)

    def __init__(
        self,
        *,
        shape: ChunkCoords,
        dtype: npt.DTypeLike,
        chunks: ChunkCoords,
        fill_value: Any,
        order: Literal["C", "F"],
        dimension_separator: Literal[".", "/"] = ".",
        compressor: dict[str, JSON] | None = None,
        filters: list[dict[str, JSON]] | None = None,
        attributes: dict[str, JSON] | None = None,
    ):
        """
        Metadata for a Zarr version 2 array.

        Every argument is normalized by a ``parse_*`` helper before being
        stored; ``chunks`` is wrapped in a :class:`RegularChunkGrid`.
        Raises ``ValueError`` if any value fails validation or if ``shape``
        and ``chunks`` have different lengths (see ``parse_metadata``).
        """
        shape_parsed = parse_shapelike(shape)
        data_type_parsed = parse_dtype(dtype)
        chunks_parsed = parse_shapelike(chunks)
        compressor_parsed = parse_compressor(compressor)
        order_parsed = parse_indexing_order(order)
        dimension_separator_parsed = parse_separator(dimension_separator)
        filters_parsed = parse_filters(filters)
        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
        attributes_parsed = parse_attributes(attributes)

        # The dataclass is frozen, so assignments must bypass __setattr__.
        object.__setattr__(self, "shape", shape_parsed)
        object.__setattr__(self, "data_type", data_type_parsed)
        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
        object.__setattr__(self, "compressor", compressor_parsed)
        object.__setattr__(self, "order", order_parsed)
        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
        object.__setattr__(self, "filters", filters_parsed)
        object.__setattr__(self, "fill_value", fill_value_parsed)
        object.__setattr__(self, "attributes", attributes_parsed)

        # ensure that the metadata document is consistent
        _ = parse_metadata(self)

    @property
    def ndim(self) -> int:
        """Number of dimensions of the array."""
        return len(self.shape)

    @property
    def dtype(self) -> np.dtype[Any]:
        """The numpy dtype of the array (alias of ``data_type``)."""
        return self.data_type

    @property
    def chunks(self) -> ChunkCoords:
        """Shape of a single chunk, taken from the regular chunk grid."""
        return self.chunk_grid.chunk_shape

    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """
        Serialize to the two Zarr-v2 store documents: ``.zarray`` (array
        metadata, minus attributes) and ``.zattrs`` (user attributes).
        """

        def _json_convert(
            o: Any,
        ) -> Any:
            # json.dumps fallback: numpy dtypes and scalars are not
            # natively JSON-serializable.
            if isinstance(o, np.dtype):
                if o.fields is None:
                    return o.str
                else:
                    return o.descr
            if np.isscalar(o):
                # convert numpy scalar to python type, and pass
                # python types through
                return getattr(o, "item", lambda: o)()
            raise TypeError

        zarray_dict = self.to_dict()
        assert isinstance(zarray_dict, dict)
        # Attributes live in the separate .zattrs document in Zarr v2.
        zattrs_dict = zarray_dict.pop("attributes", {})
        assert isinstance(zattrs_dict, dict)
        json_indent = config.get("json_indent")
        return {
            ZARRAY_JSON: prototype.buffer.from_bytes(
                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
            ),
            ZATTRS_JSON: prototype.buffer.from_bytes(
                json.dumps(zattrs_dict, indent=json_indent).encode()
            ),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
        """Construct from a parsed metadata dict; requires a valid ``zarr_format`` key."""
        # make a copy to protect the original from modification
        _data = data.copy()
        # check that the zarr_format attribute is correct
        _ = parse_zarr_format(_data.pop("zarr_format"))
        return cls(**_data)

    def to_dict(self) -> JSON:
        """
        Return a JSON-style dict, translating internal field names back to
        the legacy Zarr-v2 document keys (``chunks``, ``dtype``).
        """
        zarray_dict = super().to_dict()

        assert isinstance(zarray_dict, dict)

        # Replace the internal chunk_grid object with the v2 "chunks" key.
        _ = zarray_dict.pop("chunk_grid")
        zarray_dict["chunks"] = self.chunk_grid.chunk_shape

        # Replace data_type with the v2 "dtype" key (numpy dtype string).
        _ = zarray_dict.pop("data_type")
        zarray_dict["dtype"] = self.data_type.str

        return zarray_dict

    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """
        Return the ArraySpec shared by all chunks. ``_chunk_coords`` is
        ignored: on a regular grid every chunk has the same shape.
        """
        return ArraySpec(
            shape=self.chunk_grid.chunk_shape,
            dtype=self.dtype,
            fill_value=self.fill_value,
            order=order,
            prototype=prototype,
        )

    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """Join chunk coordinates with the dimension separator; ``"0"`` for 0-d arrays."""
        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
        return "0" if chunk_identifier == "" else chunk_identifier

    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with ``shape`` replaced."""
        return replace(self, shape=shape)

    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with ``attributes`` replaced."""
        return replace(self, attributes=attributes)


def parse_zarr_format(data: Literal[2]) -> Literal[2]:
    """
    Validate that ``data`` identifies Zarr format 2.

    Returns ``data`` unchanged when it equals 2; raises ``ValueError``
    otherwise.
    """
    if data != 2:
        raise ValueError(f"Invalid value. Expected 2. Got {data}.")
    return data


def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
    """
    Accept a filters specification unchanged.

    NOTE(review): currently a pure pass-through — no validation is performed.
    """
    return data


def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
    """
    Accept a compressor configuration unchanged.

    NOTE(review): currently a pure pass-through — no validation is performed.
    """
    return data


def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
    """
    Check that a metadata document is internally consistent.

    ``shape`` and ``chunks`` must have the same number of dimensions;
    raises ``ValueError`` otherwise, else returns ``data`` unchanged.
    """
    n_chunks = len(data.chunks)
    n_dims = len(data.shape)
    if n_chunks == n_dims:
        return data
    msg = (
        f"The `shape` and `chunks` attributes must have the same length. "
        f"`chunks` has length {n_chunks}, but `shape` has length {n_dims}."
    )
    raise ValueError(msg)


def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
    """
    Parse a fill value, given a dtype.

    This is copied from the `normalize_fill_value` function from zarr-python 2.x.

    Parameters
    ----------
    fill_value : Any
        The candidate fill value. ``None`` (or anything paired with an
        object dtype) is passed through unchanged.
    dtype : np.dtype[Any]
        The dtype the fill value must be compatible with.

    Returns
    -------
    Any
        ``None``, a plain ``str`` for unicode dtypes, or a numpy scalar
        of ``dtype``.

    Raises
    ------
    ValueError
        If ``fill_value`` cannot be represented as a scalar of ``dtype``.
    """
    if fill_value is None or dtype.hasobject:
        # no fill value
        pass
    elif not isinstance(fill_value, np.void) and fill_value == 0:
        # this should be compatible across numpy versions for any array type, including
        # structured arrays
        fill_value = np.zeros((), dtype=dtype)[()]

    elif dtype.kind == "U":
        # special case unicode because of encoding issues on Windows if passed through numpy
        # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713

        if not isinstance(fill_value, str):
            # BUGFIX: the two message fragments previously concatenated without
            # a separating space ("...dtype <U3.Expected a unicode string...").
            msg = (
                f"fill_value {fill_value!r} is not valid for dtype {dtype}. "
                f"Expected a unicode string, got {type(fill_value)}."
            )
            raise ValueError(msg)

    else:
        try:
            if isinstance(fill_value, bytes) and dtype.kind == "V":
                # special case for numpy 1.14 compatibility
                fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()]
            else:
                fill_value = np.array(fill_value, dtype=dtype)[()]

        except Exception as e:
            msg = f"fill_value {fill_value!r} is not valid for dtype {dtype}"
            raise ValueError(msg) from e

    return fill_value
Loading