Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: split metadata into v2 and v3 modules #2163

Merged
merged 8 commits into from
Sep 12, 2024
3 changes: 2 additions & 1 deletion src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from zarr.core.array import Array, AsyncArray
from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
from zarr.core.group import AsyncGroup
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.store import (
StoreLike,
make_store_path,
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
get_indexer,
morton_order_iter,
)
from zarr.core.metadata import parse_codecs
from zarr.core.metadata.v3 import parse_codecs
from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec

if TYPE_CHECKING:
Expand Down
4 changes: 3 additions & 1 deletion src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@
is_scalar,
pop_fields,
)
from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.core.sync import sync
from zarr.registry import get_pipeline_class
from zarr.store import StoreLike, StorePath, make_store_path
Expand All @@ -67,6 +68,7 @@
from collections.abc import Iterable

from zarr.abc.codec import Codec, CodecPipeline
from zarr.core.metadata.common import ArrayMetadata

# Array and AsyncArray are defined in the base ``zarr`` namespace
__all__ = ["parse_array_metadata", "create_codec_pipeline"]
Expand Down
4 changes: 4 additions & 0 deletions src/zarr/core/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .v2 import ArrayV2Metadata
from .v3 import ArrayV3Metadata

__all__ = ["ArrayV2Metadata", "ArrayV3Metadata"]
67 changes: 67 additions & 0 deletions src/zarr/core/metadata/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any, Literal

import numpy as np
from typing_extensions import Self

from zarr.core.array_spec import ArraySpec
from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.chunk_grids import ChunkGrid
from zarr.core.common import JSON, ChunkCoords, ZarrFormat

from abc import ABC, abstractmethod
from dataclasses import dataclass

from zarr.abc.metadata import Metadata


@dataclass(frozen=True, kw_only=True)
class ArrayMetadata(Metadata, ABC):
    """
    Abstract base class for array metadata documents.

    Declares the fields and the abstract properties/methods that every
    concrete array-metadata class must provide. Instances are frozen,
    keyword-only dataclasses.
    """

    # Fields shared by every array metadata flavour.
    shape: ChunkCoords
    fill_value: Any
    chunk_grid: ChunkGrid
    attributes: dict[str, JSON]
    zarr_format: ZarrFormat

    @property
    @abstractmethod
    def dtype(self) -> np.dtype[Any]:
        """The numpy dtype of the array (must be implemented by subclasses)."""
        ...

    @property
    @abstractmethod
    def ndim(self) -> int:
        """The number of dimensions of the array (must be implemented by subclasses)."""
        ...

    @abstractmethod
    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """Return an ``ArraySpec`` for a chunk, given a memory order and buffer prototype."""
        ...

    @abstractmethod
    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """Return the string key corresponding to ``chunk_coords``."""
        ...

    @abstractmethod
    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """Return a mapping from string keys to buffers built with ``prototype``."""
        ...

    @abstractmethod
    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return metadata of the same type with ``shape`` as the new shape."""
        ...

    @abstractmethod
    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return metadata of the same type with ``attributes`` as the new attributes."""
        ...


def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
    """
    Normalize user attributes.

    Returns a new empty dict when ``data`` is ``None``; otherwise returns
    ``data`` itself, unmodified and uncopied.
    """
    return {} if data is None else data
205 changes: 205 additions & 0 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any, Literal

import numpy.typing as npt
from typing_extensions import Self

from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.common import JSON, ChunkCoords

import json
from dataclasses import dataclass, field, replace

import numpy as np

from zarr.core.array_spec import ArraySpec
from zarr.core.chunk_grids import RegularChunkGrid
from zarr.core.chunk_key_encodings import parse_separator
from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike
from zarr.core.config import config, parse_indexing_order
from zarr.core.metadata.common import ArrayMetadata, parse_attributes


@dataclass(frozen=True, kw_only=True)
class ArrayV2Metadata(ArrayMetadata):
    """
    Metadata for a Zarr version 2 array.

    The constructor accepts ``dtype`` and ``chunks``; these are stored on the
    instance as the ``data_type`` and ``chunk_grid`` fields respectively.
    """

    shape: ChunkCoords
    chunk_grid: RegularChunkGrid
    data_type: np.dtype[Any]
    fill_value: None | int | float = 0
    order: Literal["C", "F"] = "C"
    filters: list[dict[str, JSON]] | None = None
    dimension_separator: Literal[".", "/"] = "."
    compressor: dict[str, JSON] | None = None
    attributes: dict[str, JSON] = field(default_factory=dict)
    zarr_format: Literal[2] = field(init=False, default=2)

    def __init__(
        self,
        *,
        shape: ChunkCoords,
        dtype: npt.DTypeLike,
        chunks: ChunkCoords,
        fill_value: Any,
        order: Literal["C", "F"],
        dimension_separator: Literal[".", "/"] = ".",
        compressor: dict[str, JSON] | None = None,
        filters: list[dict[str, JSON]] | None = None,
        attributes: dict[str, JSON] | None = None,
    ):
        """
        Metadata for a Zarr version 2 array.

        Every argument is passed through its ``parse_*`` helper before being
        stored. Raises ``ValueError`` (via ``parse_metadata``) if ``shape``
        and ``chunks`` do not have the same length.
        """
        shape_parsed = parse_shapelike(shape)
        data_type_parsed = parse_dtype(dtype)
        chunks_parsed = parse_shapelike(chunks)
        compressor_parsed = parse_compressor(compressor)
        order_parsed = parse_indexing_order(order)
        dimension_separator_parsed = parse_separator(dimension_separator)
        filters_parsed = parse_filters(filters)
        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
        attributes_parsed = parse_attributes(attributes)

        # The dataclass is frozen, so fields must be set via object.__setattr__.
        object.__setattr__(self, "shape", shape_parsed)
        object.__setattr__(self, "data_type", data_type_parsed)
        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
        object.__setattr__(self, "compressor", compressor_parsed)
        object.__setattr__(self, "order", order_parsed)
        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
        object.__setattr__(self, "filters", filters_parsed)
        object.__setattr__(self, "fill_value", fill_value_parsed)
        object.__setattr__(self, "attributes", attributes_parsed)

        # ensure that the metadata document is consistent
        _ = parse_metadata(self)

    @property
    def ndim(self) -> int:
        """Number of dimensions, i.e. the length of ``shape``."""
        return len(self.shape)

    @property
    def dtype(self) -> np.dtype[Any]:
        """The array's numpy dtype (alias of the ``data_type`` field)."""
        return self.data_type

    @property
    def chunks(self) -> ChunkCoords:
        """The chunk shape held by ``chunk_grid``."""
        return self.chunk_grid.chunk_shape

    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
        """
        Serialize this metadata to JSON buffers.

        Returns a dict with two entries: ``ZARRAY_JSON`` holding the array
        metadata and ``ZATTRS_JSON`` holding the user attributes. Indentation
        is taken from the ``json_indent`` config value.
        """

        def _json_convert(
            o: Any,
        ) -> Any:
            # Fallback serializer for objects the json module cannot encode.
            if isinstance(o, np.dtype):
                if o.fields is None:
                    return o.str
                else:
                    return o.descr
            if np.isscalar(o):
                # convert numpy scalar to python type, and pass
                # python types through
                return getattr(o, "item", lambda: o)()
            raise TypeError

        zarray_dict = self.to_dict()
        assert isinstance(zarray_dict, dict)
        # Attributes are written to their own document, not into the array metadata.
        zattrs_dict = zarray_dict.pop("attributes", {})
        assert isinstance(zattrs_dict, dict)
        json_indent = config.get("json_indent")
        return {
            ZARRAY_JSON: prototype.buffer.from_bytes(
                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
            ),
            ZATTRS_JSON: prototype.buffer.from_bytes(
                json.dumps(zattrs_dict, indent=json_indent).encode()
            ),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
        """
        Build an instance from a metadata dict.

        ``data`` must contain a ``zarr_format`` key equal to 2; the remaining
        items are forwarded to the constructor as keyword arguments.
        """
        # make a copy to protect the original from modification
        _data = data.copy()
        # check that the zarr_format attribute is correct
        _ = parse_zarr_format_v2(_data.pop("zarr_format"))
        return cls(**_data)

    def to_dict(self) -> JSON:
        """
        Return the metadata as a dict using the constructor's key names.

        The ``chunk_grid`` and ``data_type`` fields are replaced by ``chunks``
        (the chunk shape) and ``dtype`` (the dtype's ``str`` code).
        """
        zarray_dict = super().to_dict()

        assert isinstance(zarray_dict, dict)

        _ = zarray_dict.pop("chunk_grid")
        zarray_dict["chunks"] = self.chunk_grid.chunk_shape

        _ = zarray_dict.pop("data_type")
        zarray_dict["dtype"] = self.data_type.str

        return zarray_dict

    def get_chunk_spec(
        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
    ) -> ArraySpec:
        """
        Return the ``ArraySpec`` for a chunk.

        ``_chunk_coords`` is not used: the spec is built from the chunk
        grid's chunk shape regardless of which chunk is requested.
        """
        return ArraySpec(
            shape=self.chunk_grid.chunk_shape,
            dtype=self.dtype,
            fill_value=self.fill_value,
            order=order,
            prototype=prototype,
        )

    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
        """
        Join ``chunk_coords`` with ``dimension_separator``.

        Empty coordinates produce the key ``"0"``.
        """
        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
        return "0" if chunk_identifier == "" else chunk_identifier

    def _with_changes(self, **changes: Any) -> Self:
        """
        Return a new instance with selected constructor arguments overridden.

        ``dataclasses.replace`` cannot be used here: it calls ``__init__``
        with the dataclass field names (``data_type``, ``chunk_grid``), but
        the hand-written ``__init__`` only accepts ``dtype`` and ``chunks``,
        so the call would fail with ``TypeError``. This helper translates the
        stored fields back into the keywords ``__init__`` actually takes.
        """
        init_kwargs = {
            "shape": self.shape,
            "dtype": self.data_type,
            "chunks": self.chunks,
            "fill_value": self.fill_value,
            "order": self.order,
            "dimension_separator": self.dimension_separator,
            "compressor": self.compressor,
            "filters": self.filters,
            "attributes": self.attributes,
        }
        init_kwargs.update(changes)
        return type(self)(**init_kwargs)

    def update_shape(self, shape: ChunkCoords) -> Self:
        """Return a copy of this metadata with ``shape`` replaced."""
        return self._with_changes(shape=shape)

    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        """Return a copy of this metadata with ``attributes`` replaced."""
        return self._with_changes(attributes=attributes)


def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]:
    """
    Validate that ``data`` is the Zarr format number 2.

    Returns ``data`` unchanged when it equals 2; raises ``ValueError``
    otherwise.
    """
    if data != 2:
        raise ValueError(f"Invalid value. Expected 2. Got {data}.")
    return data


def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
    """
    Parse the ``filters`` metadata entry.

    Currently a pass-through: no validation is performed and ``data`` is
    returned as-is.
    """
    filters = data
    return filters


def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
    """
    Parse the ``compressor`` metadata entry.

    Currently a pass-through: no validation is performed and ``data`` is
    returned as-is.
    """
    compressor = data
    return compressor


def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
    """
    Check that a metadata document is internally consistent.

    Returns ``data`` unchanged when ``data.chunks`` and ``data.shape`` have
    the same length; raises ``ValueError`` otherwise.
    """
    l_chunks = len(data.chunks)
    l_shape = len(data.shape)
    if l_chunks == l_shape:
        return data
    raise ValueError(
        f"The `shape` and `chunks` attributes must have the same length. "
        f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}."
    )


def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
    """
    Parse a potential fill value into a value that is compatible with the provided dtype.

    This is a light wrapper around zarr.v2.util.normalize_fill_value.

    Parameters
    ----------
    fill_value: Any
        A potential fill value.
    dtype: np.dtype[Any]
        A numpy dtype.

    Returns
    -------
    An instance of `dtype`, or `None`, or any python object (in the case of an object dtype)
    """
    # Imported inside the function rather than at module level, as in the original.
    from zarr.v2.util import normalize_fill_value

    return normalize_fill_value(fill_value=fill_value, dtype=dtype)
Loading