From 1c8d97959c3ed929508547754b8dfc217ddfd510 Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Mon, 9 Sep 2024 21:54:00 +0200
Subject: [PATCH 1/5] refactor: split metadata into v2 and v3 modules

---
 src/zarr/api/asynchronous.py                  |   3 +-
 src/zarr/codecs/sharding.py                   |   2 +-
 src/zarr/core/array.py                        |   4 +-
 src/zarr/core/metadata/__init__.py            |   4 +
 src/zarr/core/metadata/common.py              |  67 +++
 src/zarr/core/metadata/v2.py                  | 205 +++++++
 src/zarr/core/{metadata.py => metadata/v3.py} | 505 +++++-------------
 tests/v3/test_metadata/test_v2.py             |   4 +-
 tests/v3/test_metadata/test_v3.py             |   5 +-
 9 files changed, 416 insertions(+), 383 deletions(-)
 create mode 100644 src/zarr/core/metadata/__init__.py
 create mode 100644 src/zarr/core/metadata/common.py
 create mode 100644 src/zarr/core/metadata/v2.py
 rename src/zarr/core/{metadata.py => metadata/v3.py} (61%)

diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py
index 7f59517f3..a1c821696 100644
--- a/src/zarr/api/asynchronous.py
+++ b/src/zarr/api/asynchronous.py
@@ -10,7 +10,8 @@
 from zarr.core.array import Array, AsyncArray
 from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
 from zarr.core.group import AsyncGroup
-from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
+from zarr.core.metadata.v2 import ArrayV2Metadata
+from zarr.core.metadata.v3 import ArrayV3Metadata
 from zarr.store import (
     StoreLike,
     make_store_path,
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index 528110472..df7f5978a 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -44,7 +44,7 @@
     get_indexer,
     morton_order_iter,
 )
-from zarr.core.metadata import parse_codecs
+from zarr.core.metadata.v3 import parse_codecs
 from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec
 
 if TYPE_CHECKING:
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index dcd7217d7..7311b6eec 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -55,7 +55,8 @@
     is_scalar,
     pop_fields,
 )
-from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
+from zarr.core.metadata.v2 import ArrayV2Metadata
+from zarr.core.metadata.v3 import ArrayV3Metadata
 from zarr.core.sync import sync
 from zarr.registry import get_pipeline_class
 from zarr.store import StoreLike, StorePath, make_store_path
@@ -67,6 +68,7 @@
     from collections.abc import Iterable
 
     from zarr.abc.codec import Codec, CodecPipeline
+    from zarr.core.metadata.common import ArrayMetadata
 
 # Array and AsyncArray are defined in the base ``zarr`` namespace
 __all__ = ["parse_array_metadata", "create_codec_pipeline"]
diff --git a/src/zarr/core/metadata/__init__.py b/src/zarr/core/metadata/__init__.py
new file mode 100644
index 000000000..addf47339
--- /dev/null
+++ b/src/zarr/core/metadata/__init__.py
@@ -0,0 +1,4 @@
+from .v2 import ArrayV2Metadata
+from .v3 import ArrayV3Metadata
+
+__all__ = ["ArrayV2Metadata", "ArrayV3Metadata"]
diff --git a/src/zarr/core/metadata/common.py b/src/zarr/core/metadata/common.py
new file mode 100644
index 000000000..583375b4b
--- /dev/null
+++ b/src/zarr/core/metadata/common.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Any, Literal
+
+    import numpy as np
+    from typing_extensions import Self
+
+    from zarr.core.array_spec import ArraySpec
+    from zarr.core.buffer import Buffer, BufferPrototype
+    from zarr.core.chunk_grids import ChunkGrid
+    from zarr.core.common import JSON, ChunkCoords, ZarrFormat
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from zarr.abc.metadata import Metadata
+
+
+@dataclass(frozen=True, kw_only=True)
+class ArrayMetadata(Metadata, ABC):
+    shape: ChunkCoords
+    fill_value: Any
+    chunk_grid: ChunkGrid
+    attributes: dict[str, JSON]
+    zarr_format: ZarrFormat
+
+    @property
+    @abstractmethod
+    def dtype(self) -> np.dtype[Any]:
+        """The numpy dtype of the array this metadata describes."""
+
+    @property
+    @abstractmethod
+    def ndim(self) -> int:
+        """The number of dimensions of the array this metadata describes."""
+
+    @abstractmethod
+    def get_chunk_spec(
+        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
+    ) -> ArraySpec:
+        """Return the ArraySpec for a chunk, using the given order and buffer prototype."""
+
+    @abstractmethod
+    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
+        """Return the string key that identifies the chunk at `chunk_coords`."""
+
+    @abstractmethod
+    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
+        """Serialize this metadata to a dict of buffers keyed by string."""
+
+    @abstractmethod
+    def update_shape(self, shape: ChunkCoords) -> Self:
+        """Return metadata of the same type with `shape` as its shape."""
+
+    @abstractmethod
+    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
+        """Return metadata of the same type with `attributes` as its attributes."""
+
+
+def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
+    """Return ``data`` as-is, or a new empty dict when ``data`` is ``None``."""
+    # No validation happens here; a non-None input is the very object returned,
+    # not a copy.
+    return {} if data is None else data
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
new file mode 100644
index 000000000..813adeff5
--- /dev/null
+++ b/src/zarr/core/metadata/v2.py
@@ -0,0 +1,205 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Any, Literal
+
+    import numpy.typing as npt
+    from typing_extensions import Self
+
+    from zarr.core.buffer import Buffer, BufferPrototype
+    from zarr.core.common import JSON, ChunkCoords
+
+import json
+from dataclasses import dataclass, field, replace
+
+import numpy as np
+
+from zarr.core.array_spec import ArraySpec
+from zarr.core.chunk_grids import RegularChunkGrid
+from zarr.core.chunk_key_encodings import parse_separator
+from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike
+from zarr.core.config import config, parse_indexing_order
+from zarr.core.metadata.common import ArrayMetadata, parse_attributes
+
+
+@dataclass(frozen=True, kw_only=True)
+class ArrayV2Metadata(ArrayMetadata):
+    shape: ChunkCoords
+    chunk_grid: RegularChunkGrid
+    data_type: np.dtype[Any]
+    fill_value: None | int | float = 0
+    order: Literal["C", "F"] = "C"
+    filters: list[dict[str, JSON]] | None = None
+    dimension_separator: Literal[".", "/"] = "."
+    compressor: dict[str, JSON] | None = None
+    attributes: dict[str, JSON] = field(default_factory=dict)
+    zarr_format: Literal[2] = field(init=False, default=2)
+
+    def __init__(
+        self,
+        *,
+        shape: ChunkCoords,
+        dtype: npt.DTypeLike,
+        chunks: ChunkCoords,
+        fill_value: Any,
+        order: Literal["C", "F"],
+        dimension_separator: Literal[".", "/"] = ".",
+        compressor: dict[str, JSON] | None = None,
+        filters: list[dict[str, JSON]] | None = None,
+        attributes: dict[str, JSON] | None = None,
+    ):
+        """
+        Metadata for a Zarr version 2 array.
+        """
+        shape_parsed = parse_shapelike(shape)
+        data_type_parsed = parse_dtype(dtype)
+        chunks_parsed = parse_shapelike(chunks)
+        compressor_parsed = parse_compressor(compressor)
+        order_parsed = parse_indexing_order(order)
+        dimension_separator_parsed = parse_separator(dimension_separator)
+        filters_parsed = parse_filters(filters)
+        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
+        attributes_parsed = parse_attributes(attributes)
+
+        object.__setattr__(self, "shape", shape_parsed)
+        object.__setattr__(self, "data_type", data_type_parsed)
+        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
+        object.__setattr__(self, "compressor", compressor_parsed)
+        object.__setattr__(self, "order", order_parsed)
+        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
+        object.__setattr__(self, "filters", filters_parsed)
+        object.__setattr__(self, "fill_value", fill_value_parsed)
+        object.__setattr__(self, "attributes", attributes_parsed)
+
+        # ensure that the metadata document is consistent
+        _ = parse_metadata(self)
+
+    @property
+    def ndim(self) -> int:
+        return len(self.shape)
+
+    @property
+    def dtype(self) -> np.dtype[Any]:
+        return self.data_type
+
+    @property
+    def chunks(self) -> ChunkCoords:
+        return self.chunk_grid.chunk_shape
+
+    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
+        def _json_convert(
+            o: Any,
+        ) -> Any:
+            if isinstance(o, np.dtype):
+                if o.fields is None:
+                    return o.str
+                else:
+                    return o.descr
+            if np.isscalar(o):
+                # convert numpy scalar to python type, and pass
+                # python types through
+                return getattr(o, "item", lambda: o)()
+            raise TypeError
+
+        zarray_dict = self.to_dict()
+        assert isinstance(zarray_dict, dict)
+        zattrs_dict = zarray_dict.pop("attributes", {})
+        assert isinstance(zattrs_dict, dict)
+        json_indent = config.get("json_indent")
+        return {
+            ZARRAY_JSON: prototype.buffer.from_bytes(
+                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
+            ),
+            ZATTRS_JSON: prototype.buffer.from_bytes(
+                json.dumps(zattrs_dict, indent=json_indent).encode()
+            ),
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
+        # make a copy to protect the original from modification
+        _data = data.copy()
+        # check that the zarr_format attribute is correct
+        _ = parse_zarr_format_v2(_data.pop("zarr_format"))
+        return cls(**_data)
+
+    def to_dict(self) -> JSON:
+        zarray_dict = super().to_dict()
+
+        assert isinstance(zarray_dict, dict)
+
+        _ = zarray_dict.pop("chunk_grid")
+        zarray_dict["chunks"] = self.chunk_grid.chunk_shape
+
+        _ = zarray_dict.pop("data_type")
+        zarray_dict["dtype"] = self.data_type.str
+
+        return zarray_dict
+
+    def get_chunk_spec(
+        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
+    ) -> ArraySpec:
+        return ArraySpec(
+            shape=self.chunk_grid.chunk_shape,
+            dtype=self.dtype,
+            fill_value=self.fill_value,
+            order=order,
+            prototype=prototype,
+        )
+
+    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
+        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
+        return "0" if chunk_identifier == "" else chunk_identifier
+
+    def update_shape(self, shape: ChunkCoords) -> Self:
+        return replace(self, shape=shape)
+
+    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
+        return replace(self, attributes=attributes)
+
+
+def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]:
+    if data != 2:
+        raise ValueError(f"Invalid value. Expected 2. Got {data}.")
+    return data
+
+
+def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
+    return data  # todo: real validation; for now the input is returned unchanged
+
+
+def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
+    return data  # todo: real validation; for now the input is returned unchanged
+
+
+def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
+    """Return ``data`` after checking that `chunks` and `shape` have equal length."""
+    n_chunks, n_shape = len(data.chunks), len(data.shape)
+    if n_chunks == n_shape:
+        return data
+    msg = "The `shape` and `chunks` attributes must have the same length. "
+    msg += f"`chunks` has length {n_chunks}, but `shape` has length {n_shape}."
+    raise ValueError(msg)
+
+
+def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
+    """Parse a potential fill value into a value that is compatible with the provided dtype.
+
+    This is a light wrapper around zarr.v2.util.normalize_fill_value.
+
+    Parameters
+    ----------
+    fill_value: Any
+        A potential fill value.
+    dtype: np.dtype[Any]
+        A numpy dtype.
+
+    Returns
+    -------
+    An instance of `dtype`, or `None`, or any python object (in the case of an object dtype)
+    """
+    from zarr.v2.util import normalize_fill_value
+
+    return normalize_fill_value(fill_value=fill_value, dtype=dtype)
diff --git a/src/zarr/core/metadata.py b/src/zarr/core/metadata/v3.py
similarity index 61%
rename from src/zarr/core/metadata.py
rename to src/zarr/core/metadata/v3.py
index d25559cd5..195c3bd0a 100644
--- a/src/zarr/core/metadata.py
+++ b/src/zarr/core/metadata/v3.py
@@ -1,156 +1,73 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING, cast, overload
+
+if TYPE_CHECKING:
+    import numpy.typing as npt
+    from typing_extensions import Self
+
+    from zarr.core.buffer import Buffer, BufferPrototype
+    from zarr.core.chunk_grids import ChunkGrid
+    from zarr.core.common import JSON, ChunkCoords
+
 import json
-from abc import ABC, abstractmethod
 from collections.abc import Iterable, Sequence
 from dataclasses import dataclass, field, replace
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Literal, cast, overload
+from typing import Any, Literal
 
+import numcodecs.abc
 import numpy as np
-import numpy.typing as npt
 
 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, CodecPipeline
-from zarr.abc.metadata import Metadata
-from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype
+from zarr.core.array_spec import ArraySpec
+from zarr.core.buffer import default_buffer_prototype
 from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid
-from zarr.core.chunk_key_encodings import ChunkKeyEncoding, parse_separator
+from zarr.core.chunk_key_encodings import ChunkKeyEncoding
+from zarr.core.common import ZARR_JSON, parse_dtype, parse_named_configuration, parse_shapelike
+from zarr.core.config import config
+from zarr.core.metadata.common import ArrayMetadata, parse_attributes
 from zarr.registry import get_codec_class, get_pipeline_class
 
-if TYPE_CHECKING:
-    from typing_extensions import Self
-
-import numcodecs.abc
-
-from zarr.core.array_spec import ArraySpec
-from zarr.core.common import (
-    JSON,
-    ZARR_JSON,
-    ZARRAY_JSON,
-    ZATTRS_JSON,
-    ChunkCoords,
-    ZarrFormat,
-    parse_dtype,
-    parse_named_configuration,
-    parse_shapelike,
-)
-from zarr.core.config import config, parse_indexing_order
-
-# For type checking
-_bool = bool
 
-__all__ = ["ArrayMetadata"]
-
-
-class DataType(Enum):
-    bool = "bool"
-    int8 = "int8"
-    int16 = "int16"
-    int32 = "int32"
-    int64 = "int64"
-    uint8 = "uint8"
-    uint16 = "uint16"
-    uint32 = "uint32"
-    uint64 = "uint64"
-    float32 = "float32"
-    float64 = "float64"
-
-    @property
-    def byte_count(self) -> int:
-        data_type_byte_counts = {
-            DataType.bool: 1,
-            DataType.int8: 1,
-            DataType.int16: 2,
-            DataType.int32: 4,
-            DataType.int64: 8,
-            DataType.uint8: 1,
-            DataType.uint16: 2,
-            DataType.uint32: 4,
-            DataType.uint64: 8,
-            DataType.float32: 4,
-            DataType.float64: 8,
-        }
-        return data_type_byte_counts[self]
-
-    @property
-    def has_endianness(self) -> _bool:
-        # This might change in the future, e.g. for a complex with 2 8-bit floats
-        return self.byte_count != 1
-
-    def to_numpy_shortname(self) -> str:
-        data_type_to_numpy = {
-            DataType.bool: "bool",
-            DataType.int8: "i1",
-            DataType.int16: "i2",
-            DataType.int32: "i4",
-            DataType.int64: "i8",
-            DataType.uint8: "u1",
-            DataType.uint16: "u2",
-            DataType.uint32: "u4",
-            DataType.uint64: "u8",
-            DataType.float32: "f4",
-            DataType.float64: "f8",
-        }
-        return data_type_to_numpy[self]
-
-    @classmethod
-    def from_dtype(cls, dtype: np.dtype[Any]) -> DataType:
-        dtype_to_data_type = {
-            "|b1": "bool",
-            "bool": "bool",
-            "|i1": "int8",
-            "<i2": "int16",
-            "<i4": "int32",
-            "<i8": "int64",
-            "|u1": "uint8",
-            "<u2": "uint16",
-            "<u4": "uint32",
-            "<u8": "uint64",
-            "<f4": "float32",
-            "<f8": "float64",
-        }
-        return DataType[dtype_to_data_type[dtype.str]]
+def parse_zarr_format(data: Literal[3]) -> Literal[3]:
+    if data != 3:
+        raise ValueError(f"Invalid value. Expected 3. Got {data}.")
+    return data
 
 
-@dataclass(frozen=True, kw_only=True)
-class ArrayMetadata(Metadata, ABC):
-    shape: ChunkCoords
-    fill_value: Any
-    chunk_grid: ChunkGrid
-    attributes: dict[str, JSON]
-    zarr_format: ZarrFormat
+def parse_node_type_array(data: Literal["array"]) -> Literal["array"]:
+    if data != "array":
+        raise ValueError(f"Invalid value. Expected 'array'. Got {data}.")
+    return data
 
-    @property
-    @abstractmethod
-    def dtype(self) -> np.dtype[Any]:
-        pass
 
-    @property
-    @abstractmethod
-    def ndim(self) -> int:
-        pass
+def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]:
+    out: tuple[Codec, ...] = ()
 
-    @abstractmethod
-    def get_chunk_spec(
-        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
-    ) -> ArraySpec:
-        pass
+    if not isinstance(data, Iterable):
+        raise TypeError(f"Expected iterable, got {type(data)}")
 
-    @abstractmethod
-    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
-        pass
+    for c in data:
+        if isinstance(
+            c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec
+        ):  # Can't use Codec here because of mypy limitation
+            out += (c,)
+        else:
+            name_parsed, _ = parse_named_configuration(c, require_configuration=False)
+            out += (get_codec_class(name_parsed).from_dict(c),)
 
-    @abstractmethod
-    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
-        pass
+    return out
 
-    @abstractmethod
-    def update_shape(self, shape: ChunkCoords) -> Self:
-        pass
 
-    @abstractmethod
-    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
-        pass
+def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None:
+    """Return `data` as a tuple of str/None names (None passes through); else TypeError."""
+    if data is None:
+        return data
+    names = tuple(data)  # materialize once: a one-shot iterator would be spent by all()
+    if all(isinstance(x, type(None) | str) for x in names):
+        return names
+    raise TypeError(f"Expected either None or a iterable of str, got {type(data)}")
 
 
 @dataclass(frozen=True, kw_only=True)
@@ -186,7 +103,7 @@ def __init__(
         chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid)
         chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding)
         dimension_names_parsed = parse_dimension_names(dimension_names)
-        fill_value_parsed = parse_fill_value_v3(fill_value, dtype=data_type_parsed)
+        fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed)
         attributes_parsed = parse_attributes(attributes)
         codecs_parsed_partial = parse_codecs(codecs)
 
@@ -294,7 +211,7 @@ def from_dict(cls, data: dict[str, JSON]) -> ArrayV3Metadata:
         _data = data.copy()
         # TODO: Remove the type: ignores[] comments below and use a TypedDict to type `data`
         # check that the zarr_format attribute is correct
-        _ = parse_zarr_format_v3(_data.pop("zarr_format"))  # type: ignore[arg-type]
+        _ = parse_zarr_format(_data.pop("zarr_format"))  # type: ignore[arg-type]
         # check that the node_type attribute is correct
         _ = parse_node_type_array(_data.pop("node_type"))  # type: ignore[arg-type]
 
@@ -323,250 +240,14 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
         return replace(self, attributes=attributes)
 
 
-@dataclass(frozen=True, kw_only=True)
-class ArrayV2Metadata(ArrayMetadata):
-    shape: ChunkCoords
-    chunk_grid: RegularChunkGrid
-    data_type: np.dtype[Any]
-    fill_value: None | int | float = 0
-    order: Literal["C", "F"] = "C"
-    filters: list[dict[str, JSON]] | None = None
-    dimension_separator: Literal[".", "/"] = "."
-    compressor: dict[str, JSON] | None = None
-    attributes: dict[str, JSON] = field(default_factory=dict)
-    zarr_format: Literal[2] = field(init=False, default=2)
-
-    def __init__(
-        self,
-        *,
-        shape: ChunkCoords,
-        dtype: npt.DTypeLike,
-        chunks: ChunkCoords,
-        fill_value: Any,
-        order: Literal["C", "F"],
-        dimension_separator: Literal[".", "/"] = ".",
-        compressor: dict[str, JSON] | None = None,
-        filters: list[dict[str, JSON]] | None = None,
-        attributes: dict[str, JSON] | None = None,
-    ):
-        """
-        Metadata for a Zarr version 2 array.
-        """
-        shape_parsed = parse_shapelike(shape)
-        data_type_parsed = parse_dtype(dtype)
-        chunks_parsed = parse_shapelike(chunks)
-        compressor_parsed = parse_compressor(compressor)
-        order_parsed = parse_indexing_order(order)
-        dimension_separator_parsed = parse_separator(dimension_separator)
-        filters_parsed = parse_filters(filters)
-        fill_value_parsed = parse_fill_value_v2(fill_value, dtype=data_type_parsed)
-        attributes_parsed = parse_attributes(attributes)
-
-        object.__setattr__(self, "shape", shape_parsed)
-        object.__setattr__(self, "data_type", data_type_parsed)
-        object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed))
-        object.__setattr__(self, "compressor", compressor_parsed)
-        object.__setattr__(self, "order", order_parsed)
-        object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
-        object.__setattr__(self, "filters", filters_parsed)
-        object.__setattr__(self, "fill_value", fill_value_parsed)
-        object.__setattr__(self, "attributes", attributes_parsed)
-
-        # ensure that the metadata document is consistent
-        _ = parse_v2_metadata(self)
-
-    @property
-    def ndim(self) -> int:
-        return len(self.shape)
-
-    @property
-    def dtype(self) -> np.dtype[Any]:
-        return self.data_type
-
-    @property
-    def chunks(self) -> ChunkCoords:
-        return self.chunk_grid.chunk_shape
-
-    def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
-        def _json_convert(
-            o: Any,
-        ) -> Any:
-            if isinstance(o, np.dtype):
-                if o.fields is None:
-                    return o.str
-                else:
-                    return o.descr
-            if np.isscalar(o):
-                # convert numpy scalar to python type, and pass
-                # python types through
-                return getattr(o, "item", lambda: o)()
-            raise TypeError
-
-        zarray_dict = self.to_dict()
-        assert isinstance(zarray_dict, dict)
-        zattrs_dict = zarray_dict.pop("attributes", {})
-        assert isinstance(zattrs_dict, dict)
-        json_indent = config.get("json_indent")
-        return {
-            ZARRAY_JSON: prototype.buffer.from_bytes(
-                json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
-            ),
-            ZATTRS_JSON: prototype.buffer.from_bytes(
-                json.dumps(zattrs_dict, indent=json_indent).encode()
-            ),
-        }
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
-        # make a copy to protect the original from modification
-        _data = data.copy()
-        # check that the zarr_format attribute is correct
-        _ = parse_zarr_format_v2(_data.pop("zarr_format"))
-        return cls(**_data)
-
-    def to_dict(self) -> JSON:
-        zarray_dict = super().to_dict()
-
-        assert isinstance(zarray_dict, dict)
-
-        _ = zarray_dict.pop("chunk_grid")
-        zarray_dict["chunks"] = self.chunk_grid.chunk_shape
-
-        _ = zarray_dict.pop("data_type")
-        zarray_dict["dtype"] = self.data_type.str
-
-        return zarray_dict
-
-    def get_chunk_spec(
-        self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
-    ) -> ArraySpec:
-        return ArraySpec(
-            shape=self.chunk_grid.chunk_shape,
-            dtype=self.dtype,
-            fill_value=self.fill_value,
-            order=order,
-            prototype=prototype,
-        )
-
-    def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
-        chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
-        return "0" if chunk_identifier == "" else chunk_identifier
-
-    def update_shape(self, shape: ChunkCoords) -> Self:
-        return replace(self, shape=shape)
-
-    def update_attributes(self, attributes: dict[str, JSON]) -> Self:
-        return replace(self, attributes=attributes)
-
-
-def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None:
-    if data is None:
-        return data
-    elif all(isinstance(x, type(None) | str) for x in data):
-        return tuple(data)
-    else:
-        msg = f"Expected either None or a iterable of str, got {type(data)}"
-        raise TypeError(msg)
-
-
-# todo: real validation
-def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]:
-    if data is None:
-        return {}
-
-    return data
-
-
-# todo: move to its own module and drop _v3 suffix
-# todo: consider folding all the literal parsing into a single function
-# that takes 2 arguments
-def parse_zarr_format_v3(data: Literal[3]) -> Literal[3]:
-    if data == 3:
-        return data
-    raise ValueError(f"Invalid value. Expected 3. Got {data}.")
-
-
-# todo: move to its own module and drop _v2 suffix
-def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]:
-    if data == 2:
-        return data
-    raise ValueError(f"Invalid value. Expected 2. Got {data}.")
-
-
-def parse_node_type_array(data: Literal["array"]) -> Literal["array"]:
-    if data == "array":
-        return data
-    raise ValueError(f"Invalid value. Expected 'array'. Got {data}.")
-
-
-# todo: real validation
-def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
-    return data
-
-
-# todo: real validation
-def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
-    return data
-
-
-def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
-    if (l_chunks := len(data.chunks)) != (l_shape := len(data.shape)):
-        msg = (
-            f"The `shape` and `chunks` attributes must have the same length. "
-            f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}."
-        )
-        raise ValueError(msg)
-    return data
-
-
 def create_pipeline(data: Iterable[Codec | JSON]) -> CodecPipeline:
     if not isinstance(data, Iterable):
         raise TypeError(f"Expected iterable, got {type(data)}")
     return get_pipeline_class().from_dict(data)
 
 
-def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]:
-    out: tuple[Codec, ...] = ()
-
-    if not isinstance(data, Iterable):
-        raise TypeError(f"Expected iterable, got {type(data)}")
-
-    for c in data:
-        if isinstance(
-            c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec
-        ):  # Can't use Codec here because of mypy limitation
-            out += (c,)
-        else:
-            name_parsed, _ = parse_named_configuration(c, require_configuration=False)
-            out += (get_codec_class(name_parsed).from_dict(c),)
-
-    return out
-
-
-def parse_fill_value_v2(fill_value: Any, dtype: np.dtype[Any]) -> Any:
-    """
-    Parse a potential fill value into a value that is compatible with the provided dtype.
-
-    This is a light wrapper around zarr.v2.util.normalize_fill_value.
-
-    Parameters
-    ----------
-    fill_value: Any
-        A potential fill value.
-    dtype: np.dtype[Any]
-        A numpy dtype.
-
-    Returns
-        An instance of `dtype`, or `None`, or any python object (in the case of an object dtype)
-    """
-    from zarr.v2.util import normalize_fill_value
-
-    return normalize_fill_value(fill_value=fill_value, dtype=dtype)
-
-
 BOOL = np.bool_
 BOOL_DTYPE = np.dtypes.BoolDType
-
 INTEGER_DTYPE = (
     np.dtypes.Int8DType
     | np.dtypes.Int16DType
@@ -577,33 +258,31 @@ def parse_fill_value_v2(fill_value: Any, dtype: np.dtype[Any]) -> Any:
     | np.dtypes.UInt32DType
     | np.dtypes.UInt64DType
 )
-
 INTEGER = np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64
 FLOAT_DTYPE = np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType
 FLOAT = np.float16 | np.float32 | np.float64
 COMPLEX_DTYPE = np.dtypes.Complex64DType | np.dtypes.Complex128DType
 COMPLEX = np.complex64 | np.complex128
-# todo: r* dtypes
 
 
 @overload
-def parse_fill_value_v3(fill_value: Any, dtype: BOOL_DTYPE) -> BOOL: ...
+def parse_fill_value(fill_value: Any, dtype: BOOL_DTYPE) -> BOOL: ...
 
 
 @overload
-def parse_fill_value_v3(fill_value: Any, dtype: INTEGER_DTYPE) -> INTEGER: ...
+def parse_fill_value(fill_value: Any, dtype: INTEGER_DTYPE) -> INTEGER: ...
 
 
 @overload
-def parse_fill_value_v3(fill_value: Any, dtype: FLOAT_DTYPE) -> FLOAT: ...
+def parse_fill_value(fill_value: Any, dtype: FLOAT_DTYPE) -> FLOAT: ...
 
 
 @overload
-def parse_fill_value_v3(fill_value: Any, dtype: COMPLEX_DTYPE) -> COMPLEX: ...
+def parse_fill_value(fill_value: Any, dtype: COMPLEX_DTYPE) -> COMPLEX: ...
 
 
 @overload
-def parse_fill_value_v3(fill_value: Any, dtype: np.dtype[Any]) -> Any:
+def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
     # This dtype[Any] is unfortunately necessary right now.
     # See https://github.com/zarr-developers/zarr-python/issues/2131#issuecomment-2318010899
     # for more details, but `dtype` here (which comes from `parse_dtype`)
@@ -614,7 +293,7 @@ def parse_fill_value_v3(fill_value: Any, dtype: np.dtype[Any]) -> Any:
     ...
 
 
-def parse_fill_value_v3(
+def parse_fill_value(
     fill_value: Any, dtype: BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | np.dtype[Any]
 ) -> BOOL | INTEGER | FLOAT | COMPLEX | Any:
     """
@@ -655,3 +334,77 @@ def parse_fill_value_v3(
         msg = f"Cannot parse non-string sequence {fill_value} as a scalar with type {dtype}."
         raise TypeError(msg)
     return dtype.type(fill_value)
+
+
+# For type checking
+_bool = bool
+
+
+class DataType(Enum):
+    bool = "bool"
+    int8 = "int8"
+    int16 = "int16"
+    int32 = "int32"
+    int64 = "int64"
+    uint8 = "uint8"
+    uint16 = "uint16"
+    uint32 = "uint32"
+    uint64 = "uint64"
+    float32 = "float32"
+    float64 = "float64"
+
+    @property
+    def byte_count(self) -> int:
+        data_type_byte_counts = {
+            DataType.bool: 1,
+            DataType.int8: 1,
+            DataType.int16: 2,
+            DataType.int32: 4,
+            DataType.int64: 8,
+            DataType.uint8: 1,
+            DataType.uint16: 2,
+            DataType.uint32: 4,
+            DataType.uint64: 8,
+            DataType.float32: 4,
+            DataType.float64: 8,
+        }
+        return data_type_byte_counts[self]
+
+    @property
+    def has_endianness(self) -> _bool:
+        # This might change in the future, e.g. for a complex with 2 8-bit floats
+        return self.byte_count != 1
+
+    def to_numpy_shortname(self) -> str:
+        data_type_to_numpy = {
+            DataType.bool: "bool",
+            DataType.int8: "i1",
+            DataType.int16: "i2",
+            DataType.int32: "i4",
+            DataType.int64: "i8",
+            DataType.uint8: "u1",
+            DataType.uint16: "u2",
+            DataType.uint32: "u4",
+            DataType.uint64: "u8",
+            DataType.float32: "f4",
+            DataType.float64: "f8",
+        }
+        return data_type_to_numpy[self]
+
+    @classmethod
+    def from_dtype(cls, dtype: np.dtype[Any]) -> DataType:
+        dtype_to_data_type = {
+            "|b1": "bool",
+            "bool": "bool",
+            "|i1": "int8",
+            "<i2": "int16",
+            "<i4": "int32",
+            "<i8": "int64",
+            "|u1": "uint8",
+            "<u2": "uint16",
+            "<u4": "uint32",
+            "<u8": "uint64",
+            "<f4": "float32",
+            "<f8": "float64",
+        }
+        return DataType[dtype_to_data_type[dtype.str]]
diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py
index 0053de08c..e06e345a6 100644
--- a/tests/v3/test_metadata/test_v2.py
+++ b/tests/v3/test_metadata/test_v2.py
@@ -2,6 +2,8 @@
 
 from typing import TYPE_CHECKING, Literal
 
+from zarr.core.metadata.v2 import ArrayV2Metadata
+
 if TYPE_CHECKING:
     from typing import Any
 
@@ -10,7 +12,7 @@
 import pytest
 
 from zarr.codecs import GzipCodec
-from zarr.core.metadata import ArrayV2Metadata, parse_zarr_format_v2
+from zarr.core.metadata.v2 import parse_zarr_format_v2
 
 
 def test_parse_zarr_format_valid() -> None:
diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py
index bc43154a5..50f3356b3 100644
--- a/tests/v3/test_metadata/test_v3.py
+++ b/tests/v3/test_metadata/test_v3.py
@@ -7,6 +7,7 @@
 from zarr.codecs.bytes import BytesCodec
 from zarr.core.buffer import default_buffer_prototype
 from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding
+from zarr.core.metadata.v3 import ArrayV3Metadata
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -18,9 +19,7 @@
 import numpy as np
 import pytest
 
-from zarr.core.metadata import ArrayV3Metadata, parse_dimension_names
-from zarr.core.metadata import parse_fill_value_v3 as parse_fill_value
-from zarr.core.metadata import parse_zarr_format_v3 as parse_zarr_format
+from zarr.core.metadata.v3 import parse_dimension_names, parse_fill_value, parse_zarr_format
 
 bool_dtypes = ("bool",)
 

From c445593837c9cabbaf8beb2c512c33b1d38eb53b Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Thu, 12 Sep 2024 20:14:46 +0200
Subject: [PATCH 2/5] add more explicit typeguards

---
 src/zarr/core/metadata/v2.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index 813adeff5..fe6e050c8 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -104,9 +104,12 @@ def _json_convert(
             raise TypeError
 
         zarray_dict = self.to_dict()
-        assert isinstance(zarray_dict, dict)
+
+        # todo: remove this check when we can ensure that to_dict always returns dicts.
+        if not isinstance(zarray_dict, dict):
+            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
+
         zattrs_dict = zarray_dict.pop("attributes", {})
-        assert isinstance(zattrs_dict, dict)
         json_indent = config.get("json_indent")
         return {
             ZARRAY_JSON: prototype.buffer.from_bytes(
@@ -128,7 +131,9 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
     def to_dict(self) -> JSON:
         zarray_dict = super().to_dict()
 
-        assert isinstance(zarray_dict, dict)
+        # todo: remove this check when we can ensure that to_dict always returns dicts.
+        if not isinstance(zarray_dict, dict):
+            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
 
         _ = zarray_dict.pop("chunk_grid")
         zarray_dict["chunks"] = self.chunk_grid.chunk_shape

From a179c1eb9ea5ccb06e1b78f9b3699c0c2add7115 Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Thu, 12 Sep 2024 20:21:25 +0200
Subject: [PATCH 3/5] port fill value normalization from v2

---
 src/zarr/core/metadata/v2.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index fe6e050c8..0dc63f1ff 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -193,8 +193,6 @@ def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
     """
     Parse a potential fill value into a value that is compatible with the provided dtype.
 
-    This is a light wrapper around zarr.v2.util.normalize_fill_value.
-
     Parameters
     ----------
     fill_value: Any
@@ -205,6 +203,33 @@ def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
     Returns
         An instance of `dtype`, or `None`, or any python object (in the case of an object dtype)
     """
-    from zarr.v2.util import normalize_fill_value
 
-    return normalize_fill_value(fill_value=fill_value, dtype=dtype)
+    if fill_value is None or dtype.hasobject:
+        # no fill value
+        pass
+    elif not isinstance(fill_value, np.void) and fill_value == 0:
+        # this should be compatible across numpy versions for any array type, including
+        # structured arrays
+        fill_value = np.zeros((), dtype=dtype)[()]
+
+    elif dtype.kind == "U":
+        # special case unicode because of encoding issues on Windows if passed through numpy
+        # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
+
+        if not isinstance(fill_value, str):
+            raise ValueError(
+                f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
+            )
+    else:
+        try:
+            if isinstance(fill_value, bytes) and dtype.kind == "V":
+                # special case for numpy 1.14 compatibility
+                fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()]
+            else:
+                fill_value = np.array(fill_value, dtype=dtype)[()]
+
+        except Exception as e:
+            msg = f"Fill_value {fill_value} is not valid for dtype {dtype}."
+            raise ValueError(msg) from e
+
+    return fill_value

From 700d9b820ffec96420471052678516e02479254d Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Thu, 12 Sep 2024 20:22:01 +0200
Subject: [PATCH 4/5] remove v2 suffix from zarr format parsing

---
 src/zarr/core/metadata/v2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index 0dc63f1ff..6d5ecd7e8 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -125,7 +125,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
         # make a copy to protect the original from modification
         _data = data.copy()
         # check that the zarr_format attribute is correct
-        _ = parse_zarr_format_v2(_data.pop("zarr_format"))
+        _ = parse_zarr_format(_data.pop("zarr_format"))
         return cls(**_data)
 
     def to_dict(self) -> JSON:
@@ -165,7 +165,7 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
         return replace(self, attributes=attributes)
 
 
-def parse_zarr_format_v2(data: Literal[2]) -> Literal[2]:
+def parse_zarr_format(data: Literal[2]) -> Literal[2]:
     if data == 2:
         return data
     raise ValueError(f"Invalid value. Expected 2. Got {data}.")

From 5994652c26ae7743d5a57c7d3642ff1bf837d4c7 Mon Sep 17 00:00:00 2001
From: Davis Vann Bennett <davis.v.bennett@gmail.com>
Date: Thu, 12 Sep 2024 20:22:16 +0200
Subject: [PATCH 5/5] remove v2 suffix from zarr format parsing in tests

---
 tests/v3/test_metadata/test_v2.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py
index e06e345a6..4465a8647 100644
--- a/tests/v3/test_metadata/test_v2.py
+++ b/tests/v3/test_metadata/test_v2.py
@@ -12,17 +12,17 @@
 import pytest
 
 from zarr.codecs import GzipCodec
-from zarr.core.metadata.v2 import parse_zarr_format_v2
+from zarr.core.metadata.v2 import parse_zarr_format
 
 
 def test_parse_zarr_format_valid() -> None:
-    assert parse_zarr_format_v2(2) == 2
+    assert parse_zarr_format(2) == 2
 
 
 @pytest.mark.parametrize("data", [None, 1, 3, 4, 5, "3"])
 def test_parse_zarr_format_invalid(data: Any) -> None:
     with pytest.raises(ValueError, match=f"Invalid value. Expected 2. Got {data}"):
-        parse_zarr_format_v2(data)
+        parse_zarr_format(data)
 
 
 @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}])