Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v2 / v3 compat] add Group.array and data kwarg to array creation #2042

Merged
merged 17 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions src/zarr/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from zarr.abc.store import set_or_delete
from zarr.attributes import Attributes
from zarr.buffer import BufferPrototype, NDArrayLike, NDBuffer, default_buffer_prototype
from zarr.chunk_grids import RegularChunkGrid
from zarr.chunk_grids import RegularChunkGrid, _guess_chunks
from zarr.chunk_key_encodings import ChunkKeyEncoding, DefaultChunkKeyEncoding, V2ChunkKeyEncoding
from zarr.codecs import BytesCodec
from zarr.codecs._v2 import V2Compressor, V2Filters
Expand All @@ -37,6 +37,7 @@
product,
)
from zarr.config import config, parse_indexing_order
from zarr.errors import ContainsArrayError
from zarr.indexing import (
BasicIndexer,
BasicSelection,
Expand All @@ -62,6 +63,7 @@
)
from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
from zarr.store import StoreLike, StorePath, make_store_path
from zarr.store.core import contains_array
from zarr.sync import sync


Expand Down Expand Up @@ -137,12 +139,13 @@ async def create(
compressor: dict[str, JSON] | None = None,
# runtime
exists_ok: bool = False,
data: npt.ArrayLike | None = None,
) -> AsyncArray:
store_path = make_store_path(store)

if chunk_shape is None:
if chunks is None:
raise ValueError("Either chunk_shape or chunks needs to be provided.")
chunk_shape = chunks = _guess_chunks(shape=shape, typesize=np.dtype(dtype).itemsize)
chunk_shape = chunks
elif chunks is not None:
raise ValueError("Only one of chunk_shape or chunks must be provided.")
Expand All @@ -164,7 +167,7 @@ async def create(
raise ValueError(
"compressor cannot be used for arrays with version 3. Use bytes-to-bytes codecs instead."
)
return await cls._create_v3(
result = await cls._create_v3(
store_path,
shape=shape,
dtype=dtype,
Expand All @@ -187,7 +190,7 @@ async def create(
)
if dimension_names is not None:
raise ValueError("dimension_names cannot be used for arrays with version 2.")
return await cls._create_v2(
result = await cls._create_v2(
store_path,
shape=shape,
dtype=dtype,
Expand All @@ -203,6 +206,12 @@ async def create(
else:
raise ValueError(f"Insupported zarr_format. Got: {zarr_format}")

if data is not None:
# insert user-provided data
await result.setitem(..., data)

return result

@classmethod
async def _create_v3(
cls,
Expand All @@ -224,7 +233,8 @@ async def _create_v3(
exists_ok: bool = False,
) -> AsyncArray:
if not exists_ok:
assert not await (store_path / ZARR_JSON).exists()
if await contains_array(store_path=store_path, zarr_format=3):
raise ContainsArrayError(store_path.store, store_path.path)

codecs = list(codecs) if codecs is not None else [BytesCodec()]

Expand Down Expand Up @@ -280,7 +290,8 @@ async def _create_v2(
import numcodecs

if not exists_ok:
assert not await (store_path / ZARRAY_JSON).exists()
if await contains_array(store_path=store_path, zarr_format=2):
raise ContainsArrayError(store_path.store, store_path.path)

if order is None:
order = "C"
Expand Down
72 changes: 72 additions & 0 deletions src/zarr/chunk_grids.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from __future__ import annotations

import itertools
import math
import operator
from abc import abstractmethod
from collections.abc import Iterator
from dataclasses import dataclass
from functools import reduce
from typing import TYPE_CHECKING

import numpy as np

from zarr.abc.metadata import Metadata
from zarr.common import (
JSON,
Expand All @@ -22,6 +25,75 @@
from typing_extensions import Self


def _guess_chunks(
    shape: ChunkCoords | int,
    typesize: int,
    *,
    increment_bytes: int = 256 * 1024,
    min_bytes: int = 128 * 1024,
    max_bytes: int = 64 * 1024 * 1024,
) -> ChunkCoords:
    """
    Iteratively guess an appropriate chunk layout for an array, given its shape and
    the size of each element in bytes, and size constraints expressed in bytes. This logic is
    adapted from h5py.

    Parameters
    ----------
    shape: ChunkCoords | int
        The shape of the array to be chunked. A bare integer is treated as the shape of a
        one-dimensional array. Dimensions of length 0 are treated as length 1 so that
        every chunk dimension is non-zero.
    typesize: int
        The size, in bytes, of each element of the array.
    increment_bytes: int = 256 * 1024
        The number of bytes used to scale the target chunk size in bytes.
    min_bytes: int = 128 * 1024
        The soft lower bound on the final chunk size in bytes.
    max_bytes: int = 64 * 1024 * 1024
        The hard upper bound on the final chunk size in bytes. It can only be exceeded
        when a single element is itself at least ``max_bytes`` large.

    Returns
    -------
    ChunkCoords
        The guessed chunk shape, with one positive integer per dimension of ``shape``.
    """
    if isinstance(shape, (int, np.integer)):
        shape = (int(shape),)

    ndims = len(shape)
    # require chunks to have non-zero length for all dimensions
    chunks = np.maximum(np.array(shape, dtype="=f8"), 1)

    if typesize == 0:
        # Zero-sized elements occupy no bytes, so the whole array already satisfies
        # every size constraint. Returning here also avoids evaluating log10(0) below,
        # which emits a divide-by-zero RuntimeWarning.
        return tuple(int(x) for x in chunks)

    # Determine the optimal chunk size in bytes using a PyTables expression.
    # This is kept as a float.
    dset_size = np.prod(chunks) * typesize
    target_size = increment_bytes * (2 ** np.log10(dset_size / (1024.0 * 1024)))

    # Clamp the target into [min_bytes, max_bytes].
    if target_size > max_bytes:
        target_size = max_bytes
    elif target_size < min_bytes:
        target_size = min_bytes

    idx = 0
    while True:
        # Repeatedly loop over the axes, dividing them by 2. Stop when:
        # 1a. We're smaller than the target chunk size, OR
        # 1b. We're within 50% of the target chunk size, AND
        # 2. The chunk is smaller than the maximum chunk size

        n_elements = np.prod(chunks)
        chunk_bytes = n_elements * typesize

        if (
            chunk_bytes < target_size or abs(chunk_bytes - target_size) / target_size < 0.5
        ) and chunk_bytes < max_bytes:
            break

        if n_elements == 1:
            break  # Element size larger than max_bytes

        # Halve one axis per iteration, cycling through the axes in order; rounding up
        # keeps every axis at length >= 1.
        axis = idx % ndims
        chunks[axis] = math.ceil(chunks[axis] / 2.0)
        idx += 1

    return tuple(int(x) for x in chunks)


@dataclass(frozen=True)
class ChunkGrid(Metadata):
@classmethod
Expand Down
16 changes: 16 additions & 0 deletions src/zarr/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import Any


class _BaseZarrError(ValueError):
    """
    Base class for errors whose message is built from a class-level template.

    Subclasses override ``_msg`` with a ``str.format`` template; the positional
    arguments given to the constructor are substituted into that template and the
    resulting string becomes the exception message.
    """

    # Message template; positional constructor arguments fill its placeholders.
    _msg = ""

    def __init__(self, *args: Any) -> None:
        rendered = self._msg.format(*args)
        super().__init__(rendered)


class ContainsGroupError(_BaseZarrError):
    """
    Error reporting that a group exists at a given location.

    Construct with two positional arguments, the store and the path; their
    ``repr`` is interpolated into the message.
    """

    _msg = "A group exists in store {0!r} at path {1!r}."


class ContainsArrayError(_BaseZarrError):
    """
    Error reporting that an array exists at a given location.

    Construct with two positional arguments, the store and the path; their
    ``repr`` is interpolated into the message.
    """

    _msg = "An array exists in store {0!r} at path {1!r}."
Loading