Skip to content

Commit

Permalink
Remove read_scikit_allel_vcfzarr
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Oct 2, 2024
1 parent 54bec33 commit 2bea247
Show file tree
Hide file tree
Showing 4 changed files with 1 addition and 714 deletions.
2 changes: 0 additions & 2 deletions sgkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from .display import display_genotypes, display_pedigree
from .distance.api import pairwise_distance
from .io.dataset import load_dataset, save_dataset
from .io.vcfzarr_reader import read_scikit_allel_vcfzarr
from .model import (
DIM_ALLELE,
DIM_PLOIDY,
Expand Down Expand Up @@ -94,7 +93,6 @@
"genee",
"genomic_relationship",
"gwas_linear_regression",
"read_scikit_allel_vcfzarr",
"regenie",
"regenie_loco_regression",
"hardy_weinberg_test",
Expand Down
110 changes: 1 addition & 109 deletions sgkit/io/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
from typing import Mapping, Optional, Tuple

import dask.array as da
import dask.dataframe as dd
import numpy as np
import zarr

from ..typing import ArrayLike, DType
from ..utils import encode_array, max_str_len
Expand Down Expand Up @@ -52,109 +50,3 @@ def encode_contigs(contig: ArrayLike) -> Tuple[ArrayLike, ArrayLike]:
else:
ids, names = encode_array(np.asarray(contig, dtype=str))
return ids, names


def concatenate_and_rechunk(
zarrs: Sequence[zarr.Array],
chunks: Optional[Tuple[int, ...]] = None,
dtype: DType = None,
) -> da.Array:
"""Perform a concatenate and rechunk operation on a collection of Zarr arrays
to produce an array with a uniform chunking, suitable for saving as
a single Zarr array.
In contrast to Dask's ``rechunk`` method, the Dask computation graph
is embarrassingly parallel and will make efficient use of memory,
since no Zarr chunks are cached by the Dask scheduler.
The Zarr arrays must have matching shapes except in the first
dimension.
Parameters
----------
zarrs
Collection of Zarr arrays to concatenate.
chunks : Optional[Tuple[int, ...]], optional
The chunks to apply to the concatenated arrays. If not specified
the chunks for the first array will be applied to the concatenated
array.
dtype
The dtype of the concatenated array, by default the same as the
first array.
Returns
-------
A Dask array, suitable for saving as a single Zarr array.
Raises
------
ValueError
If the Zarr arrays do not have matching shapes (except in the first
dimension).
"""

if len(set([z.shape[1:] for z in zarrs])) > 1:
shapes = [z.shape for z in zarrs]
raise ValueError(
f"Zarr arrays must have matching shapes (except in the first dimension): {shapes}"
)

lengths = np.array([z.shape[0] for z in zarrs])
lengths0 = np.insert(lengths, 0, 0, axis=0) # type: ignore[no-untyped-call]
offsets = np.cumsum(lengths0)
total_length = offsets[-1]

shape = (total_length, *zarrs[0].shape[1:])
chunks = chunks or zarrs[0].chunks
dtype = dtype or zarrs[0].dtype

ar = da.empty(shape, chunks=chunks)

def load_chunk(
x: ArrayLike,
zarrs: Sequence[zarr.Array],
offsets: ArrayLike,
block_info: Dict[Any, Any],
) -> ArrayLike:
return _slice_zarrs(zarrs, offsets, block_info[0]["array-location"])

return ar.map_blocks(load_chunk, zarrs=zarrs, offsets=offsets, dtype=dtype)


def _zarr_index(offsets: ArrayLike, pos: int) -> int:
"""Return the index of the zarr file that pos falls in"""
index: int = np.searchsorted(offsets, pos, side="right") - 1 # type: ignore[assignment]
return index


def _slice_zarrs(
zarrs: Sequence[zarr.Array], offsets: ArrayLike, locs: Sequence[Tuple[int, ...]]
) -> ArrayLike:
"""Slice concatenated zarrs by locs"""
# convert array locations to slices
locs = [slice(*loc) for loc in locs] # type: ignore[misc]
# determine which zarr files are needed
start, stop = locs[0].start, locs[0].stop # type: ignore[attr-defined] # stack on first axis
i0 = _zarr_index(offsets, start)
i1 = _zarr_index(offsets, stop)
if i0 == i1: # within a single zarr file
sel = slice(start - offsets[i0], stop - offsets[i0])
return zarrs[i0][(sel, *locs[1:])]
else: # more than one zarr file
slices = []
slices.append((i0, slice(start - offsets[i0], None)))
for i in range(i0 + 1, i1): # entire zarr
slices.append((i, slice(None)))
if stop > offsets[i1]:
slices.append((i1, slice(0, stop - offsets[i1])))
parts = [zarrs[i][(sel, *locs[1:])] for (i, sel) in slices]
return np.concatenate(parts) # type: ignore[no-untyped-call]


def str_is_int(x: str) -> bool:
"""Test if a string can be parsed as an int"""
try:
int(x)
return True
except ValueError:
return False
Loading

0 comments on commit 2bea247

Please sign in to comment.