Remove read_scikit_allel_vcfzarr

sgkit-dev · Oct 2, 2024 · 2bea247 · 2bea247
1 parent 54bec33
commit 2bea247
Show file tree

Hide file tree

Showing 4 changed files with 1 addition and 714 deletions.
diff --git a/sgkit/__init__.py b/sgkit/__init__.py
@@ -3,7 +3,6 @@
 from .display import display_genotypes, display_pedigree
 from .distance.api import pairwise_distance
 from .io.dataset import load_dataset, save_dataset
-from .io.vcfzarr_reader import read_scikit_allel_vcfzarr
 from .model import (
     DIM_ALLELE,
     DIM_PLOIDY,
@@ -94,7 +93,6 @@
     "genee",
     "genomic_relationship",
     "gwas_linear_regression",
-    "read_scikit_allel_vcfzarr",
     "regenie",
     "regenie_loco_regression",
     "hardy_weinberg_test",

diff --git a/sgkit/io/utils.py b/sgkit/io/utils.py
@@ -1,9 +1,7 @@
-from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
+from typing import Mapping, Optional, Tuple
 
-import dask.array as da
 import dask.dataframe as dd
 import numpy as np
-import zarr
 
 from ..typing import ArrayLike, DType
 from ..utils import encode_array, max_str_len
@@ -52,109 +50,3 @@ def encode_contigs(contig: ArrayLike) -> Tuple[ArrayLike, ArrayLike]:
     else:
         ids, names = encode_array(np.asarray(contig, dtype=str))
     return ids, names
-
-
-def concatenate_and_rechunk(
-    zarrs: Sequence[zarr.Array],
-    chunks: Optional[Tuple[int, ...]] = None,
-    dtype: DType = None,
-) -> da.Array:
-    """Perform a concatenate and rechunk operation on a collection of Zarr arrays
-    to produce an array with a uniform chunking, suitable for saving as
-    a single Zarr array.
-
-    In contrast to Dask's ``rechunk`` method, the Dask computation graph
-    is embarrassingly parallel and will make efficient use of memory,
-    since no Zarr chunks are cached by the Dask scheduler.
-
-    The Zarr arrays must have matching shapes except in the first
-    dimension.
-
-    Parameters
-    ----------
-    zarrs
-        Collection of Zarr arrays to concatenate.
-    chunks : Optional[Tuple[int, ...]], optional
-        The chunks to apply to the concatenated arrays. If not specified
-        the chunks for the first array will be applied to the concatenated
-        array.
-    dtype
-        The dtype of the concatenated array, by default the same as the
-        first array.
-
-    Returns
-    -------
-    A Dask array, suitable for saving as a single Zarr array.
-
-    Raises
-    ------
-    ValueError
-        If the Zarr arrays do not have matching shapes (except in the first
-        dimension).
-    """
-
-    if len(set([z.shape[1:] for z in zarrs])) > 1:
-        shapes = [z.shape for z in zarrs]
-        raise ValueError(
-            f"Zarr arrays must have matching shapes (except in the first dimension): {shapes}"
-        )
-
-    lengths = np.array([z.shape[0] for z in zarrs])
-    lengths0 = np.insert(lengths, 0, 0, axis=0)  # type: ignore[no-untyped-call]
-    offsets = np.cumsum(lengths0)
-    total_length = offsets[-1]
-
-    shape = (total_length, *zarrs[0].shape[1:])
-    chunks = chunks or zarrs[0].chunks
-    dtype = dtype or zarrs[0].dtype
-
-    ar = da.empty(shape, chunks=chunks)
-
-    def load_chunk(
-        x: ArrayLike,
-        zarrs: Sequence[zarr.Array],
-        offsets: ArrayLike,
-        block_info: Dict[Any, Any],
-    ) -> ArrayLike:
-        return _slice_zarrs(zarrs, offsets, block_info[0]["array-location"])
-
-    return ar.map_blocks(load_chunk, zarrs=zarrs, offsets=offsets, dtype=dtype)
-
-
-def _zarr_index(offsets: ArrayLike, pos: int) -> int:
-    """Return the index of the zarr file that pos falls in"""
-    index: int = np.searchsorted(offsets, pos, side="right") - 1  # type: ignore[assignment]
-    return index
-
-
-def _slice_zarrs(
-    zarrs: Sequence[zarr.Array], offsets: ArrayLike, locs: Sequence[Tuple[int, ...]]
-) -> ArrayLike:
-    """Slice concatenated zarrs by locs"""
-    # convert array locations to slices
-    locs = [slice(*loc) for loc in locs]  # type: ignore[misc]
-    # determine which zarr files are needed
-    start, stop = locs[0].start, locs[0].stop  # type: ignore[attr-defined] # stack on first axis
-    i0 = _zarr_index(offsets, start)
-    i1 = _zarr_index(offsets, stop)
-    if i0 == i1:  # within a single zarr file
-        sel = slice(start - offsets[i0], stop - offsets[i0])
-        return zarrs[i0][(sel, *locs[1:])]
-    else:  # more than one zarr file
-        slices = []
-        slices.append((i0, slice(start - offsets[i0], None)))
-        for i in range(i0 + 1, i1):  # entire zarr
-            slices.append((i, slice(None)))
-        if stop > offsets[i1]:
-            slices.append((i1, slice(0, stop - offsets[i1])))
-        parts = [zarrs[i][(sel, *locs[1:])] for (i, sel) in slices]
-        return np.concatenate(parts)  # type: ignore[no-untyped-call]
-
-
-def str_is_int(x: str) -> bool:
-    """Test if a string can be parsed as an int"""
-    try:
-        int(x)
-        return True
-    except ValueError:
-        return False