Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify getsize to return total size, not just the top level #1815

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ Enhancements

Maintenance
~~~~~~~~~~~
* ``getsize`` now returns the total size of all nested arrays.
By :user:`Ben Jeffery <benjeffery>` :issue:`253`.

Deprecations
~~~~~~~~~~~~
Expand Down
70 changes: 37 additions & 33 deletions zarr/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from collections import OrderedDict
from collections.abc import MutableMapping
from functools import lru_cache
from os import scandir
from pickle import PicklingError
from threading import Lock, RLock
from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any
Expand Down Expand Up @@ -270,9 +269,15 @@ def _getsize(store: BaseStore, path: Path = None) -> int:
# also include zarr.json?
# members += ['zarr.json']
else:
members = listdir(store, path)
prefix = _path_to_prefix(path)
members = [prefix + k for k in members]
to_visit = [path]
members = []
while to_visit:
print(to_visit)
current_path = to_visit.pop()
current_members = listdir(store, current_path)
prefix = _path_to_prefix(current_path)
members.extend([prefix + k for k in current_members])
to_visit.extend([prefix + k for k in current_members])
for k in members:
try:
v = store[k]
Expand Down Expand Up @@ -976,8 +981,12 @@ def getsize(self, path: Path = None):
elif isinstance(value, self.cls):
# total size for directory
size = 0
for v in value.values():
if not isinstance(v, self.cls):
to_visit = list(value.values())
while to_visit:
v = to_visit.pop()
if isinstance(v, self.cls):
to_visit.extend(v.values())
else:
size += buffer_size(v)
return size

Expand Down Expand Up @@ -1274,9 +1283,13 @@ def getsize(self, path=None):
return os.path.getsize(fs_path)
elif os.path.isdir(fs_path):
size = 0
for child in scandir(fs_path):
if child.is_file():
size += child.stat().st_size
for root, _, files in os.walk(fs_path):
# Include the size of the directory itself, as this can be substantial
# for directories with many files.
size += os.path.getsize(root)
for file in files:
file_path = os.path.join(root, file)
size += os.path.getsize(file_path)
return size
else:
return 0
Expand Down Expand Up @@ -1921,29 +1934,19 @@ def listdir(self, path=None):
def getsize(self, path=None):
path = normalize_storage_path(path)
with self.mutex:
children = self.listdir(path)
if children:
size = 0
for child in children:
if path:
name = path + "/" + child
else:
name = child
try:
info = self.zf.getinfo(name)
except KeyError:
pass
else:
size += info.compress_size
return size
elif path:
to_visit = [path] if path else self.listdir(path)
total_size = 0
while to_visit:
current_path = to_visit.pop()
try:
info = self.zf.getinfo(path)
return info.compress_size
info = self.zf.getinfo(current_path)
total_size += info.compress_size
except KeyError:
return 0
else:
return 0
children = self.listdir(current_path)
for child in children:
full_path = current_path + "/" + child if current_path else child
to_visit.append(full_path)
return total_size

def clear(self):
if self.mode == "r":
Expand Down Expand Up @@ -2527,6 +2530,8 @@ def listdir(self, path: Path = None):
return listing

def getsize(self, path=None) -> int:
print("WYF")
print(self._store, path)
return getsize(self._store, path=path)

def _pop_value(self):
Expand Down Expand Up @@ -2795,10 +2800,9 @@ def getsize(self, path=None):
size = self.cursor.execute(
"""
SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr
WHERE k LIKE (? || "%") AND
0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/")
WHERE k LIKE (? || "%")
""",
(path, path),
(path,),
)
for (s,) in size:
return s
Expand Down
13 changes: 9 additions & 4 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import pickle
import shutil
import tempfile
from typing import Any, Literal, Optional, Tuple, Union, Sequence
import unittest
from itertools import zip_longest
Expand Down Expand Up @@ -100,6 +101,7 @@ class TestArray:
write_empty_chunks = True
read_only = False
storage_transformers: Tuple[Any, ...] = ()
group_size = 0

def create_store(self) -> BaseStore:
return KVStore(dict())
Expand Down Expand Up @@ -229,15 +231,15 @@ def test_nbytes_stored(self):
buffer_size(v) for k, v in z.store.items() if k != "zarr.json"
)
else:
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored
z[:] = 42
if self.version == 3:
expect_nbytes_stored = sum(
buffer_size(v) for k, v in z.store.items() if k != "zarr.json"
)
else:
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored

# mess with store
Expand Down Expand Up @@ -1677,6 +1679,8 @@ def test_nbytes_stored(self):


class TestArrayWithDirectoryStore(TestArray):
group_size = 4096

def create_store(self):
path = mkdtemp()
atexit.register(shutil.rmtree, path)
Expand All @@ -1686,10 +1690,10 @@ def create_store(self):
def test_nbytes_stored(self):
# dict as store
z = self.create_array(shape=1000, chunks=100)
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored
z[:] = 42
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
assert expect_nbytes_stored == z.nbytes_stored


Expand Down Expand Up @@ -2028,6 +2032,7 @@ def expected(self):

@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec")
class TestArrayWithN5FSStore(TestArrayWithN5Store):
group_size = 0
def create_store(self):
path = mkdtemp()
atexit.register(shutil.rmtree, path)
Expand Down
15 changes: 3 additions & 12 deletions zarr/tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,19 +366,10 @@ def test_hierarchy(self):

# test getsize (optional)
if hasattr(store, "getsize"):
# TODO: proper behavior of getsize?
# v3 returns size of all nested arrays, not just the
# size of the arrays in the current folder.
if self.version == 2:
assert 6 == store.getsize()
else:
assert 15 == store.getsize()
assert 15 == store.getsize()
assert 3 == store.getsize("a")
assert 3 == store.getsize("b")
if self.version == 2:
assert 3 == store.getsize("c")
else:
assert 9 == store.getsize("c")
assert 3 == store.getsize("c")
assert 3 == store.getsize("c/d")
assert 6 == store.getsize("c/e")
assert 3 == store.getsize("c/e/f")
Expand Down Expand Up @@ -2256,7 +2247,7 @@ def test_getsize():
store["foo"] = b"aaa"
store["bar"] = b"bbbb"
store["baz/quux"] = b"ccccc"
assert 7 == getsize(store)
assert 12 == getsize(store)
assert 5 == getsize(store, "baz")

store = KVStore(dict())
Expand Down
Loading