diff --git a/fsspec/asyn.py b/fsspec/asyn.py index 4ac1419f0..df941719b 100644 --- a/fsspec/asyn.py +++ b/fsspec/asyn.py @@ -15,7 +15,7 @@ from .exceptions import FSTimeoutError from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep from .spec import AbstractBufferedFile, AbstractFileSystem -from .utils import is_exception, other_paths +from .utils import glob_translate, is_exception, other_paths private = re.compile("_[^_]") iothread = [None] # dedicated fsspec IO thread @@ -735,8 +735,12 @@ async def _glob(self, path, maxdepth=None, **kwargs): import re - ends = path.endswith("/") + seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,) + ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_sep or path.endswith( + tuple(sep + "**" for sep in seps) + ) idx_star = path.find("*") if path.find("*") >= 0 else len(path) idx_qmark = path.find("?") if path.find("?") >= 0 else len(path) idx_brace = path.find("[") if path.find("[") >= 0 else len(path) @@ -775,46 +779,22 @@ async def _glob(self, path, maxdepth=None, **kwargs): allpaths = await self._find( root, maxdepth=depth, withdirs=True, detail=True, **kwargs ) - # Escape characters special to python regex, leaving our supported - # special characters in place. - # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html - # for shell globbing details. 
- pattern = ( - "^" - + ( - path.replace("\\", r"\\") - .replace(".", r"\.") - .replace("+", r"\+") - .replace("//", "/") - .replace("(", r"\(") - .replace(")", r"\)") - .replace("|", r"\|") - .replace("^", r"\^") - .replace("$", r"\$") - .replace("{", r"\{") - .replace("}", r"\}") - .rstrip("/") - .replace("?", ".") - ) - + "$" - ) - pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern) - pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern) - pattern = re.sub("[*]", "[^/]*", pattern) - pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern) - pattern = re.sub("=DOUBLE_STARS=", ".*", pattern) + + pattern = glob_translate(path + ("/" if ends_with_sep else "")) pattern = re.compile(pattern) + out = { - p: allpaths[p] - for p in sorted(allpaths) - if pattern.match(p.replace("//", "/").rstrip("/")) + p: info + for p, info in sorted(allpaths.items()) + if pattern.match( + ( + p + "/" + if append_slash_to_dirname and info["type"] == "directory" + else p + ) + ) } - # Return directories only when the glob end by a slash - # This is needed for posix glob compliance - if ends: - out = {k: v for k, v in out.items() if v["type"] == "directory"} - if detail: return out else: diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index 5b21f2605..cdd84c5ce 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -14,7 +14,13 @@ from fsspec.callbacks import _DEFAULT_CALLBACK from fsspec.exceptions import FSTimeoutError from fsspec.spec import AbstractBufferedFile -from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize +from fsspec.utils import ( + DEFAULT_BLOCK_SIZE, + glob_translate, + isfilelike, + nullcontext, + tokenize, +) from ..caching import AllBytes @@ -441,8 +447,9 @@ async def _glob(self, path, maxdepth=None, **kwargs): raise ValueError("maxdepth must be at least 1") import re - ends = path.endswith("/") + ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash 
path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_slash or path.endswith("/**") idx_star = path.find("*") if path.find("*") >= 0 else len(path) idx_brace = path.find("[") if path.find("[") >= 0 else len(path) @@ -480,45 +487,22 @@ async def _glob(self, path, maxdepth=None, **kwargs): allpaths = await self._find( root, maxdepth=depth, withdirs=True, detail=True, **kwargs ) - # Escape characters special to python regex, leaving our supported - # special characters in place. - # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html - # for shell globbing details. - pattern = ( - "^" - + ( - path.replace("\\", r"\\") - .replace(".", r"\.") - .replace("+", r"\+") - .replace("//", "/") - .replace("(", r"\(") - .replace(")", r"\)") - .replace("|", r"\|") - .replace("^", r"\^") - .replace("$", r"\$") - .replace("{", r"\{") - .replace("}", r"\}") - .rstrip("/") - ) - + "$" - ) - pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern) - pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern) - pattern = re.sub("[*]", "[^/]*", pattern) - pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern) - pattern = re.sub("=DOUBLE_STARS=", ".*", pattern) + + pattern = glob_translate(path + ("/" if ends_with_slash else "")) pattern = re.compile(pattern) + out = { - p: allpaths[p] - for p in sorted(allpaths) - if pattern.match(p.replace("//", "/").rstrip("/")) + p: info + for p, info in sorted(allpaths.items()) + if pattern.match( + ( + p + "/" + if append_slash_to_dirname and info["type"] == "directory" + else p + ) + ) } - # Return directories only when the glob end by a slash - # This is needed for posix glob compliance - if ends: - out = {k: v for k, v in out.items() if v["type"] == "directory"} - if detail: return out else: diff --git a/fsspec/spec.py b/fsspec/spec.py index 2af44f780..ef0908630 100644 --- a/fsspec/spec.py +++ b/fsspec/spec.py @@ -17,6 +17,7 @@ from .transaction import Transaction from .utils import ( 
_unstrip_protocol, + glob_translate, isfilelike, other_paths, read_block, @@ -551,10 +552,6 @@ def glob(self, path, maxdepth=None, **kwargs): The `maxdepth` option is applied on the first `**` found in the path. - Search path names that contain embedded characters special to this - implementation of glob may not produce expected results; - e.g., ``foo/bar/*starredfilename*``. - kwargs are passed to ``ls``. """ if maxdepth is not None and maxdepth < 1: @@ -562,8 +559,12 @@ def glob(self, path, maxdepth=None, **kwargs): import re - ends = path.endswith("/") + seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,) + ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash path = self._strip_protocol(path) + append_slash_to_dirname = ends_with_sep or path.endswith( + tuple(sep + "**" for sep in seps) + ) idx_star = path.find("*") if path.find("*") >= 0 else len(path) idx_qmark = path.find("?") if path.find("?") >= 0 else len(path) idx_brace = path.find("[") if path.find("[") >= 0 else len(path) @@ -600,47 +601,22 @@ def glob(self, path, maxdepth=None, **kwargs): depth = None allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs) - # Escape characters special to python regex, leaving our supported - # special characters in place. - # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html - # for shell globbing details. 
- pattern = ( - "^" - + ( - path.replace("\\", r"\\") - .replace(".", r"\.") - .replace("+", r"\+") - .replace("//", "/") - .replace("(", r"\(") - .replace(")", r"\)") - .replace("|", r"\|") - .replace("^", r"\^") - .replace("$", r"\$") - .replace("{", r"\{") - .replace("}", r"\}") - .rstrip("/") - .replace("?", ".") - ) - + "$" - ) - pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern) - pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern) - pattern = re.sub("[*]", "[^/]*", pattern) - pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern) - pattern = re.sub("=DOUBLE_STARS=", ".*", pattern) + + pattern = glob_translate(path + ("/" if ends_with_sep else "")) pattern = re.compile(pattern) out = { - p: allpaths[p] - for p in sorted(allpaths) - if pattern.match(p.replace("//", "/").rstrip("/")) + p: info + for p, info in sorted(allpaths.items()) + if pattern.match( + ( + p + "/" + if append_slash_to_dirname and info["type"] == "directory" + else p + ) + ) } - # Return directories only when the glob end by a slash - # This is needed for posix glob compliance - if ends: - out = {k: v for k, v in out.items() if v["type"] == "directory"} - if detail: return out else: diff --git a/fsspec/tests/abstract/common.py b/fsspec/tests/abstract/common.py index 93896a443..22e7c4140 100644 --- a/fsspec/tests/abstract/common.py +++ b/fsspec/tests/abstract/common.py @@ -107,9 +107,9 @@ "subdir1/subfile2", ], ), - ("**1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]), + ("**/*1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]), ( - "**1", + "**/*1", True, None, [ @@ -120,14 +120,14 @@ "subdir1/nesteddir/nestedfile", ], ), - ("**1", True, 1, ["file1"]), + ("**/*1", True, 1, ["file1"]), ( - "**1", + "**/*1", True, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1", "subdir1/subfile2"], ), - ("**1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]), + ("**/*1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]), 
("**/subdir0", False, None, []), ("**/subdir0", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]), ("**/subdir0/nested*", False, 2, []), diff --git a/fsspec/tests/test_spec.py b/fsspec/tests/test_spec.py index f76bbef76..b7d11b354 100644 --- a/fsspec/tests/test_spec.py +++ b/fsspec/tests/test_spec.py @@ -1157,172 +1157,3 @@ def test_glob_posix_rules(path, expected, glob_fs): detailed_output = glob_fs.glob(path=f"mock://{path}", detail=True) for name, info in _clean_paths(detailed_output).items(): assert info == glob_fs[name] - - -@pytest.mark.parametrize( - ("path", "maxdepth", "expected"), - [ - ( - "test1**", - None, - [ - "test1", - "test1.json", - "test1.yaml", - "test1/test0", - "test1/test0.json", - "test1/test0.yaml", - "test1/test0/test0.json", - "test1/test0/test0.yaml", - ], - ), - ("test1**/", None, ["test1", "test1/test0"]), - ( - "**.yaml", - None, - [ - "test0.yaml", - "test0/test0.yaml", - "test0/test1/test0.yaml", - "test0/test1/test2/test0.yaml", - "test0/test2/test0.yaml", - "test0/test2/test1/test0.yaml", - "test0/test2/test1/test3/test0.yaml", - "test1.yaml", - "test1/test0.yaml", - "test1/test0/test0.yaml", - ], - ), - ("**1/", None, ["test0/test1", "test0/test2/test1", "test1"]), - ( - "**1/*.yaml", - None, - [ - "test0/test1/test0.yaml", - "test0/test2/test1/test0.yaml", - "test1/test0.yaml", - ], - ), - ( - "test0**1**.yaml", - None, - [ - "test0/test1/test2/test0.yaml", - "test0/test1/test0.yaml", - "test0/test2/test1/test0.yaml", - "test0/test2/test1/test3/test0.yaml", - ], - ), - ( - "test0/t**.yaml", - None, - [ - "test0/test0.yaml", - "test0/test1/test0.yaml", - "test0/test1/test2/test0.yaml", - "test0/test2/test0.yaml", - "test0/test2/test1/test0.yaml", - "test0/test2/test1/test3/test0.yaml", - ], - ), - ("test0/t**1/", None, ["test0/test1", "test0/test2/test1"]), - ( - "test0/t**1/*.yaml", - None, - ["test0/test1/test0.yaml", "test0/test2/test1/test0.yaml"], - ), - ( - "test0/**", - 1, - [ - "test0", - "test0/test0.json", 
- "test0/test0.yaml", - "test0/test1", - "test0/test2", - ], - ), - ( - "test0/**", - 2, - [ - "test0", - "test0/test0.json", - "test0/test0.yaml", - "test0/test1", - "test0/test1/test0.json", - "test0/test1/test0.yaml", - "test0/test1/test2", - "test0/test2", - "test0/test2/test0.json", - "test0/test2/test0.yaml", - "test0/test2/test1", - ], - ), - ("test0/**/test1/*", 1, []), - ( - "test0/**/test1/*", - 2, - ["test0/test1/test0.json", "test0/test1/test0.yaml", "test0/test1/test2"], - ), - ("test0/**/test1/**", 1, ["test0/test1"]), - ( - "test0/**/test1/**", - 2, - [ - "test0/test1", - "test0/test1/test0.json", - "test0/test1/test0.yaml", - "test0/test1/test2", - "test0/test2/test1", - ], - ), - ( - "test0/test[1-2]/**", - 1, - [ - "test0/test1", - "test0/test1/test0.yaml", - "test0/test1/test0.json", - "test0/test1/test2", - "test0/test2", - "test0/test2/test0.json", - "test0/test2/test0.yaml", - "test0/test2/test1", - ], - ), - ( - "test0/test[1-2]/**", - 2, - [ - "test0/test1", - "test0/test1/test0.yaml", - "test0/test1/test0.json", - "test0/test1/test2", - "test0/test1/test2/test0.json", - "test0/test1/test2/test0.yaml", - "test0/test2", - "test0/test2/test0.json", - "test0/test2/test0.yaml", - "test0/test2/test1", - "test0/test2/test1/test0.yaml", - "test0/test2/test1/test0.json", - "test0/test2/test1/test3", - ], - ), - ], -) -def test_glob_non_posix_rules(path, maxdepth, expected, glob_fs): - output = glob_fs.glob(path=f"mock://{path}", maxdepth=maxdepth) - assert _clean_paths(output) == _clean_paths(expected) - - detailed_output = glob_fs.glob( - path=f"mock://{path}", maxdepth=maxdepth, detail=True - ) - for name, info in _clean_paths(detailed_output).items(): - assert info == glob_fs[name] - - -def test_glob_with_wrong_args(glob_fs): - with pytest.raises(ValueError): - _ = glob_fs.glob(path="mock://test0/*", maxdepth=0) diff --git a/fsspec/utils.py b/fsspec/utils.py index 9e52ffe51..432292967 100644 --- a/fsspec/utils.py +++ b/fsspec/utils.py @@ -573,3 
def _translate(pat, STAR, QUESTION_MARK):
    """Translate one glob path component into a list of regex fragments.

    Parameters
    ----------
    pat: str
        A single pattern component. It is scanned character by character;
        ``*``, ``?`` and ``[...]`` are treated as wildcards, every other
        character is escaped with ``re.escape``.
    STAR: str
        Regex fragment emitted for a run of one or more consecutive ``*``.
    QUESTION_MARK: str
        Regex fragment emitted for each ``?``.

    Returns
    -------
    list of str
        Regex fragments, in order; the caller joins them.
    """
    # Copied from: https://github.com/python/cpython/pull/106703.
    res = []
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == "*":
            # compress consecutive `*` into one
            # (identity check: only the STAR object appended just before counts)
            if (not res) or res[-1] is not STAR:
                add(STAR)
        elif c == "?":
            add(QUESTION_MARK)
        elif c == "[":
            # Find the closing "]". A leading "!" (negation) and a "]" placed
            # first inside the brackets are skipped over as set content.
            j = i
            if j < n and pat[j] == "!":
                j = j + 1
            if j < n and pat[j] == "]":
                j = j + 1
            while j < n and pat[j] != "]":
                j = j + 1
            if j >= n:
                # No closing bracket: treat "[" as a literal character.
                add("\\[")
            else:
                stuff = pat[i:j]
                if "-" not in stuff:
                    stuff = stuff.replace("\\", r"\\")
                else:
                    # Split the set content at the hyphens that form ranges.
                    chunks = []
                    k = i + 2 if pat[i] == "!" else i + 1
                    while True:
                        k = pat.find("-", k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        chunks[-1] += "-"
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = "-".join(
                        s.replace("\\", r"\\").replace("-", r"\-") for s in chunks
                    )
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r"([&~|])", r"\\\1", stuff)
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add("(?!)")
                elif stuff == "!":
                    # Negated empty range: match any character.
                    add(".")
                else:
                    if stuff[0] == "!":
                        stuff = "^" + stuff[1:]
                    elif stuff[0] in ("^", "["):
                        stuff = "\\" + stuff
                    add(f"[{stuff}]")
        else:
            add(re.escape(c))
    # Internal invariant: the scan must have consumed the whole component.
    assert i == n
    return res


def glob_translate(pat, *, seps=None):
    """Translate a pathname with shell wildcards to a regular expression.

    A component that is exactly ``*`` matches one non-empty path segment and
    a component that is exactly ``**`` matches any number of segments. Other
    components are translated by ``_translate``, where ``*`` and ``?`` never
    match a separator.

    Parameters
    ----------
    pat: str
        The glob pattern.
    seps: str or sequence of str, optional
        Characters treated as path separators. When omitted (or empty),
        ``os.path.sep`` and, if defined, ``os.path.altsep`` are used.

    Returns
    -------
    str
        Regular expression source of the form ``(?s:...)\\Z``.

    Raises
    ------
    ValueError
        If ``**`` appears inside a component together with other characters.
    """
    # Copied from: https://github.com/python/cpython/pull/106703.
    # The keyword parameters' values are fixed to:
    # recursive=True, include_hidden=True
    if not seps:
        if os.path.altsep:
            seps = (os.path.sep, os.path.altsep)
        else:
            seps = os.path.sep
    escaped_seps = "".join(map(re.escape, seps))
    # A character class is only needed when there is more than one separator.
    any_sep = f"[{escaped_seps}]" if len(seps) > 1 else escaped_seps
    not_sep = f"[^{escaped_seps}]"
    one_last_segment = f"{not_sep}+"
    one_segment = f"{one_last_segment}{any_sep}"
    any_segments = f"(?:.+{any_sep})?"
    any_last_segments = ".*"
    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        if part == "*":
            # The non-final form already carries its trailing separator.
            results.append(one_segment if idx < last_part_idx else one_last_segment)
            continue
        if part == "**":
            results.append(any_segments if idx < last_part_idx else any_last_segments)
            continue
        elif "**" in part:
            raise ValueError(
                "Invalid pattern: '**' can only be an entire path component"
            )
        if part:
            results.extend(_translate(part, f"{not_sep}*", not_sep))
        if idx < last_part_idx:
            results.append(any_sep)
    res = "".join(results)
    return rf"(?s:{res})\Z"