diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1a2e16ba..8c9f7457 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Checkout source @@ -49,8 +49,6 @@ jobs: name: lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3.1.0 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 - with: - python-version: "3.9" - - uses: pre-commit/action@v3.0.0 + - uses: pre-commit/action@v2.0.0 diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 00000000..1fff95db --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +known_third_party = aiohttp,click,decorator,fsspec,fuse,google,google_auth_oauthlib,pytest,requests,setuptools diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e4f13395..486b2dcf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,15 +18,11 @@ repos: rev: 6.0.0 hooks: - id: flake8 - files: gcsfs/ - - repo: https://github.com/pycqa/isort - rev: 5.10.1 + - repo: https://github.com/asottile/seed-isort-config + rev: v2.2.0 hooks: - - id: isort - args: ["--profile", "black", "--filter-files"] - - repo: https://github.com/asottile/pyupgrade - rev: v3.2.3 + - id: seed-isort-config + - repo: https://github.com/pre-commit/mirrors-isort + rev: v5.7.0 hooks: - - id: pyupgrade - args: - - --py37-plus + - id: isort diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..f44c84ca --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,18 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: miniconda3-4.7 + +conda: + environment: docs/environment.yml + +python: + install: + - method: pip + path: . + +sphinx: + configuration: docs/source/conf.py + fail_on_warning: true diff --git a/docs/environment.yml b/docs/environment.yml new file mode 100644 index 00000000..1cfba7b5 --- /dev/null +++ b/docs/environment.yml @@ -0,0 +1,8 @@ +name: gcsfs +channels: + - defaults +dependencies: + - python=3.9 + - docutils<0.17 + - sphinx + - sphinx_rtd_theme diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index a7b14f97..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -docutils<0.18 -numpydoc diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css new file mode 100644 index 00000000..16fffc59 --- /dev/null +++ b/docs/source/_static/custom.css @@ -0,0 +1,5 @@ +.classifier:before { + font-style: normal; + margin: 0.5em; + content: ":"; +} diff --git a/docs/source/api.rst b/docs/source/api.rst index 184182a9..f9e62707 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -38,8 +38,10 @@ API .. autoclass:: GCSFileSystem :members: + :inherited-members: .. autoclass:: GCSFile :members: + :inherited-members: .. 
currentmodule:: gcsfs.mapping diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index d68c7d6e..074b3ae8 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,56 @@ Changelog ========= +2023.9.0 +-------- + +* bulk operations edge cases (#576, 572) +* inventory report based file listing (#573) +* pickle HttpError (#571) +* avoid warnings (#569) +* maxdepth in find() (#566) +* invalidate dircache (#564) +* standard metadata field names (#563) +* performance of building cache in find() (#561) + + +2023.6.0 +-------- + +* allow raw/session token for auth (#554) +* fix listings_expiry_time kwargs (#551) +* allow setting fixed metadata on put/pipe (#550) + +2023.5.0 +-------- + +* Allow emulator host without protocol (#548) +* Prevent upload retry from closing the file being sent (#540) + +2023.4.0 +-------- + +No changes + +2023.3.0 +-------- + +* Don't let find() mess up dircache (#531) +* Drop py3.7 (#529) +* Update docs (#528) +* Make times UTC (#527) +* Use BytesIO for large bodies (#525) +* Fix: Don't append generation when it is absent (#523) +* get/put/cp consistency tests (#521) + +2023.1.0 +-------- + +* Support create time (#516, 518) +* defer async session creation (#513, 514) +* support listing of file versions (#509) +* fix ``sign`` following versioned split protocol (#513) + 2022.11.0 --------- diff --git a/docs/source/conf.py b/docs/source/conf.py index fc8a2d34..90ca12d2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,9 +12,6 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import os -import sys - # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -35,7 +32,7 @@ "sphinx.ext.viewcode", "sphinx.ext.autosummary", "sphinx.ext.extlinks", - "numpydoc", + "sphinx.ext.napoleon", ] # Add any paths that contain templates here, relative to this directory. @@ -68,13 +65,6 @@ # The full version, including alpha/beta/rc tags. release = version -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' @@ -115,15 +105,7 @@ # -- Options for HTML output ---------------------------------------------- -# Taken from docs.readthedocs.io: -# on_rtd is whether we are on readthedocs.io -on_rtd = os.getenv("READTHEDOCS", None) == "True" - -if not on_rtd: # only import and set the theme if we're building docs locally - import sphinx_rtd_theme - - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -154,6 +136,10 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +# Custom CSS file to override read the docs default CSS. 
+# Contains workaround for RTD not rendering colon between argument name and type +html_css_files = ["custom.css"] + # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. @@ -297,4 +283,4 @@ # If true, do not generate a @detailmenu in the "Top" node's menu. # texinfo_no_detailmenu = False -extlinks = {"pr": ("https://github.com/fsspec/gcsfs/pull/%s", "PR #")} +extlinks = {"pr": ("https://github.com/fsspec/gcsfs/pull/%s", "PR #%s")} diff --git a/docs/source/fuse.rst b/docs/source/fuse.rst index 8ecc475e..d1601ad7 100644 --- a/docs/source/fuse.rst +++ b/docs/source/fuse.rst @@ -1,7 +1,7 @@ GCSFS and FUSE ============== -Warning, this functionality is **experimental** +Warning, this functionality is **experimental**. FUSE_ is a mechanism to mount user-level filesystems in unix-like systems (linux, osx, etc.). GCSFS is able to use FUSE to present remote diff --git a/docs/source/index.rst b/docs/source/index.rst index bc65eb64..e48fa11b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,7 +10,7 @@ Please file issues and requests on github_ and we welcome pull requests. .. _github: https://github.com/fsspec/gcsfs/issues -This package depends on fsspec_ , and inherits many useful behaviours from there, +This package depends on fsspec_, and inherits many useful behaviours from there, including integration with Dask, and the facility for key-value dict-like objects of the type used by zarr. @@ -19,12 +19,16 @@ objects of the type used by zarr. Installation ------------ -The GCSFS library can be installed using ``conda`` or ``pip``: +The GCSFS library can be installed using ``conda``: .. code-block:: bash conda install -c conda-forge gcsfs - or + +or ``pip``: + +.. code-block:: bash + pip install gcsfs or by cloning the repository: @@ -50,7 +54,7 @@ Locate and read a file: ... print(f.read()) b'Hello, world' -(see also ``walk`` and ``glob``) +(see also :meth:`~gcsfs.core.GCSFileSystem.walk` and :meth:`~gcsfs.core.GCSFileSystem.glob`) Read with delimited blocks: @@ -128,7 +132,7 @@ to ``GCSFileSystem``, for example: storage_options={"token": "anon"}) This gives the chance to pass any credentials or other necessary -arguments needed to s3fs. +arguments needed to gcsfs. Async ----- @@ -146,14 +150,11 @@ await the client creation before making any GCS call. .. code-block:: python - loop = ... # however you create your loop - - async def run_program(loop): - gcs = GCSFileSystem(..., asynchronous=True, loop=loop) - await gcs.set_session() - ... # perform work + async def run_program(): + gcs = GCSFileSystem(asynchronous=True) + print(await gcs._ls("")) - asyncio.run(run_program(loop)) # or call from your async code + asyncio.run(run_program()) # or call from your async code Concurrent async operations are also used internally for bulk operations such as ``pipe/cat``, ``get/put``, ``cp/mv/rm``. The async calls are @@ -162,6 +163,10 @@ from normal code. If you are *not* using async-style programming, you do not need to know about how this works, but you might find the implementation interesting. +For every synchronous method there is an asynchronous one prefixed with ``_``, +but the ``open`` operation does not support async operation. If you need to +open a file asynchronously, it is better to download it asynchronously to a +temporary location and work with it from there. 
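+
+A minimal sketch of this pattern (``fetch_bytes`` and the bucket/object path
+below are illustrative only):
+
+.. code-block:: python
+
+    import asyncio
+    import os
+    import tempfile
+
+    from gcsfs import GCSFileSystem
+
+    async def fetch_bytes(gcs, remote_path):
+        # ``open`` has no async variant, so download the object to a local
+        # temporary file (via the async counterpart of ``get_file``) and
+        # then read it synchronously.
+        with tempfile.TemporaryDirectory() as tmp:
+            local_path = os.path.join(tmp, "blob")
+            await gcs._get_file(remote_path, local_path)
+            with open(local_path, "rb") as f:
+                return f.read()
+
+    async def main():
+        gcs = GCSFileSystem(asynchronous=True)
+        print(await fetch_bytes(gcs, "my-bucket/path/to/file"))
+
+    asyncio.run(main())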
Proxy ----- @@ -176,7 +181,7 @@ proxy settings from the environment provide ``session_kwargs`` as follows: For further reference check `aiohttp proxy support`_. -.. _aiohttp proxy support: https://docs.aiohttp.org/en/stable/client_advanced.html?highlight=proxy#proxy-support +.. _aiohttp proxy support: https://docs.aiohttp.org/en/stable/client_advanced.html#proxy-support Contents diff --git a/gcsfs/__init__.py b/gcsfs/__init__.py index 18517987..fffbca44 100644 --- a/gcsfs/__init__.py +++ b/gcsfs/__init__.py @@ -6,3 +6,7 @@ from .mapping import GCSMap __all__ = ["GCSFileSystem", "GCSMap"] + +from . import _version + +__version__ = _version.get_versions()["version"] diff --git a/gcsfs/_version.py b/gcsfs/_version.py index 7475623c..ac7ae81f 100644 --- a/gcsfs/_version.py +++ b/gcsfs/_version.py @@ -4,25 +4,28 @@ # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) +# This file is released into the public domain. +# Generated by versioneer-0.29 +# https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" import errno +import functools import os import re import subprocess import sys +from typing import Any, Callable, Dict, List, Optional, Tuple -def get_keywords(): +def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). - git_refnames = "$Format:%(describe:exclude=HEAD|main:tags=true)$" + git_refnames = "$Format:%d$" git_full = "$Format:%H$" git_date = "$Format:%ci$" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} @@ -32,8 +35,15 @@ def get_keywords(): class VersioneerConfig: """Container for Versioneer configuration parameters.""" + VCS: str + style: str + tag_prefix: str + parentdir_prefix: str + versionfile_source: str + verbose: bool -def get_config(): + +def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py @@ -51,14 +61,14 @@ class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" -LONG_VERSION_PY = {} -HANDLERS = {} +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator + """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} @@ -68,24 +78,39 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) - p = None - for c in commands: + process = None + + 
popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: try: - dispcmd = str([c] + args) + dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen( - [c] + args, + process = subprocess.Popen( + [command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, ) break - except OSError: - e = sys.exc_info()[1] + except OSError as e: if e.errno == errno.ENOENT: continue if verbose: @@ -94,20 +119,22 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= return None, None else: if verbose: - print(f"unable to find command, tried {commands}") + print("unable to find command, tried %s" % (commands,)) return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode + return None, process.returncode + return stdout, process.returncode -def versions_from_parentdir(parentdir_prefix, root, verbose): +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both @@ -116,7 +143,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): """ rootdirs = [] - for i in range(3): + for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return { @@ -126,9 +153,8 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): "error": None, "date": None, } - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level + rootdirs.append(root) + root = os.path.dirname(root) # up a level if verbose: print( @@ -139,41 +165,48 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): @register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
- keywords = {} + keywords: Dict[str, str] = {} try: - f = open(versionfile_abs) - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because @@ -198,7 +231,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "". + # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) @@ -208,6 +241,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue if verbose: print("picking %s" % r) return { @@ -230,7 +268,9 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): +def git_pieces_from_vcs( + tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command +) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* @@ -241,7 +281,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + # GIT_DIR can interfere with correct operation of Versioneer. 
+ # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -249,7 +296,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command( + describe_out, rc = runner( GITS, [ "describe", @@ -258,7 +305,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): "--always", "--long", "--match", - "%s*" % tag_prefix, + f"{tag_prefix}[[:digit:]]*", ], cwd=root, ) @@ -266,16 +313,48 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() - pieces = {} + pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out @@ -292,7 +371,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: - # unparseable. Maybe git-describe is misbehaving? + # unparsable. Maybe git-describe is misbehaving? 
pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces @@ -302,7 +381,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( full_tag, tag_prefix, ) @@ -318,26 +397,27 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" -def render_pep440(pieces): +def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you @@ -361,23 +441,70 @@ def render_pep440(pieces): return rendered -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. +def render_pep440_branch(pieces: Dict[str, Any]) -> str: + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). Exceptions: - 1: no tags. 0.post.devDISTANCE + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces: Dict[str, Any]) -> str: + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 
0.post0.devDISTANCE + """ + if pieces["closest-tag"]: if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] else: # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] + rendered = "0.post0.dev%d" % pieces["distance"] return rendered -def render_pep440_post(pieces): +def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards @@ -404,12 +531,41 @@ def render_pep440_post(pieces): return rendered -def render_pep440_old(pieces): +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: @@ -426,7 +582,7 @@ def render_pep440_old(pieces): return rendered -def render_git_describe(pieces): +def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. @@ -446,7 +602,7 @@ def render_git_describe(pieces): return rendered -def render_git_describe_long(pieces): +def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. @@ -466,7 +622,7 @@ def render_git_describe_long(pieces): return rendered -def render(pieces, style): +def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return { @@ -482,10 +638,14 @@ def render(pieces, style): if style == "pep440": rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": @@ -504,7 +664,7 @@ def render(pieces, style): } -def get_versions(): +def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. 
Some @@ -524,7 +684,7 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. - for i in cfg.versionfile_source.split("/"): + for _ in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: return { diff --git a/gcsfs/core.py b/gcsfs/core.py index 390a3b7b..e6a11331 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -10,6 +10,7 @@ import re import warnings import weakref +from datetime import datetime from urllib.parse import parse_qs from urllib.parse import quote as quote_urllib from urllib.parse import urlsplit @@ -23,6 +24,7 @@ from . import __version__ as version from .checkers import get_consistency_checker from .credentials import GoogleCredentials +from .inventory_report import InventoryReport from .retry import retry_request, validate_response logger = logging.getLogger("gcsfs") @@ -95,6 +97,14 @@ async def _req_to_text(r): return (await r.read()).decode() +class UnclosableBytesIO(io.BytesIO): + """Prevent closing BytesIO to avoid errors during retries.""" + + def close(self): + """Reset stream position for next retry.""" + self.seek(0) + + def _location(): """ Resolves GCS HTTP location as http[s]://host @@ -106,9 +116,13 @@ def _location(): valid http location """ _emulator_location = os.getenv("STORAGE_EMULATOR_HOST", None) - return ( - _emulator_location if _emulator_location else "https://storage.googleapis.com" - ) + if _emulator_location: + if not any( + _emulator_location.startswith(scheme) for scheme in ("http://", "https://") + ): + _emulator_location = f"http://{_emulator_location}" + return _emulator_location + return "https://storage.googleapis.com" def _chunks(lst, n): @@ -148,7 +162,7 @@ class GCSFileSystem(AsyncFileSystem): metadata service, anonymous. - ``token='google_default'``, your default gcloud credentials will be used, which are typically established by doing ``gcloud login`` in a terminal. - - ``token=='cache'``, credentials from previously successful gcsfs + - ``token='cache'``, credentials from previously successful gcsfs authentication will be used (use this after "browser" auth succeeded) - ``token='anon'``, no authentication is performed, and you can only access data which is accessible to allUsers (in this case, the project and @@ -165,10 +179,10 @@ class GCSFileSystem(AsyncFileSystem): or a Credentials object. gcloud typically stores its tokens in locations such as ``~/.config/gcloud/application_default_credentials.json``, - `` ~/.config/gcloud/credentials``, or + ``~/.config/gcloud/credentials``, or ``~\AppData\Roaming\gcloud\credentials``, etc. - Specific methods, (eg. `ls`, `info`, ...) may return object details from GCS. + Specific methods, (eg. ``ls``, ``info``, ...) may return object details from GCS. These detailed listings include the [object resource](https://cloud.google.com/storage/docs/json_api/v1/objects#resource) @@ -198,8 +212,8 @@ class GCSFileSystem(AsyncFileSystem): created via other processes *will not* be visible to the GCSFileSystem until the cache refreshed. Calls to GCSFileSystem.open and calls to GCSFile are not effected by this cache. - In the default case the cache is never expired. This may be controlled via the `cache_timeout` - GCSFileSystem parameter or via explicit calls to `GCSFileSystem.invalidate_cache`. + In the default case the cache is never expired. 
This may be controlled via the ``cache_timeout`` + GCSFileSystem parameter or via explicit calls to ``GCSFileSystem.invalidate_cache``. Parameters ---------- @@ -224,11 +238,11 @@ class GCSFileSystem(AsyncFileSystem): secure_serialize: bool (deprecated) requester_pays : bool, or str default False Whether to use requester-pays requests. This will include your - project ID `project` in requests as the `userPorject`, and you'll be + project ID `project` in requests as the `userProject`, and you'll be billed for accessing data from requester-pays buckets. Optionally, pass a project-id here as a string to use that as the `userProject`. session_kwargs: dict - passed on to aiohttp.ClientSession; can contain, for example, + passed on to ``aiohttp.ClientSession``; can contain, for example, proxy settings. endpoint_url: str If given, use this URL (format protocol://host:port , *without* any @@ -271,9 +285,10 @@ def __init__( version_aware=False, **kwargs, ): + if cache_timeout: + kwargs["listings_expiry_time"] = cache_timeout super().__init__( self, - listings_expiry_time=cache_timeout, asynchronous=asynchronous, loop=loop, **kwargs, @@ -303,12 +318,6 @@ def __init__( self.credentials = GoogleCredentials(project, access, token) - if not self.asynchronous: - self._session = sync( - self.loop, get_client, timeout=self.timeout, **self.session_kwargs - ) - weakref.finalize(self, self.close_session, self.loop, self._session) - @property def _location(self): return self._endpoint or _location() @@ -325,6 +334,12 @@ def project(self): def close_session(loop, session): if loop is not None and session is not None: if loop.is_running(): + try: + loop = asyncio.get_event_loop() + loop.create_task(session.close()) + return + except RuntimeError: + pass try: sync(loop, session.close, timeout=0.1) except fsspec.FSTimeoutError: @@ -335,6 +350,7 @@ def close_session(loop, session): async def _set_session(self): if self._session is None: self._session = await get_client(**self.session_kwargs) + weakref.finalize(self, self.close_session, self.loop, self._session) return self._session @property @@ -406,7 +422,6 @@ async def _request( data=data, timeout=self.requests_timeout, ) as r: - status = r.status headers = r.headers info = r.request_info # for debug only @@ -451,6 +466,12 @@ def _process_object(self, bucket, object_metadata): result["size"] = int(object_metadata.get("size", 0)) result["name"] = posixpath.join(bucket, object_metadata["name"]) result["type"] = "file" + # Translate time metadata from GCS names to fsspec standard names. + # TODO(issues/559): Remove legacy names `updated` and `timeCreated`? 
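+ # Both fields are RFC 3339 timestamps ("...Z"); _parse_timestamp converts + # them to timezone-aware UTC datetimes, exposed as "mtime" and "ctime".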
+ if "updated" in object_metadata: + result["mtime"] = self._parse_timestamp(object_metadata["updated"]) + if "timeCreated" in object_metadata: + result["ctime"] = self._parse_timestamp(object_metadata["timeCreated"]) if "generation" in object_metadata or "metageneration" in object_metadata: result["generation"] = object_metadata.get("generation") result["metageneration"] = object_metadata.get("metageneration") @@ -517,25 +538,31 @@ async def _get_object(self, path): raise FileNotFoundError(path) return self._process_object(bucket, res) - async def _list_objects(self, path, prefix="", versions=False): + async def _list_objects(self, path, prefix="", versions=False, **kwargs): bucket, key, generation = self.split_path(path) path = path.rstrip("/") - try: - clisting = self._ls_from_cache(path) - hassubdirs = clisting and any( - c["name"].rstrip("/") == path and c["type"] == "directory" - for c in clisting - ) - if clisting and not hassubdirs: - return clisting - except FileNotFoundError: - # not finding a bucket in list of "my" buckets is OK - if key: - raise + # NOTE: the inventory report logic is experimental. + inventory_report_info = kwargs.get("inventory_report_info", None) + + # Only attempt to list from the cache when the user does not use + # the inventory report service. + if not inventory_report_info: + try: + clisting = self._ls_from_cache(path) + hassubdirs = clisting and any( + c["name"].rstrip("/") == path and c["type"] == "directory" + for c in clisting + ) + if clisting and not hassubdirs: + return clisting + except FileNotFoundError: + # not finding a bucket in list of "my" buckets is OK + if key: + raise items, prefixes = await self._do_list_objects( - path, prefix=prefix, versions=versions + path, prefix=prefix, versions=versions, **kwargs ) pseudodirs = [ @@ -554,28 +581,166 @@ async def _list_objects(self, path, prefix="", versions=False): else: return [] out = pseudodirs + items - # Don't cache prefixed/partial listings - if not prefix: + + use_snapshot_listing = inventory_report_info and inventory_report_info.get( + "use_snapshot_listing" + ) + + # Don't cache prefixed/partial listings, in addition to + # not using the inventory report service to do listing directly. + if not prefix and use_snapshot_listing is False: self.dircache[path] = out return out async def _do_list_objects( - self, path, max_results=None, delimiter="/", prefix="", versions=False + self, path, max_results=None, delimiter="/", prefix="", versions=False, **kwargs ): """Object listing for the given {bucket}/{prefix}/ path.""" bucket, _path, generation = self.split_path(path) _path = "" if not _path else _path.rstrip("/") + "/" prefix = f"{_path}{prefix}" or None + # Page size of 5000 is officially supported across GCS. + default_page_size = 5000 + + # NOTE: the inventory report logic is experimental. + inventory_report_info = kwargs.get("inventory_report_info", None) + + # Check if the user has configured inventory report option. + if inventory_report_info is not None: + items, prefixes = await InventoryReport.fetch_snapshot( + gcs_file_system=self, + inventory_report_info=inventory_report_info, + prefix=prefix, + ) + + use_snapshot_listing = inventory_report_info.get("use_snapshot_listing") + + # If the user wants to rely on the snapshot from the inventory report + # for listing, directly return the results. + if use_snapshot_listing: + return items, prefixes + + # Otherwise, use the snapshot to initiate concurrent listing. 
+ return await self._concurrent_list_objects_helper( + items=items, + bucket=bucket, + delimiter=delimiter, + prefix=prefix, + versions=versions, + generation=generation, + page_size=default_page_size, + ) + + # If the user has not configured an inventory report, proceed to use + # sequential listing. + else: + return await self._sequential_list_objects_helper( + bucket=bucket, + delimiter=delimiter, + start_offset=None, + end_offset=None, + prefix=prefix, + versions=versions, + generation=generation, + page_size=default_page_size, + ) + + async def _concurrent_list_objects_helper( + self, items, bucket, delimiter, prefix, versions, generation, page_size + ): + """ + Lists objects using coroutines, using the object names from the inventory + report to split up the ranges. + """ + + # Extract out the names of the objects fetched from the inventory report. + snapshot_object_names = sorted([item["name"] for item in items]) + + # Determine the number of coroutines needed for concurrent listing. + # Ideally, we want each coroutine to fetch a single page of objects. + num_coroutines = len(snapshot_object_names) // page_size + 1 + num_objects_per_coroutine = len(snapshot_object_names) // num_coroutines + + start_offsets = [] + end_offsets = [] + + # Calculate the split points for each coroutine (start offset and end offset). + for i in range(num_coroutines): + range_start = i * num_objects_per_coroutine + if i == num_coroutines - 1: + range_end = len(snapshot_object_names) + else: + range_end = range_start + num_objects_per_coroutine + + if range_start == 0: + prefix_start = None + else: + prefix_start = snapshot_object_names[range_start] + + if range_end == len(snapshot_object_names): + prefix_end = None + else: + prefix_end = snapshot_object_names[range_end] + + start_offsets.append(prefix_start) + end_offsets.append(prefix_end) + + # Launch the coroutines all at once, and wait for them to finish listing. + results = await asyncio.gather( + *[ + self._sequential_list_objects_helper( + bucket=bucket, + delimiter=delimiter, + start_offset=start_offsets[i], + end_offset=end_offsets[i], + prefix=prefix, + versions=versions, + generation=generation, + page_size=page_size, + ) + for i in range(0, len(start_offsets)) + ] + ) + + items = [] + prefixes = [] + + # Concatenate the items and prefixes from each coroutine for final results. + for i in range(len(results)): + items_from_process, prefixes_from_process = results[i] + items.extend(items_from_process) + prefixes.extend(prefixes_from_process) + + return items, prefixes + + async def _sequential_list_objects_helper( + self, + bucket, + delimiter, + start_offset, + end_offset, + prefix, + versions, + generation, + page_size, + ): + """ + Sequentially list objects within the start and end offset range. 
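+ + Returns a tuple of (items, prefixes) in the same form as + ``_do_list_objects``.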
+ """ + prefixes = [] items = [] + page = await self._call( "GET", "b/{}/o", bucket, delimiter=delimiter, prefix=prefix, - maxResults=max_results, + startOffset=start_offset, + endOffset=end_offset, + maxResults=page_size, json_out=True, versions="true" if versions or generation else None, ) @@ -591,7 +756,9 @@ async def _do_list_objects( bucket, delimiter=delimiter, prefix=prefix, - maxResults=max_results, + startOffset=start_offset, + endOffset=end_offset, + maxResults=page_size, pageToken=next_page_token, json_out=True, versions="true" if generation else None, @@ -603,6 +770,7 @@ async def _do_list_objects( next_page_token = page.get("nextPageToken", None) items = [self._process_object(bucket, i) for i in items] + return items, prefixes async def _list_buckets(self): @@ -741,6 +909,18 @@ async def _rmdir(self, bucket): rmdir = sync_wrapper(_rmdir) + def modified(self, path): + return self.info(path)["mtime"] + + def created(self, path): + return self.info(path)["ctime"] + + def _parse_timestamp(self, timestamp): + assert timestamp.endswith("Z") + timestamp = timestamp[:-1] + timestamp = timestamp + "0" * (6 - len(timestamp.rsplit(".", 1)[1])) + return datetime.fromisoformat(timestamp + "+00:00") + async def _info(self, path, generation=None, **kwargs): """File information about this path.""" path = self._strip_protocol(path) @@ -803,38 +983,25 @@ async def _info(self, path, generation=None, **kwargs): else: raise FileNotFoundError(path) - async def _glob(self, path, prefix="", **kwargs): - if not prefix: - # Identify pattern prefixes. Ripped from fsspec.spec.AbstractFileSystem.glob and matches - # the glob.has_magic patterns. - indstar = path.find("*") if path.find("*") >= 0 else len(path) - indques = path.find("?") if path.find("?") >= 0 else len(path) - indbrace = path.find("[") if path.find("[") >= 0 else len(path) - - ind = min(indstar, indques, indbrace) - prefix = path[:ind].split("/")[-1] - return await super()._glob(path, prefix=prefix, **kwargs) - - async def _ls(self, path, detail=False, prefix="", versions=False, **kwargs): + async def _ls( + self, path, detail=False, prefix="", versions=False, refresh=False, **kwargs + ): """List objects under the given '/{bucket}/{prefix} path.""" path = self._strip_protocol(path).rstrip("/") + if refresh: + self.invalidate_cache(path) if path in ["/", ""]: out = await self._list_buckets() else: out = [] for entry in await self._list_objects( - path, prefix=prefix, versions=versions + path, prefix=prefix, versions=versions, **kwargs ): - if versions: - out.append( - { - **entry, - "name": f"{entry['name']}#{entry['generation']}", - } - ) - else: - out.append(entry) + if versions and "generation" in entry: + entry = entry.copy() + entry["name"] = f"{entry['name']}#{entry['generation']}" + out.append(entry) if detail: return out @@ -884,7 +1051,7 @@ async def _setxattrs( fake-gcs-server:latest does not seem to support this. 
Parameters - --------- + ---------- content_type: str If not None, set the content-type to this value content_encoding: str @@ -898,6 +1065,7 @@ async def _setxattrs( - content_encoding - content_language - custom_time + More info: https://cloud.google.com/storage/docs/metadata#mutable kw_args: key-value pairs like field="value" or field=None @@ -984,6 +1152,7 @@ async def _cp_file(self, path1, path2, acl=None, **kwargs): json_out=True, sourceGeneration=g1, ) + self.invalidate_cache(self._parent(path2)) async def _rm_file(self, path, **kwargs): bucket, key, generation = self.split_path(path) @@ -1090,6 +1259,7 @@ async def _pipe_file( metadata=None, consistency=None, content_type="application/octet-stream", + fixed_key_metadata=None, chunksize=50 * 2**20, ): # enforce blocksize should be a multiple of 2**18 @@ -1099,10 +1269,24 @@ async def _pipe_file( out = None if size < 5 * 2**20: location = await simple_upload( - self, bucket, key, data, metadata, consistency, content_type + self, + bucket, + key, + data, + metadata, + consistency, + content_type, + fixed_key_metadata=fixed_key_metadata, ) else: - location = await initiate_upload(self, bucket, key, content_type, metadata) + location = await initiate_upload( + self, + bucket, + key, + content_type, + metadata, + fixed_key_metadata=fixed_key_metadata, + ) for offset in range(0, len(data), chunksize): bit = data[offset : offset + chunksize] out = await upload_chunk( @@ -1125,6 +1309,7 @@ async def _put_file( content_type="application/octet-stream", chunksize=50 * 2**20, callback=None, + fixed_key_metadata=None, **kwargs, ): # enforce blocksize should be a multiple of 2**18 @@ -1150,12 +1335,18 @@ async def _put_file( consistency=consistency, metadatain=metadata, content_type=content_type, + fixed_key_metadata=fixed_key_metadata, ) callback.absolute_update(size) else: location = await initiate_upload( - self, bucket, key, content_type, metadata + self, + bucket, + key, + content_type, + metadata=metadata, + fixed_key_metadata=fixed_key_metadata, ) offset = 0 while True: @@ -1180,21 +1371,37 @@ async def _isdir(self, path): return False async def _find( - self, path, withdirs=False, detail=False, prefix="", versions=False, **kwargs + self, + path, + withdirs=False, + detail=False, + prefix="", + versions=False, + maxdepth=None, + **kwargs, ): path = self._strip_protocol(path) - bucket, key, generation = self.split_path(path) - if prefix: - _path = "" if not key else key.rstrip("/") + "/" - _prefix = f"{_path}{prefix}" - else: - _prefix = key + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + # Fetch objects as if the path is a directory objects, _ = await self._do_list_objects( - bucket, delimiter="", prefix=_prefix, versions=versions + path, delimiter="", prefix=prefix, versions=versions ) + if not objects: + # Fetch objects as if the path is a file + bucket, key, _ = self.split_path(path) + if prefix: + _path = "" if not key else key.rstrip("/") + "/" + _prefix = f"{_path}{prefix}" + else: + _prefix = key + objects, _ = await self._do_list_objects( + bucket, delimiter="", prefix=_prefix, versions=versions + ) + dirs = {} cache_entries = {} @@ -1216,17 +1423,26 @@ async def _find( "size": 0, } - cache_entries.setdefault(parent, []).append(previous) + listing = cache_entries.setdefault(parent, {}) + name = previous["name"] + if name not in listing: + listing[name] = previous previous = dirs[parent] parent = self._parent(parent) if not prefix: - self.dircache.update(cache_entries) + 
cache_entries_list = {k: list(v.values()) for k, v in cache_entries.items()} + self.dircache.update(cache_entries_list) if withdirs: objects = sorted(objects + list(dirs.values()), key=lambda x: x["name"]) + if maxdepth: + # Filter returned objects based on requested maxdepth + depth = path.rstrip("/").count("/") + maxdepth + objects = list(filter(lambda o: o["name"].count("/") <= depth, objects)) + if detail: if versions: return {f"{o['name']}#{o['generation']}": o for o in objects} @@ -1241,7 +1457,7 @@ async def _get_file_request( self, rpath, lpath, *args, headers=None, callback=None, **kwargs ): consistency = kwargs.pop("consistency", self.consistency) - + await self._set_session() async with self.session.get( url=rpath, params=self._get_params(kwargs), @@ -1382,13 +1598,15 @@ def sign(self, path, expiration=100, **kwargs): """ from google.cloud import storage - bucket, key = self.split_path(path) + bucket, key, generation = self.split_path(path) client = storage.Client( credentials=self.credentials.credentials, project=self.project ) bucket = client.bucket(bucket) blob = bucket.blob(key) - return blob.generate_signed_url(expiration=expiration, **kwargs) + return blob.generate_signed_url( + expiration=expiration, generation=generation, **kwargs + ) GoogleCredentials.load_tokens() @@ -1556,7 +1774,7 @@ def _upload_chunk(self, final=False): shortfall = (self.offset + l - 1) - end if shortfall > 0: self.checker.update(data[:-shortfall]) - self.buffer = io.BytesIO(data[-shortfall:]) + self.buffer = UnclosableBytesIO(data[-shortfall:]) self.buffer.seek(shortfall) self.offset += l - shortfall continue @@ -1569,7 +1787,7 @@ def _upload_chunk(self, final=False): self.checker.update(data) self.checker.validate_json_response(j) # Clear buffer and update offset when all is received - self.buffer = io.BytesIO() + self.buffer = UnclosableBytesIO() self.offset += l break return True @@ -1674,7 +1892,9 @@ async def upload_chunk(fs, location, data, offset, size, content_type): range = "bytes %i-%i/%i" % (offset, offset + l - 1, size) head["Content-Range"] = range head.update({"Content-Type": content_type, "Content-Length": str(l)}) - headers, txt = await fs._call("POST", location, headers=head, data=data) + headers, txt = await fs._call( + "POST", location, headers=head, data=UnclosableBytesIO(data) + ) if "Range" in headers: end = int(headers["Range"].split("-")[1]) shortfall = (offset + l - 1) - end @@ -1740,7 +1960,7 @@ async def simple_upload( path, uploadType="multipart", headers={"Content-Type": 'multipart/related; boundary="==0=="'}, - data=data, + data=UnclosableBytesIO(data), json_out=True, ) checker.update(datain) diff --git a/gcsfs/credentials.py b/gcsfs/credentials.py index 05439b93..a6e8d371 100644 --- a/gcsfs/credentials.py +++ b/gcsfs/credentials.py @@ -139,22 +139,24 @@ def _connect_token(self, token): Parameters ---------- token: str, dict or Credentials - If a str, try to load as a Service file, or next as a JSON; if + If a str and a valid file name, try to load as a Service file, or next as a JSON; + if not a valid file name, assume it's a valid raw (non-renewable/session) token, and pass to Credentials. If dict, try to interpret as credentials; if Credentials, use directly. """ if isinstance(token, str): - if not os.path.exists(token): - raise FileNotFoundError(token) - try: - # is this a "service" token? 
- self._connect_service(token) - return - except: # noqa: E722 - # TODO: catch specific exceptions - # some other kind of token file - # will raise exception if is not json - with open(token) as data: - token = json.load(data) + if os.path.exists(token): + try: + # is this a "service" token? + self._connect_service(token) + return + except: # noqa: E722 + # TODO: catch specific exceptions + # some other kind of token file + # will raise exception if is not json + with open(token) as data: + token = json.load(data) + else: + token = Credentials(token) if isinstance(token, dict): credentials = self._dict_to_credentials(token) elif isinstance(token, google.auth.credentials.Credentials): diff --git a/gcsfs/inventory_report.py b/gcsfs/inventory_report.py new file mode 100644 index 00000000..869a6fe0 --- /dev/null +++ b/gcsfs/inventory_report.py @@ -0,0 +1,650 @@ +from datetime import datetime + + +class InventoryReport: + """ + A utility class for fetching and processing inventory reports from GCS. + + The 'InventoryReport' class provides the logic to fetch inventory + reports, and to process their content to obtain a final snapshot + of objects in the latest inventory reports. + + High-Level Functionality: + ------------------------ + 1. Fetching Inventory Reports: + - The class offers methods to fetch inventory report configurations and + metadata from GCS. + - It validates the inventory report information provided by the user. + - Inventory report configurations include options for parsing CSV format + and specifying the bucket and destination path. + + 2. Parsing and Processing Inventory Report Content: + - The class processes the raw content of inventory reports to extract + object details such as name, size, etc. + - It supports listing objects using a snapshot option or filtering + based on a user-defined prefix. + - The class handles CSV parsing, removes the header (if specified), and + fetches required object metadata. + + 3. Constructing the Final Snapshot: + - If the user wishes to use the snapshot to do listing directly, the + snapshot will contain the relevant object details and subdirectory + prefixes, filtered by the prefix. + + - If the user wishes to use the snapshot as a starting point for async + listing, the snapshot will only contain a list of object names, + filtered by the prefix. + + Note: + ----- + - The class should only be used internally in the 'GCSFileSystem' as an + optional configuration during listing. + + Example Usage: + -------------- + # Should already be instantiated in 'core.py' + gcs_file_system = GCSFileSystem(...) + + # User defines inventory report information + inventory_report_info = { + "use_snapshot_listing": True, + "location": "us-east1", + "id": "inventory_report_id" + } + + # User defines a prefix for filtering objects + prefix = "prefix/" + + # Fetch the snapshot based on inventory reports + items, prefixes = await InventoryReport.fetch_snapshot( + gcs_file_system, inventory_report_info, prefix) + """ + + # HTTP endpoint of the Storage Insights Service. + BASE_URL = "https://storageinsights.googleapis.com/v1" + + @classmethod + async def fetch_snapshot(cls, gcs_file_system, inventory_report_info, prefix): + """ + Main entry point of the 'InventoryReport' class. + Fetches the latest snapshot of objects based on inventory report configuration. + + Parameters: + gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' + class (see 'core.py'). 
+ inventory_report_info (dict): A client-configured dictionary + containing inventory report information. + prefix (str): Listing prefix specified by the client. + + Returns: + tuple: A tuple containing two lists: the 'items' list representing + object details for the snapshot, and the 'prefixes' list containing + subdirectory prefixes. + + Note: when 'use_snapshot_listing' in 'inventory_report_info' is set + to False, the 'prefixes' list will be empty, and the 'items' list + will contain only the object names. + """ + # Validate the inventory report info that the user passes in. + cls._validate_inventory_report_info(inventory_report_info) + + # Parse the inventory report info. + use_snapshot_listing = inventory_report_info.get("use_snapshot_listing") + inventory_report_location = inventory_report_info.get("location") + inventory_report_id = inventory_report_info.get("id") + + # Fetch the inventory report configuration. + raw_inventory_report_config = await cls._fetch_raw_inventory_report_config( + gcs_file_system=gcs_file_system, + location=inventory_report_location, + id=inventory_report_id, + ) + + # Parse the inventory report configuration. + inventory_report_config = cls._parse_raw_inventory_report_config( + raw_inventory_report_config=raw_inventory_report_config, + use_snapshot_listing=use_snapshot_listing, + ) + + # Use the config to fetch all inventory report metadata. + unsorted_inventory_report_metadata = await cls._fetch_inventory_report_metadata( + gcs_file_system=gcs_file_system, + inventory_report_config=inventory_report_config, + ) + + # Sort the metadata based on reverse created time order. + inventory_report_metadata = cls._sort_inventory_report_metadata( + unsorted_inventory_report_metadata=unsorted_inventory_report_metadata + ) + + # Download the most recent inventory reports in raw form. + bucket = inventory_report_config.bucket + inventory_report_content = await cls._download_inventory_report_content( + gcs_file_system=gcs_file_system, + inventory_report_metadata=inventory_report_metadata, + bucket=bucket, + ) + + # Parse the raw inventory reports into snapshot objects. + objects = cls._parse_inventory_report_content( + gcs_file_system=gcs_file_system, + inventory_report_content=inventory_report_content, + inventory_report_config=inventory_report_config, + use_snapshot_listing=use_snapshot_listing, + bucket=bucket, + ) + + # Construct the final snapshot based on the fetched objects. + snapshot = cls._construct_final_snapshot( + objects=objects, prefix=prefix, use_snapshot_listing=use_snapshot_listing + ) + + # Return the final snapshot. + return snapshot + + def _validate_inventory_report_info(inventory_report_info): + """ + Validates the inventory report information dictionary that user + passes in. + + Parameters: + inventory_report_info (dict): A dictionary containing the inventory + report information with the following keys: + - "use_snapshot_listing" (bool): A flag indicating whether + to use snapshot listing in the inventory report. + - "location" (str): The location of the inventory report in GCS. + - "id" (str): The ID of the inventory report in GCS. + + Raises: + ValueError: If any required key (use_snapshot_listing, location, id) + is missing from the inventory_report_info dictionary. 
+ """ + if "use_snapshot_listing" not in inventory_report_info: + raise ValueError("Use snapshot listing is not configured.") + if "location" not in inventory_report_info: + raise ValueError("Inventory report location is not configured.") + if "id" not in inventory_report_info: + raise ValueError("Inventory report id is not configured.") + + async def _fetch_raw_inventory_report_config(gcs_file_system, location, id): + """ + Fetches the raw inventory report configuration from GCS based on the + specified location and ID. + + Parameters: + gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' + class (see 'core.py'). + location (str): The location of the inventory report in GCS. + id (str): The ID of the inventory report in GCS. + + Returns: + dict: A dictionary containing the raw inventory report + configuration retrieved from GCS. + + Raises: + Exception: If there is an error while fetching the inventory + report configuration. + """ + project = gcs_file_system.project + url = "{}/projects/{}/locations/{}/reportConfigs/{}" + url = url.format(InventoryReport.BASE_URL, project, location, id) + try: + raw_inventory_report_config = await gcs_file_system._call( + "GET", url, json_out=True + ) + return raw_inventory_report_config + except Exception as e: + raise ValueError( + f"Error encountered when fetching inventory report config: {e}." + ) + + def _parse_raw_inventory_report_config( + raw_inventory_report_config, use_snapshot_listing + ): + """ + Parses the raw inventory report configuration and validates its properties. + + Parameters: + raw_inventory_report_config (dict): A dictionary containing the raw + inventory report configuration retrieved from GCS. + use_snapshot_listing (bool): A flag indicating whether to use snapshot + listing in the inventory report. + + Returns: + InventoryReportConfig: An instance of the InventoryReportConfig + class representing the parsed inventory report configuration. + + Raises: + ValueError: If the current date is outside the start and + end range specified in the inventory report config. + ValueError: If the "name" field is not present in the metadata + fields of the report config. + ValueError: If "size" field is not present in the metadata + fields and use_snapshot_listing is True. + """ + # Parse the report config. + frequency_options = raw_inventory_report_config.get("frequencyOptions") + start_date = InventoryReport._convert_obj_to_date( + frequency_options.get("startDate") + ) + end_date = InventoryReport._convert_obj_to_date( + frequency_options.get("endDate") + ) + object_metadata_report_options = raw_inventory_report_config.get( + "objectMetadataReportOptions" + ) + storage_destination_options = object_metadata_report_options.get( + "storageDestinationOptions" + ) + + # Save relevant report config properties. + csv_options = raw_inventory_report_config.get("csvOptions") + bucket = storage_destination_options.get("bucket") + destination_path = storage_destination_options.get("destinationPath") + metadata_fields = object_metadata_report_options.get("metadataFields") + + # Validate date, making sure the current date is within the start and end range. + today = datetime.now() + if today < start_date or today > end_date: + raise ValueError( + f"Current date {today} is outside the range \ + {start_date} and {end_date} specified by the inventory report config." + ) + + # Validate object name exists in the metadata fields. 
+        # Note that the name field is mandated to be included in the
+        # config when the client sets up the inventory report.
+        obj_name_idx = metadata_fields.index("name")
+
+        # If the user wants to do listing based on the snapshot, also
+        # validate the report contains size metadata for each object.
+        if use_snapshot_listing:
+            try:
+                metadata_fields.index("size")
+            except ValueError:
+                raise ValueError(
+                    "If you want to use the snapshot for listing, the object size \
+                    metadata has to be included in the inventory report."
+                )
+
+        # Finally, construct and return the inventory report config.
+        inventory_report_config = InventoryReportConfig(
+            csv_options=csv_options,
+            bucket=bucket,
+            destination_path=destination_path,
+            metadata_fields=metadata_fields,
+            obj_name_idx=obj_name_idx,
+        )
+
+        return inventory_report_config
+
+    async def _fetch_inventory_report_metadata(
+        gcs_file_system, inventory_report_config
+    ):
+        """
+        Fetches all inventory report metadata from GCS based on the specified
+        inventory report config.
+
+        Parameters:
+        gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem'
+        class (see 'core.py').
+        inventory_report_config (InventoryReportConfig): An instance of
+        the InventoryReportConfig class representing the inventory report
+        configuration.
+
+        Returns:
+        list: A list containing dictionaries representing the metadata of
+        objects from the inventory reports.
+
+        Raises:
+        ValueError: If the fetched inventory reports are empty.
+        """
+        # There might be multiple inventory reports in the bucket.
+        inventory_report_metadata = []
+
+        # Extract the bucket and destination path of the inventory reports.
+        bucket = inventory_report_config.bucket
+        destination_path = inventory_report_config.destination_path
+
+        # Fetch the first page.
+        page = await gcs_file_system._call(
+            "GET", "b/{}/o", bucket, prefix=destination_path, json_out=True
+        )
+
+        inventory_report_metadata.extend(page.get("items", []))
+        next_page_token = page.get("nextPageToken", None)
+
+        # Keep fetching new pages as long as a next page token exists.
+        # Note that the while loop should rarely iterate more than once.
+        # For reference, a million objects are split across only two
+        # reports, and if a report is generated daily, then in a year there
+        # will be roughly ~700 reports, which can still be fetched in a
+        # single page.
+        while next_page_token is not None:
+            page = await gcs_file_system._call(
+                "GET",
+                "b/{}/o",
+                bucket,
+                prefix=destination_path,
+                json_out=True,
+                pageToken=next_page_token,
+            )
+
+            inventory_report_metadata.extend(page.get("items", []))
+            next_page_token = page.get("nextPageToken", None)
+
+        # If no reports are fetched, this indicates an error.
+        if len(inventory_report_metadata) == 0:
+            raise ValueError(
+                "No inventory reports to fetch. Check if \
+                your inventory report is set up correctly."
+            )
+
+        return inventory_report_metadata
+
+    def _sort_inventory_report_metadata(unsorted_inventory_report_metadata):
+        """
+        Sorts the inventory report metadata based on the 'timeCreated' field
+        in reverse chronological order.
+
+        Parameters:
+        unsorted_inventory_report_metadata (list): A list of dictionaries
+        representing the metadata of objects from the inventory reports.
+
+        Returns:
+        list: A sorted list of dictionaries representing the inventory
+        report metadata, sorted in reverse chronological order based
+        on 'timeCreated'.
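+
+        Example (hypothetical timestamps):
+        _sort_inventory_report_metadata([
+            {"timeCreated": "2023-08-01T12:00:00Z"},
+            {"timeCreated": "2023-08-03T12:00:00Z"},
+        ])
+        # returns [{"timeCreated": "2023-08-03T12:00:00Z"},
+        #          {"timeCreated": "2023-08-01T12:00:00Z"}]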
+ """ + return sorted( + unsorted_inventory_report_metadata, + key=lambda ir: InventoryReport._convert_str_to_datetime( + ir.get("timeCreated") + ), + reverse=True, + ) + + async def _download_inventory_report_content( + gcs_file_system, inventory_report_metadata, bucket + ): + """ + Downloads the most recent inventory report content from GCS based on + the inventory report metadata. + + Parameters: + gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' + class (see 'core.py'). + inventory_report_metadata (list): A list of dictionaries + representing the metadata of objects from the inventory reports. + bucket (str): The name of the GCS bucket containing + the inventory reports. + + Returns: + list: A list containing the content of the most recent inventory + report as strings. + """ + # Get the most recent inventory report date. + most_recent_inventory_report = inventory_report_metadata[0] + most_recent_date = InventoryReport._convert_str_to_datetime( + most_recent_inventory_report.get("timeCreated") + ).date() + + inventory_report_content = [] + + # Run a for loop here, since there might be multiple inventory reports + # generated on the same day. For reference, 1 million objects will be + # split into only 2 inventory reports, so it is very rare that there + # will be many inventory reports on the same day. But including this + # logic for robustness. + for metadata in inventory_report_metadata: + inventory_report_date = InventoryReport._convert_str_to_datetime( + metadata["timeCreated"] + ).date() + + if inventory_report_date == most_recent_date: + # Download the raw inventory report if the date matches. + # Header is not needed, we only need to process and store + # the content. + _header, encoded_content = await gcs_file_system._call( + "GET", "b/{}/o/{}", bucket, metadata.get("name"), alt="media" + ) + + # Decode the binary content into string for the content. + decoded_content = encoded_content.decode() + + inventory_report_content.append(decoded_content) + + return inventory_report_content + + def _parse_inventory_report_content( + gcs_file_system, + inventory_report_content, + inventory_report_config, + use_snapshot_listing, + bucket, + ): + """ + Parses the raw inventory report content and extracts object details. + + Parameters: + gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' + class (see 'core.py'). + inventory_report_content (list): A list of strings containing the + raw content of the inventory report. + inventory_report_config (InventoryReportConfig): An instance of the + InventoryReportConfig class representing the inventory report + configuration. + use_snapshot_listing (bool): A flag indicating whether to use snapshot + listing in the inventory report. + bucket (str): The name of the GCS bucket containing the inventory + reports. + + Returns: + list: A list of dictionaries representing object details parsed + from the inventory report content. + """ + # Get the csv configuration for each inventory report. + csv_options = inventory_report_config.csv_options + record_separator = csv_options.get("recordSeparator", "\n") + delimiter = csv_options.get("delimiter", ",") + header_required = csv_options.get("headerRequired", False) + + objects = [] + + for content in inventory_report_content: + # Split the content into lines based on the specified separator. + lines = content.split(record_separator) + + # Remove the header, if present. + if header_required: + lines = lines[1:] + + # Parse each line of the inventory report. 
+            for line in lines:
+                obj = InventoryReport._parse_inventory_report_line(
+                    inventory_report_line=line,
+                    use_snapshot_listing=use_snapshot_listing,
+                    gcs_file_system=gcs_file_system,
+                    inventory_report_config=inventory_report_config,
+                    delimiter=delimiter,
+                    bucket=bucket,
+                )
+
+                objects.append(obj)
+
+        return objects
+
+    def _parse_inventory_report_line(
+        inventory_report_line,
+        use_snapshot_listing,
+        gcs_file_system,
+        inventory_report_config,
+        delimiter,
+        bucket,
+    ):
+        """
+        Parses a single line of the inventory report and extracts object details.
+
+        Parameters:
+        inventory_report_line (str): A string representing a single line of
+        the raw content from the inventory report.
+        use_snapshot_listing (bool): A flag indicating whether to use snapshot
+        listing in the inventory report.
+        gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem'
+        class (see 'core.py').
+        inventory_report_config (InventoryReportConfig): An instance of the
+        InventoryReportConfig class representing the inventory report
+        configuration.
+        delimiter (str): The delimiter used in the inventory report content
+        to separate fields.
+        bucket (str): The name of the GCS bucket containing the inventory
+        reports.
+
+        Returns:
+        dict: A dictionary representing object details parsed from the
+        inventory report line.
+        """
+        obj_name_idx = inventory_report_config.obj_name_idx
+        metadata_fields = inventory_report_config.metadata_fields
+
+        # If the client wants to do listing from the snapshot, we need
+        # to fetch all the metadata for each object. Otherwise, we only
+        # need to fetch the name.
+        if use_snapshot_listing is True:
+            obj = gcs_file_system._process_object(
+                {
+                    key: value
+                    for key, value in zip(
+                        metadata_fields, inventory_report_line.strip().split(delimiter)
+                    )
+                },
+                bucket,
+            )
+        else:
+            obj = {"name": inventory_report_line.strip().split(delimiter)[obj_name_idx]}
+
+        return obj
+
+    def _construct_final_snapshot(objects, prefix, use_snapshot_listing):
+        """
+        Constructs the final snapshot based on the retrieved objects and prefix.
+
+        Parameters:
+        objects (list): A list of dictionaries representing object details
+        from the inventory report.
+        prefix (str): A prefix used to filter objects in the snapshot based
+        on their names.
+        use_snapshot_listing (bool): A flag indicating whether to use snapshot
+        listing in the inventory report.
+
+        Returns:
+        tuple: A tuple containing two lists: the 'items' list representing
+        object details for the snapshot, and the 'prefixes' list containing
+        subdirectory prefixes. If 'use_snapshot_listing' is set to False,
+        the 'prefixes' list will be empty, and 'items' will contain only
+        the object names in the snapshot.
+        """
+        if prefix is None:
+            prefix = ""
+
+        # Filter by the prefix and return only the matching objects if the
+        # user does not want to use the snapshot for listing.
+        if use_snapshot_listing is False:
+            return [obj for obj in objects if obj.get("name").startswith(prefix)], []
+
+        else:
+            # If the user wants to use the snapshot, generate both the items and
+            # prefixes manually.
+            items = []
+            prefixes = set()
+
+            for obj in objects:
+                # Fetch the name of the object.
+                obj_name = obj.get("name")
+
+                # If the object name doesn't start with the prefix, continue.
+                # In the case where the prefix is empty, startswith always
+                # returns True (the expected behavior).
+                if not obj_name.startswith(prefix):
+                    continue
+
+                # Remove the prefix.
+                object_name_no_prefix = obj_name[len(prefix) :]
+
+                # Determine whether the object name is a directory.
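+                # Worked example (hypothetical names): with prefix "dir1/",
+                # an object named "dir1/dir2/obj" gives object_name_no_prefix
+                # "dir2/obj"; the "/" in it means the object is recorded
+                # under the subdirectory prefix "dir1/dir2/" rather than as
+                # an item.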
+ first_delimiter_idx = object_name_no_prefix.find("/") + + # If not, then append it to items. + if first_delimiter_idx == -1: + items.append(obj) + continue + + # If it is, recompose the directory and add to the prefix set. + dir = object_name_no_prefix[:first_delimiter_idx] + obj_prefix = ( + prefix.rstrip("/") + + ("" if prefix == "" else "/") + + dir + + ("" if dir == "" else "/") + ) + prefixes.add(obj_prefix) + + return items, list(prefixes) + + @staticmethod + def _convert_obj_to_date(obj): + """ + Converts a dictionary representing a date object to a datetime object. + + Parameters: + obj (dict): A dictionary representing a date object with keys "day", + "month", and "year". + + Returns: + datetime: A datetime object representing the converted date. + """ + day = obj["day"] + month = obj["month"] + year = obj["year"] + return datetime(year, month, day) + + @staticmethod + def _convert_str_to_datetime(str): + """ + Converts an ISO-formatted date string to a datetime object. + + Parameters: + date_string (str): An ISO-formatted date string with or without + timezone information (Z). + + Returns: + datetime: A datetime object representing the converted date and time. + """ + return datetime.fromisoformat(str.replace("Z", "+00:00")) + + +class InventoryReportConfig(object): + """ + Represents the configuration for fetching inventory reports. + + Attributes: + csv_options (dict): A dictionary containing options for parsing CSV + format in the inventory reports. + bucket (str): The name of the GCS bucket from which to fetch the + inventory reports. + destination_path (str): The path within the GCS bucket where the + inventory reports are stored. + metadata_fields (list): A list of strings representing metadata + fields to be extracted from the inventory reports. + obj_name_idx (int): The index of the "name" field in the 'metadata_fields' + list, used to identify object names. + """ + + def __init__( + self, csv_options, bucket, destination_path, metadata_fields, obj_name_idx + ): + self.csv_options = csv_options + self.bucket = bucket + self.destination_path = destination_path + self.metadata_fields = metadata_fields + self.obj_name_idx = obj_name_idx diff --git a/gcsfs/retry.py b/gcsfs/retry.py index 967ad691..9ae823ba 100644 --- a/gcsfs/retry.py +++ b/gcsfs/retry.py @@ -15,6 +15,8 @@ class HttpError(Exception): """Holds the message and code from cloud errors.""" def __init__(self, error_response=None): + # Save error_response for potential pickle. + self._error_response = error_response if error_response: self.code = error_response.get("code", None) self.message = error_response.get("message", "") @@ -29,6 +31,12 @@ def __init__(self, error_response=None): # Call the base class constructor with the parameters it needs super().__init__(self.message) + def __reduce__(self): + """This makes the Exception pickleable.""" + + # This is basically deconstructing the HttpError when pickled. 
+ return HttpError, (self._error_response,) + class ChecksumError(Exception): """Raised when the md5 hash of the content does not match the header.""" diff --git a/gcsfs/tests/derived/__init__.py b/gcsfs/tests/derived/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/gcsfs/tests/derived/gcsfs_fixtures.py b/gcsfs/tests/derived/gcsfs_fixtures.py new file mode 100644 index 00000000..21ce2431 --- /dev/null +++ b/gcsfs/tests/derived/gcsfs_fixtures.py @@ -0,0 +1,42 @@ +import fsspec +import pytest +from fsspec.tests.abstract import AbstractFixtures + +from gcsfs.core import GCSFileSystem +from gcsfs.tests.conftest import allfiles +from gcsfs.tests.settings import TEST_BUCKET + + +class GcsfsFixtures(AbstractFixtures): + @pytest.fixture(scope="class") + def fs(self, docker_gcs): + GCSFileSystem.clear_instance_cache() + gcs = fsspec.filesystem("gcs", endpoint_url=docker_gcs) + try: + # ensure we're empty. + try: + gcs.rm(TEST_BUCKET, recursive=True) + except FileNotFoundError: + pass + try: + gcs.mkdir(TEST_BUCKET) + except Exception: + pass + + gcs.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) + gcs.invalidate_cache() + yield gcs + finally: + try: + gcs.rm(gcs.find(TEST_BUCKET)) + gcs.rm(TEST_BUCKET) + except: # noqa: E722 + pass + + @pytest.fixture + def fs_path(self): + return TEST_BUCKET + + @pytest.fixture + def supports_empty_directories(self): + return False diff --git a/gcsfs/tests/derived/gcsfs_test.py b/gcsfs/tests/derived/gcsfs_test.py new file mode 100644 index 00000000..2e3f8d44 --- /dev/null +++ b/gcsfs/tests/derived/gcsfs_test.py @@ -0,0 +1,15 @@ +import fsspec.tests.abstract as abstract + +from gcsfs.tests.derived.gcsfs_fixtures import GcsfsFixtures + + +class TestGcsfsCopy(abstract.AbstractCopyTests, GcsfsFixtures): + pass + + +class TestGcsfsGet(abstract.AbstractGetTests, GcsfsFixtures): + pass + + +class TestGcsfsPut(abstract.AbstractPutTests, GcsfsFixtures): + pass diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 05a607e7..89dd1760 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1,23 +1,30 @@ +import datetime import io +import os from builtins import FileNotFoundError from itertools import chain from unittest import mock from urllib.parse import parse_qs, unquote, urlparse from uuid import uuid4 +import fsspec.core import pytest import requests from fsspec.asyn import sync from fsspec.utils import seek_delimiter import gcsfs.checkers +import gcsfs.tests.settings from gcsfs import __version__ as version from gcsfs.core import GCSFileSystem, quote from gcsfs.credentials import GoogleCredentials from gcsfs.tests.conftest import a, allfiles, b, csv_files, files, text_files -from gcsfs.tests.settings import TEST_BUCKET, TEST_PROJECT, TEST_REQUESTER_PAYS_BUCKET from gcsfs.tests.utils import tempdir, tmpfile +TEST_BUCKET = gcsfs.tests.settings.TEST_BUCKET +TEST_PROJECT = gcsfs.tests.settings.TEST_PROJECT +TEST_REQUESTER_PAYS_BUCKET = gcsfs.tests.settings.TEST_REQUESTER_PAYS_BUCKET + def test_simple(gcs): assert not GoogleCredentials.tokens @@ -118,6 +125,13 @@ def test_info(gcs): gcs.touch(a) assert gcs.info(a) == gcs.ls(a, detail=True)[0] + today = datetime.datetime.utcnow().date().isoformat() + assert gcs.created(a).isoformat().startswith(today) + assert gcs.modified(a).isoformat().startswith(today) + # Check conformance with expected info attribute names. 
+ assert gcs.info(a)["ctime"] == gcs.created(a) + assert gcs.info(a)["mtime"] == gcs.modified(a) + def test_ls2(gcs): assert TEST_BUCKET + "/" in gcs.ls("") @@ -256,11 +270,18 @@ def test_ls_detail(gcs): assert all(isinstance(item, dict) for item in L) +@pytest.mark.parametrize("refresh", (False, True)) +def test_ls_refresh(gcs, refresh): + with mock.patch.object(gcs, "invalidate_cache") as mock_invalidate_cache: + gcs.ls(TEST_BUCKET, refresh=refresh) + assert mock_invalidate_cache.called is refresh + + def test_gcs_glob(gcs): fn = TEST_BUCKET + "/nested/file1" assert fn not in gcs.glob(TEST_BUCKET + "/") assert fn not in gcs.glob(TEST_BUCKET + "/*") - assert fn in gcs.glob(TEST_BUCKET + "/nested/") + assert fn not in gcs.glob(TEST_BUCKET + "/nested/") assert fn in gcs.glob(TEST_BUCKET + "/nested/*") assert fn in gcs.glob(TEST_BUCKET + "/nested/file*") assert fn in gcs.glob(TEST_BUCKET + "/*/*") @@ -271,10 +292,11 @@ def test_gcs_glob(gcs): for f in gcs.glob(TEST_BUCKET + "/nested/*") if gcs.isfile(f) ) + # the following is no longer true since the glob method list the root path # Ensure the glob only fetches prefixed folders - gcs.dircache.clear() - gcs.glob(TEST_BUCKET + "/nested**1") - assert all(d.startswith(TEST_BUCKET + "/nested") for d in gcs.dircache) + # gcs.dircache.clear() + # gcs.glob(TEST_BUCKET + "/nested**1") + # assert all(d.startswith(TEST_BUCKET + "/nested") for d in gcs.dircache) # the following is no longer true as of #437 # gcs.glob(TEST_BUCKET + "/test*") # assert TEST_BUCKET + "/test" in gcs.dircache @@ -385,6 +407,19 @@ def test_move(gcs): assert not gcs.exists(fn) +@pytest.mark.parametrize("slash_from", ([False, True])) +def test_move_recursive(gcs, slash_from): + # See issue #489 + dir_from = TEST_BUCKET + "/nested" + if slash_from: + dir_from += "/" + dir_to = TEST_BUCKET + "/new_name" + + gcs.mv(dir_from, dir_to, recursive=True) + assert not gcs.exists(dir_from) + assert gcs.ls(dir_to) == [dir_to + "/file1", dir_to + "/file2", dir_to + "/nested2"] + + def test_cat_file(gcs): fn = TEST_BUCKET + "/test/accounts.1.json" data = gcs.cat_file(fn) @@ -992,10 +1027,16 @@ def test_put_small_cache_validity(gcs): def test_pseudo_dir_find(gcs): gcs.rm(f"{TEST_BUCKET}/*", recursive=True) gcs.touch(f"{TEST_BUCKET}/a/b/file") + + c = gcs.glob(f"{TEST_BUCKET}/a/b/*") + assert c == [f"{TEST_BUCKET}/a/b/file"] + b = set(gcs.glob(f"{TEST_BUCKET}/a/*")) - assert f"{TEST_BUCKET}/a/b" in b + assert b == {f"{TEST_BUCKET}/a/b"} + a = set(gcs.glob(f"{TEST_BUCKET}/*")) - assert f"{TEST_BUCKET}/a" in a + assert a == {f"{TEST_BUCKET}/a"} + assert gcs.find(TEST_BUCKET) == [f"{TEST_BUCKET}/a/b/file"] assert gcs.find(f"{TEST_BUCKET}/a", withdirs=True) == [ f"{TEST_BUCKET}/a", @@ -1198,6 +1239,7 @@ def test_ls_versioned(gcs_versioned): assert versions == { entry["name"] for entry in gcs_versioned.ls(dpath, detail=True, versions=True) } + assert gcs_versioned.ls(TEST_BUCKET, versions=True) == ["gcsfs_test/tmp"] def test_find_versioned(gcs_versioned): @@ -1210,3 +1252,206 @@ def test_find_versioned(gcs_versioned): versions = {f"{a}#{v1}", f"{a}#{v2}"} assert versions == set(gcs_versioned.find(a, versions=True)) assert versions == set(gcs_versioned.find(a, detail=True, versions=True)) + + +def test_cp_directory_recursive(gcs): + src = TEST_BUCKET + "/src" + src_file = src + "/file" + gcs.mkdir(src) + gcs.touch(src_file) + + target = TEST_BUCKET + "/target" + + # cp without slash + assert not gcs.exists(target) + for loop in range(2): + gcs.cp(src, target, recursive=True) + assert 
gcs.isdir(target) + + if loop == 0: + correct = [target + "/file"] + assert gcs.find(target) == correct + else: + correct = [target + "/file", target + "/src/file"] + assert sorted(gcs.find(target)) == correct + + gcs.rm(target, recursive=True) + + # cp with slash + assert not gcs.exists(target) + for loop in range(2): + gcs.cp(src + "/", target, recursive=True) + assert gcs.isdir(target) + correct = [target + "/file"] + assert gcs.find(target) == correct + + +def test_get_directory_recursive(gcs): + src = TEST_BUCKET + "/src" + src_file = src + "/file" + gcs.mkdir(src) + gcs.touch(src_file) + + with tempdir() as tmpdir: + target = os.path.join(tmpdir, "target") + target_fs = fsspec.filesystem("file") + + # get without slash + assert not target_fs.exists(target) + for loop in range(2): + gcs.get(src, target, recursive=True) + assert target_fs.isdir(target) + + if loop == 0: + assert target_fs.find(target) == [os.path.join(target, "file")] + else: + assert sorted(target_fs.find(target)) == [ + os.path.join(target, "file"), + os.path.join(target, "src", "file"), + ] + + target_fs.rm(target, recursive=True) + + # get with slash + assert not target_fs.exists(target) + for loop in range(2): + gcs.get(src + "/", target, recursive=True) + assert target_fs.isdir(target) + assert target_fs.find(target) == [os.path.join(target, "file")] + + +def test_put_directory_recursive(gcs): + with tempdir() as tmpdir: + src = os.path.join(tmpdir, "src") + src_file = os.path.join(src, "file") + + source_fs = fsspec.filesystem("file") + source_fs.mkdir(src) + source_fs.touch(src_file) + + target = TEST_BUCKET + "/target" + + # put without slash + assert not gcs.exists(target) + for loop in range(2): + gcs.put(src, target, recursive=True) + assert gcs.isdir(target) + + if loop == 0: + assert gcs.find(target) == [target + "/file"] + else: + assert sorted(gcs.find(target)) == [ + target + "/file", + target + "/src/file", + ] + + gcs.rm(target, recursive=True) + + # put with slash + assert not gcs.exists(target) + for loop in range(2): + gcs.put(src + "/", target, recursive=True) + assert gcs.isdir(target) + assert gcs.find(target) == [target + "/file"] + + +def test_cp_two_files(gcs): + src = TEST_BUCKET + "/src" + file0 = src + "/file0" + file1 = src + "/file1" + gcs.mkdir(src) + gcs.touch(file0) + gcs.touch(file1) + + target = TEST_BUCKET + "/target" + assert not gcs.exists(target) + + gcs.cp([file0, file1], target) + + assert gcs.isdir(target) + assert sorted(gcs.find(target)) == [ + target + "/file0", + target + "/file1", + ] + + +def test_multiglob(gcs): + # #530 + root = TEST_BUCKET + + ggparent = root + "/t1" + gparent = ggparent + "/t2" + parent = gparent + "/t3" + leaf1 = parent + "/foo.txt" + leaf2 = parent + "/bar.txt" + leaf3 = parent + "/baz.txt" + + gcs.touch(leaf1) + gcs.touch(leaf2) + gcs.touch(leaf3) + gcs.invalidate_cache() + + assert gcs.ls(gparent, detail=False) == [f"{root}/t1/t2/t3"] + gcs.glob(ggparent + "/") + assert gcs.ls(gparent, detail=False) == [f"{root}/t1/t2/t3"] + + +def test_expiry_keyword(): + gcs = GCSFileSystem(listings_expiry_time=1, token="anon") + assert gcs.dircache.listings_expiry_time == 1 + gcs = GCSFileSystem(cache_timeout=1, token="anon") + assert gcs.dircache.listings_expiry_time == 1 + + +def test_copy_cache_invalidated(gcs): + # Issue https://github.com/fsspec/gcsfs/issues/562 + source = TEST_BUCKET + "/source" + gcs.mkdir(source) + gcs.touch(source + "/file2") + + target = TEST_BUCKET + "/target" + assert not gcs.exists(target) + gcs.touch(target + "/dummy") + assert 
gcs.isdir(target)
+
+    target_file2 = target + "/file2"
+    gcs.cp(source + "/file2", target)
+
+    # Explicitly check that target has been removed from DirCache
+    assert target not in gcs.dircache
+
+    # Prior to fix the following failed as cache stale
+    assert gcs.isfile(target_file2)
+
+
+def test_find_maxdepth(gcs):
+    assert gcs.find(f"{TEST_BUCKET}/nested", maxdepth=None) == [
+        f"{TEST_BUCKET}/nested/file1",
+        f"{TEST_BUCKET}/nested/file2",
+        f"{TEST_BUCKET}/nested/nested2/file1",
+        f"{TEST_BUCKET}/nested/nested2/file2",
+    ]
+
+    assert gcs.find(f"{TEST_BUCKET}/nested", maxdepth=None, withdirs=True) == [
+        f"{TEST_BUCKET}/nested",
+        f"{TEST_BUCKET}/nested/file1",
+        f"{TEST_BUCKET}/nested/file2",
+        f"{TEST_BUCKET}/nested/nested2",
+        f"{TEST_BUCKET}/nested/nested2/file1",
+        f"{TEST_BUCKET}/nested/nested2/file2",
+    ]
+
+    assert gcs.find(f"{TEST_BUCKET}/nested", maxdepth=1) == [
+        f"{TEST_BUCKET}/nested/file1",
+        f"{TEST_BUCKET}/nested/file2",
+    ]
+
+    assert gcs.find(f"{TEST_BUCKET}/nested", maxdepth=1, withdirs=True) == [
+        f"{TEST_BUCKET}/nested",
+        f"{TEST_BUCKET}/nested/file1",
+        f"{TEST_BUCKET}/nested/file2",
+        f"{TEST_BUCKET}/nested/nested2",
+    ]
+
+    with pytest.raises(ValueError, match="maxdepth must be at least 1"):
+        gcs.find(f"{TEST_BUCKET}/nested", maxdepth=0)
diff --git a/gcsfs/tests/test_credentials.py b/gcsfs/tests/test_credentials.py
index c3119888..5c1bb658 100644
--- a/gcsfs/tests/test_credentials.py
+++ b/gcsfs/tests/test_credentials.py
@@ -1,7 +1,18 @@
+import pytest
+
+from gcsfs import GCSFileSystem
 from gcsfs.credentials import GoogleCredentials
+from gcsfs.retry import HttpError


 def test_googlecredentials_none():
     credentials = GoogleCredentials(project="myproject", token=None, access="read_only")
     headers = {}
     credentials.apply(headers)
+
+
+@pytest.mark.parametrize("token", ["", "incorrect.token", "x" * 100])
+def test_credentials_from_raw_token(token):
+    with pytest.raises(HttpError, match="Invalid Credentials"):
+        fs = GCSFileSystem(project="myproject", token=token)
+        fs.ls("/")
diff --git a/gcsfs/tests/test_inventory_report.py b/gcsfs/tests/test_inventory_report.py
new file mode 100644
index 00000000..36e42270
--- /dev/null
+++ b/gcsfs/tests/test_inventory_report.py
@@ -0,0 +1,751 @@
+import asyncio
+from datetime import datetime, timedelta
+from unittest import mock
+
+import pytest
+
+from gcsfs.core import GCSFileSystem
+from gcsfs.inventory_report import InventoryReport, InventoryReportConfig
+
+
+class TestInventoryReport(object):
+    """
+    Unit tests for the inventory report logic, see 'inventory_report.py'.
+
+    The test cases follow the same ordering as the methods in
+    `inventory_report.py`. Each method is covered by one or more
+    parametrized test cases. Some methods include a setup fixture just
+    above them.
+    """
+
+    @pytest.mark.parametrize(
+        "inventory_report_info, expected_error",
+        [
+            # Check whether missing inventory report info will raise exception.
+            (
+                {"location": "us-west", "id": "123"},
+                "Use snapshot listing is not configured.",
+            ),
+            (
+                {"use_snapshot_listing": True, "id": "123"},
+                "Inventory report location is not configured.",
+            ),
+            (
+                {"use_snapshot_listing": True, "location": "us-west"},
+                "Inventory report id is not configured.",
+            ),
+            # Check complete inventory report info will not raise exception.
+ ({"use_snapshot_listing": True, "location": "us-west", "id": "123"}, None), + ], + ) + def test_validate_inventory_report_info( + self, inventory_report_info, expected_error + ): + if expected_error is not None: + with pytest.raises(ValueError) as e_info: + InventoryReport._validate_inventory_report_info( + inventory_report_info=inventory_report_info + ) + assert str(e_info.value) == expected_error + else: + # If no error is expected, we simply call the function + # to ensure no exception is raised. + InventoryReport._validate_inventory_report_info( + inventory_report_info=inventory_report_info + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize( + "location, id, exception, expected_result", + [ + # Test no error fetching proceeds normally. + ("us-west", "id1", None, {"config": "config1"}), + # Test if the exception is caught successfully. + ("us-west", "id2", Exception("fetch error"), None), + ], + ) + async def test_fetch_raw_inventory_report_config( + self, location, id, exception, expected_result + ): + # Mocking the gcs_file_system. + gcs_file_system = mock.MagicMock() + gcs_file_system.project = "project" + + # Mocking gcs_file_system._call. + if exception is not None: + gcs_file_system._call = mock.MagicMock(side_effect=exception) + else: + return_value = asyncio.Future() + return_value.set_result(expected_result) + gcs_file_system._call = mock.MagicMock(return_value=return_value) + + if exception is not None: + with pytest.raises(Exception) as e_info: + await InventoryReport._fetch_raw_inventory_report_config( + gcs_file_system=gcs_file_system, location=location, id=id + ) + assert str(e_info.value) == str(exception) + else: + result = await InventoryReport._fetch_raw_inventory_report_config( + gcs_file_system=gcs_file_system, location=location, id=id + ) + gcs_file_system._call.assert_called_once_with( + "GET", mock.ANY, json_out=True + ) + assert result == expected_result + + def test_parse_raw_inventory_report_config_invalid_date(self): + today = datetime.today().date() + + # Get tomorrow's date. + tomorrow = today + timedelta(days=1) + + # Get the date a week later. + next_week = today + timedelta(days=7) + + raw_inventory_report_config = { + "frequencyOptions": { + "startDate": { + "day": tomorrow.day, + "month": tomorrow.month, + "year": tomorrow.year, + }, + "endDate": { + "day": next_week.day, + "month": next_week.month, + "year": next_week.year, + }, + }, + "objectMetadataReportOptions": mock.MagicMock(), + "csvOptions": mock.MagicMock(), + } + + # If the current date is outside the ranges in the inventory report + # an exception should be raised. + with pytest.raises(ValueError): + InventoryReport._parse_raw_inventory_report_config( + raw_inventory_report_config=raw_inventory_report_config, + use_snapshot_listing=mock.MagicMock(), + ) + + def test_parse_raw_inventory_report_config_missing_metadata_fields(self): + raw_inventory_report_config = { + "frequencyOptions": mock.MagicMock(), + "objectMetadataReportOptions": { + "metadataFields": ["project", "bucket", "name"], + "storageDestinationOptions": mock.MagicMock(), + }, + "csvOptions": mock.MagicMock(), + } + + # When the user wants to use snapshot listing, but object size is not + # included in the inventory reports, an exception should be raised. 
+ with pytest.raises(ValueError): + InventoryReport._parse_raw_inventory_report_config( + raw_inventory_report_config=raw_inventory_report_config, + use_snapshot_listing=True, + ) + + def test_parse_raw_inventory_report_config_returns_correct_config(self): + bucket = "bucket" + destination_path = "path/to/inventory-report" + metadata_fields = ["project", "bucket", "name", "size"] + obj_name_idx = metadata_fields.index("name") + today = datetime.today().date() + yesterday = today - timedelta(days=1) + tomorrow = today + timedelta(days=1) + use_snapshot_listing = False + + csv_options = { + "recordSeparator": "\n", + "delimiter": ",", + "headerRequired": False, + } + + raw_inventory_report_config = { + "frequencyOptions": { + "startDate": { + "day": yesterday.day, + "month": yesterday.month, + "year": yesterday.year, + }, + "endDate": { + "day": tomorrow.day, + "month": tomorrow.month, + "year": tomorrow.year, + }, + }, + "objectMetadataReportOptions": { + "metadataFields": metadata_fields, + "storageDestinationOptions": { + "bucket": bucket, + "destinationPath": destination_path, + }, + }, + "csvOptions": csv_options, + } + + try: + inventory_report_config = ( + InventoryReport._parse_raw_inventory_report_config( + raw_inventory_report_config=raw_inventory_report_config, + use_snapshot_listing=use_snapshot_listing, + ) + ) + + assert isinstance(inventory_report_config, InventoryReportConfig) + + assert inventory_report_config.csv_options == csv_options + assert inventory_report_config.bucket == bucket + assert inventory_report_config.destination_path == destination_path + assert inventory_report_config.metadata_fields == metadata_fields + assert inventory_report_config.obj_name_idx == obj_name_idx + + except Exception as e: + pytest.fail(f"Unexpected exception: {e}.") + + @pytest.mark.asyncio + async def test_fetch_inventory_report_metadata_no_reports(self): + # Create a mock for GCSFileSystem. + gcs_file_system = mock.MagicMock(spec=GCSFileSystem) + + # Mock the _call method to return a page with two items + # and then a page with one item and without next page token. + gcs_file_system._call.side_effect = [{"items": [], "nextPageToken": None}] + + # Create a mock for InventoryReportConfig. + inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) + inventory_report_config.bucket = "bucket_name" + inventory_report_config.destination_path = "destination_path" + + # If no inventory report metadata is fetched, an exception should be raised. + match = "No inventory reports to fetch. Check if \ + your inventory report is set up correctly." + with pytest.raises(ValueError, match=match): + await InventoryReport._fetch_inventory_report_metadata( + gcs_file_system=gcs_file_system, + inventory_report_config=inventory_report_config, + ) + + @pytest.mark.asyncio + async def test_fetch_inventory_report_metadata_multiple_calls(self): + # Create a mock for GCSFileSystem. + gcs_file_system = mock.MagicMock(spec=GCSFileSystem) + + # Mock the _call method to return a page with two items + # and then a page with one item and without next page token. + gcs_file_system._call.side_effect = [ + {"items": ["item1", "item2"], "nextPageToken": "token1"}, + {"items": ["item3"], "nextPageToken": None}, + ] + + # Create a mock for InventoryReportConfig. 
+ inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) + inventory_report_config.bucket = "bucket_name" + inventory_report_config.destination_path = "destination_path" + + result = await InventoryReport._fetch_inventory_report_metadata( + gcs_file_system=gcs_file_system, + inventory_report_config=inventory_report_config, + ) + + # Check that _call was called with the right arguments. + calls = [ + mock.call( + "GET", "b/{}/o", "bucket_name", prefix="destination_path", json_out=True + ), + mock.call( + "GET", + "b/{}/o", + "bucket_name", + prefix="destination_path", + pageToken="token1", + json_out=True, + ), + ] + gcs_file_system._call.assert_has_calls(calls) + + # Check that the function correctly processed the response + # and returned the right result. + assert result == ["item1", "item2", "item3"] + + @pytest.mark.parametrize( + "unsorted_inventory_report_metadata, expected", + [ + ( + # Input. + [ + {"timeCreated": "2023-08-01T12:00:00Z"}, + {"timeCreated": "2023-08-02T12:00:00Z"}, + {"timeCreated": "2023-08-03T12:00:00Z"}, + ], + # Expected output. + [ + {"timeCreated": "2023-08-03T12:00:00Z"}, + {"timeCreated": "2023-08-02T12:00:00Z"}, + {"timeCreated": "2023-08-01T12:00:00Z"}, + ], + ), + ( + # Input. + [ + {"timeCreated": "2023-08-01T12:00:00Z"}, + {"timeCreated": "2023-07-31T12:00:00Z"}, + {"timeCreated": "2023-08-02T12:00:00Z"}, + ], + # Expected output. + [ + {"timeCreated": "2023-08-02T12:00:00Z"}, + {"timeCreated": "2023-08-01T12:00:00Z"}, + {"timeCreated": "2023-07-31T12:00:00Z"}, + ], + ), + ], + ) + def test_sort_inventory_report_metadata( + self, unsorted_inventory_report_metadata, expected + ): + result = InventoryReport._sort_inventory_report_metadata( + unsorted_inventory_report_metadata=unsorted_inventory_report_metadata + ) + assert result == expected + + @pytest.fixture( + params=[ + # Unique most recent day, same datetime. + ( + [ + {"name": "report1", "timeCreated": "2023-08-02T12:00:00.000Z"}, + {"name": "report2", "timeCreated": "2023-08-01T12:00:00.000Z"}, + ], + # Expected results. + ["report1"], + ), + # Multiple most recent day, same datetime. + ( + [ + {"name": "report1", "timeCreated": "2023-08-02T12:00:00.000Z"}, + {"name": "report2", "timeCreated": "2023-08-02T12:00:00.000Z"}, + {"name": "report3", "timeCreated": "2023-08-01T12:00:00.000Z"}, + ], + # Expected results. + ["report1", "report2"], + ), + # Multiple most recent day, different datetimes (same day, different hour). + ( + [ + {"name": "report1", "timeCreated": "2023-08-02T12:00:00.000Z"}, + {"name": "report2", "timeCreated": "2023-08-02T11:00:00.000Z"}, + {"name": "report3", "timeCreated": "2023-08-01T12:00:00.000Z"}, + ], + # Expected results. + ["report1", "report2"], + ), + ] + ) + def download_inventory_report_content_setup(self, request): + bucket = "bucket" + gcs_file_system = mock.MagicMock() + inventory_report_metadata, expected_reports = request.param + + # We are accessing the third argument as the return value, + # since it is the object name in the function. + # We are also encoding the content, since the actual method call needs + # to decode the content. 
+ async_side_effect = mock.AsyncMock( + side_effect=lambda *args, **kwargs: ("_header", args[3].encode()) + ) + gcs_file_system._call = async_side_effect + return gcs_file_system, inventory_report_metadata, bucket, expected_reports + + @pytest.mark.asyncio + async def test_download_inventory_report_content( + self, download_inventory_report_content_setup + ): + ( + gcs_file_system, + inventory_report_metadata, + bucket, + expected_reports, + ) = download_inventory_report_content_setup + + result = await InventoryReport._download_inventory_report_content( + gcs_file_system=gcs_file_system, + inventory_report_metadata=inventory_report_metadata, + bucket=bucket, + ) + + # Verify the mocked downloaded reports match (ordering does not matter). + assert sorted(result) == sorted(expected_reports) + + @pytest.mark.parametrize( + "inventory_report_line, use_snapshot_listing, \ + inventory_report_config_attrs, delimiter, bucket, expected", + [ + # Test case 1: use snapshot listing with specific metadata + # fields and delimiter. + ( + "object1,value1,value2", + True, + {"obj_name_idx": 0, "metadata_fields": ["name", "field1", "field2"]}, + ",", + "bucket", + {"name": "object1", "field1": "value1", "field2": "value2"}, + ), + # Test case 2: do not use snapshot listing and only fetch the name. + ( + "object1,value1,value2", + False, + {"obj_name_idx": 0, "metadata_fields": ["name", "field1", "field2"]}, + ",", + "bucket", + {"name": "object1"}, + ), + ], + ) + def test_parse_inventory_report_line( + self, + inventory_report_line, + use_snapshot_listing, + inventory_report_config_attrs, + delimiter, + bucket, + expected, + ): + # Mock InventoryReportConfig. + inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) + inventory_report_config.obj_name_idx = inventory_report_config_attrs.get( + "obj_name_idx" + ) + inventory_report_config.metadata_fields = inventory_report_config_attrs.get( + "metadata_fields" + ) + + # Mock GCSFileSystem. + gcs_file_system = mock.MagicMock(spec=GCSFileSystem) + gcs_file_system._process_object = mock.Mock(side_effect=lambda obj, bucket: obj) + + result = InventoryReport._parse_inventory_report_line( + inventory_report_line=inventory_report_line, + use_snapshot_listing=use_snapshot_listing, + gcs_file_system=gcs_file_system, + inventory_report_config=inventory_report_config, + delimiter=delimiter, + bucket=bucket, + ) + + assert result == expected + + @pytest.fixture( + params=[ + # One file, one lines. + (["header \n line1"], {"recordSeparator": "\n", "headerRequired": True}), + (["line1"], {"recordSeparator": "\n", "headerRequired": False}), + ( + ["header \r\n line1"], + {"recordSeparator": "\r\n", "headerRequired": True}, + ), + (["line1"], {"recordSeparator": "\r\n", "headerRequired": False}), + # One file, multiple lines. + ( + ["header \n line1 \n line2 \n line3"], + {"recordSeparator": "\n", "headerRequired": True}, + ), + ( + ["line1 \n line2 \n line3"], + {"recordSeparator": "\n", "headerRequired": False}, + ), + ( + ["header \r\n line1 \r\n line2 \r\n line3"], + {"recordSeparator": "\r\n", "headerRequired": True}, + ), + ( + ["line1 \r\n line2 \r\n line3"], + {"recordSeparator": "\r\n", "headerRequired": False}, + ), + # Multiple files. + ( + ["line1", "line2 \n line3"], + {"recordSeparator": "\n", "headerRequired": False}, + ), + ( + ["header \n line1", "header \n line2 \n line3"], + {"recordSeparator": "\n", "headerRequired": True}, + ), + ] + ) + def parse_inventory_report_content_setup(self, request): + # Mock the necessary parameters. 
+ gcs_file_system = mock.MagicMock() + bucket = mock.MagicMock() + use_snapshot_listing = mock.MagicMock() + + # Parse the content and config data. + inventory_report_content = request.param[0] + inventory_report_config = request.param[1] + record_separator = inventory_report_config["recordSeparator"] + header_required = inventory_report_config["headerRequired"] + + # Construct custom inventory report config. + inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) + inventory_report_config.csv_options = { + "recordSeparator": record_separator, + "headerRequired": header_required, + } + + # Stub parse_inventory_report_line method. + InventoryReport._parse_inventory_report_line = mock.MagicMock( + side_effect="parsed_inventory_report_line" + ) + + return ( + gcs_file_system, + inventory_report_content, + inventory_report_config, + bucket, + use_snapshot_listing, + ) + + def test_parse_inventory_reports(self, parse_inventory_report_content_setup): + ( + gcs_file_system, + inventory_report_content, + inventory_report_config, + bucket, + use_snapshot_listing, + ) = parse_inventory_report_content_setup + + record_separator = inventory_report_config.csv_options["recordSeparator"] + header_required = inventory_report_config.csv_options["headerRequired"] + + # Number of inventory reports. + num_inventory_reports = len(inventory_report_content) + + # Tota, number of object metadata lines. + total_lines_in_reports = sum( + content.count(record_separator) + 1 for content in inventory_report_content + ) + + # Remove the header line for each line if header is present. + total_lines_in_reports -= num_inventory_reports * 1 if header_required else 0 + + result = InventoryReport._parse_inventory_report_content( + gcs_file_system=gcs_file_system, + inventory_report_content=inventory_report_content, + inventory_report_config=inventory_report_config, + use_snapshot_listing=use_snapshot_listing, + bucket=bucket, + ) + + # Assert that the number of objects returned is correct. + assert len(result) == total_lines_in_reports + + # Assert parse_inventory_report_line was called the correct + # number of times. + assert ( + InventoryReport._parse_inventory_report_line.call_count + == total_lines_in_reports + ) + + @pytest.mark.parametrize( + "use_snapshot_listing, prefix, mock_objects, expected_result", + [ + # Not using snapshot, no prefix, directory, all matched. + ( + False, + None, + [{"name": "prefix/object1"}, {"name": "prefix/object2"}], + ([{"name": "prefix/object1"}, {"name": "prefix/object2"}], []), + ), + # Not using snapshot, no prefix, no directory, all matched. + ( + False, + None, + [{"name": "object1"}, {"name": "object2"}], + ([{"name": "object1"}, {"name": "object2"}], []), + ), + # Not using snapshot, prefix, directory, all matched. + ( + False, + "prefix", + [{"name": "prefix/object1"}, {"name": "prefix/object2"}], + ([{"name": "prefix/object1"}, {"name": "prefix/object2"}], []), + ), + # Not using snapshot, prefix, directory, some matched. + ( + False, + "prefix", + [{"name": "prefix/object1"}, {"name": "object2"}], + ([{"name": "prefix/object1"}], []), + ), + # Not using snapshot, prefix, directory, none matched. + (False, "prefix", [{"name": "a/object1"}, {"name": "b/object2"}], ([], [])), + # Not using snapshot, prefix, no directory, all matched. + ( + False, + "object", + [{"name": "object1"}, {"name": "object2"}], + ([{"name": "object1"}, {"name": "object2"}], []), + ), + # Not using snapshot, prefix, no directory, some matched. 
+ ( + False, + "object", + [{"name": "object1"}, {"name": "obj2"}], + ([{"name": "object1"}], []), + ), + # Not using snapshot, prefix, no directory, none matched. + (False, "object", [{"name": "obj1"}, {"name": "obj2"}], ([], [])), + # Using snapshot, no prefix, no directory. + ( + True, + None, + [{"name": "object1"}, {"name": "object2"}], + ([{"name": "object1"}, {"name": "object2"}], []), + ), + # Using snapshot, no prefix, a single directory. + ( + True, + None, + [{"name": "object1"}, {"name": "dir/object2"}], + ([{"name": "object1"}], ["dir/"]), + ), + # Using snapshot, no prefix, multiple directories. + ( + True, + None, + [ + {"name": "object1"}, + {"name": "dir1/object2"}, + {"name": "dir2/object3"}, + ], + ([{"name": "object1"}], ["dir1/", "dir2/"]), + ), + # Using snapshot, no prefix, same directory multiple times. + ( + True, + None, + [ + {"name": "object1"}, + {"name": "dir1/object2"}, + {"name": "dir1/object3"}, + ], + ([{"name": "object1"}], ["dir1/"]), + ), + # Using snapshot, prefix, no directory. + ( + True, + "object", + [{"name": "object1"}, {"name": "object2"}], + ([{"name": "object1"}, {"name": "object2"}], []), + ), + # Using snapshot, prefix, a single directory. + ( + True, + "dir1/", + [{"name": "dir1/dir2/object1"}, {"name": "dir1/object2"}], + ([{"name": "dir1/object2"}], ["dir1/dir2/"]), + ), + # Using snapshot, prefix, multiple directories. + ( + True, + "dir1/", + [ + {"name": "dir1/dir2/object1"}, + {"name": "dir1/dir3/object2"}, + {"name": "dir1/object3"}, + ], + ([{"name": "dir1/object3"}], ["dir1/dir2/", "dir1/dir3/"]), + ), + # Using snapshot, prefix, same directory multiple times. + ( + True, + "dir1/", + [ + {"name": "dir1/dir2/object1"}, + {"name": "dir1/dir2/object2"}, + {"name": "dir1/object3"}, + ], + ([{"name": "dir1/object3"}], ["dir1/dir2/"]), + ), + # Sanity check from the examples given by the JSON API. + # https://cloud.google.com/storage/docs/json_api/v1/objects/list + ( + True, + None, + [ + {"name": "a/b"}, + {"name": "a/c"}, + {"name": "d"}, + {"name": "e"}, + {"name": "e/f"}, + {"name": "e/g/h"}, + ], + ([{"name": "d"}, {"name": "e"}], ["a/", "e/"]), + ), + ( + True, + "e/", + [ + {"name": "a/b"}, + {"name": "a/c"}, + {"name": "d"}, + {"name": "e"}, + {"name": "e/f"}, + {"name": "e/g/h"}, + ], + ([{"name": "e/f"}], ["e/g/"]), + ), + ( + True, + "e", + [ + {"name": "a/b"}, + {"name": "a/c"}, + {"name": "d"}, + {"name": "e"}, + {"name": "e/f"}, + {"name": "e/g/h"}, + ], + ([{"name": "e"}], ["e/"]), + ), + ], + ) + def test_construct_final_snapshot( + self, use_snapshot_listing, prefix, mock_objects, expected_result + ): + # Construct the final snapshot. + result = InventoryReport._construct_final_snapshot( + objects=mock_objects, + prefix=prefix, + use_snapshot_listing=use_snapshot_listing, + ) + + # Assert the expected outcomes. + items, prefixes = result + expected_items, expected_prefixes = expected_result + assert items == expected_items + assert sorted(prefixes) == sorted(expected_prefixes) + + +# Test fields of the inventory report config is correctly stored. 
+class TestInventoryReportConfig: + def test_inventory_report_config_creation(self): + csv_options = {} + bucket = "bucket" + destination_path = "" + metadata_fields = [] + obj_name_idx = 0 + + inventory_report_config = InventoryReportConfig( + csv_options=csv_options, + bucket=bucket, + destination_path=destination_path, + metadata_fields=metadata_fields, + obj_name_idx=obj_name_idx, + ) + + assert inventory_report_config.csv_options == csv_options + assert inventory_report_config.bucket == bucket + assert inventory_report_config.destination_path == destination_path + assert inventory_report_config.metadata_fields == metadata_fields + assert inventory_report_config.obj_name_idx == obj_name_idx diff --git a/gcsfs/tests/test_inventory_report_listing.py b/gcsfs/tests/test_inventory_report_listing.py new file mode 100644 index 00000000..584485a7 --- /dev/null +++ b/gcsfs/tests/test_inventory_report_listing.py @@ -0,0 +1,29 @@ +import gcsfs.checkers +import gcsfs.tests.settings +from gcsfs.inventory_report import InventoryReport + +TEST_BUCKET = gcsfs.tests.settings.TEST_BUCKET + + +# Basic integration test to ensure listing returns the correct result. +def test_ls_base(monkeypatch, gcs): + # First get results from original listing. + items = gcs.ls(TEST_BUCKET) + + async def mock_fetch_snapshot(*args, **kwargs): + return [{"name": item} for item in items], [] + + # Patch the fetch_snapshot method with the replacement. + monkeypatch.setattr(InventoryReport, "fetch_snapshot", mock_fetch_snapshot) + + inventory_report_info = { + "location": "location", + "id": "id", + "use_snapshot_listing": False, + } + + # Then get results from listing with inventory report. + actual_items = gcs.ls(TEST_BUCKET, inventory_report_info=inventory_report_info) + + # Check equality. 
+ assert actual_items == items diff --git a/gcsfs/tests/test_mapping.py b/gcsfs/tests/test_mapping.py index b8162da2..a8f1220b 100644 --- a/gcsfs/tests/test_mapping.py +++ b/gcsfs/tests/test_mapping.py @@ -57,28 +57,11 @@ def test_map_with_data(gcs): assert list(d) == [] -def test_map_complex_keys(gcs): - d = gcs.get_mapper(MAPPING_ROOT) - d[1] = b"hello" - assert d[1] == b"hello" - del d[1] - - d[1, 2] = b"world" - assert d[1, 2] == b"world" - del d[1, 2] - - d["x", 1, 2] = b"hello world" - assert d["x", 1, 2] == b"hello world" - - assert ("x", 1, 2) in d - - def test_map_clear_empty(gcs): d = gcs.get_mapper(MAPPING_ROOT) d.clear() assert list(d) == [] - d[1] = b"1" - # may repeat the test below, since VCR sometimes picks the wrong call to ls + d["1"] = b"1" assert list(d) == ["1"] or list(d) == ["1"] d.clear() assert list(d) == [] diff --git a/gcsfs/tests/test_retry.py b/gcsfs/tests/test_retry.py index 306c99af..79058f52 100644 --- a/gcsfs/tests/test_retry.py +++ b/gcsfs/tests/test_retry.py @@ -1,4 +1,7 @@ +import multiprocessing import os +import pickle +from concurrent.futures import ProcessPoolExecutor import pytest import requests @@ -39,6 +42,37 @@ def test_retriable_exception(): assert is_retriable(e) +def test_pickle_serialization(): + expected = HttpError({"message": "", "code": 400}) + + # Serialize/Deserialize + serialized = pickle.dumps(expected) + actual = pickle.loads(serialized) + + is_same_type = type(expected) is type(actual) + is_same_args = expected.args == actual.args + + assert is_same_type and is_same_args + + +def conditional_exception(process_id): + # Raise only on second process (id=1) + if process_id == 1: + raise HttpError({"message": "", "code": 400}) + + +def test_multiprocessing_error_handling(): + # Ensure spawn context to avoid forking issues + ctx = multiprocessing.get_context("spawn") + + # Run on two processes + with ProcessPoolExecutor(2, mp_context=ctx) as p: + results = p.map(conditional_exception, range(2)) + + with pytest.raises(HttpError): + _ = [result for result in results] + + def test_validate_response(): validate_response(200, None, "/path") diff --git a/requirements.txt b/requirements.txt index 5f87b743..d270996c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ aiohttp!=4.0.0a0, !=4.0.0a1 decorator>4.1.2 -fsspec==2022.11.0 +fsspec==2023.9.0 google-auth>=1.2 google-auth-oauthlib google-cloud-storage diff --git a/setup.py b/setup.py index 90712349..dcf0fabe 100755 --- a/setup.py +++ b/setup.py @@ -20,10 +20,10 @@ "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], keywords=["google-cloud-storage", "gcloud", "file-system"], packages=["gcsfs", "gcsfs.cli"], @@ -32,6 +32,6 @@ open("README.rst").read() if os.path.exists("README.rst") else "" ), extras_require={"gcsfuse": ["fusepy"], "crc": ["crcmod"]}, - python_requires=">=3.7", + python_requires=">=3.8", zip_safe=False, ) diff --git a/versioneer.py b/versioneer.py index c4dac8e1..de97d904 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,4 @@ -# Version: 0.18 +# Version: 0.29 """The Versioneer - like a rocketeer, but for versions. @@ -6,18 +6,14 @@ ============== * like a rocketeer, but for versions! 
-* https://github.com/warner/python-versioneer +* https://github.com/python-versioneer/python-versioneer * Brian Warner -* License: Public Domain -* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy -* [![Latest Version] -(https://pypip.in/version/versioneer/badge.svg?style=flat) -](https://pypi.python.org/pypi/versioneer/) -* [![Build Status] -(https://travis-ci.org/warner/python-versioneer.png?branch=master) -](https://travis-ci.org/warner/python-versioneer) - -This is a tool for managing a recorded version number in distutils-based +* License: Public Domain (Unlicense) +* Compatible with: Python 3.7, 3.8, 3.9, 3.10, 3.11 and pypy3 +* [![Latest Version][pypi-image]][pypi-url] +* [![Build Status][travis-image]][travis-url] + +This is a tool for managing a recorded version number in setuptools-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. Making a new release should be as easy as recording a new tag in your version-control @@ -26,9 +22,38 @@ ## Quick Install -* `pip install versioneer` to somewhere to your $PATH -* add a `[versioneer]` section to your setup.cfg (see below) -* run `versioneer install` in your source tree, commit the results +Versioneer provides two installation modes. The "classic" vendored mode installs +a copy of versioneer into your repository. The experimental build-time dependency mode +is intended to allow you to skip this step and simplify the process of upgrading. + +### Vendored mode + +* `pip install versioneer` to somewhere in your $PATH + * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is + available, so you can also use `conda install -c conda-forge versioneer` +* add a `[tool.versioneer]` section to your `pyproject.toml` or a + `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) + * Note that you will need to add `tomli; python_version < "3.11"` to your + build-time dependencies if you use `pyproject.toml` +* run `versioneer install --vendor` in your source tree, commit the results +* verify version information with `python setup.py version` + +### Build-time dependency mode + +* `pip install versioneer` to somewhere in your $PATH + * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is + available, so you can also use `conda install -c conda-forge versioneer` +* add a `[tool.versioneer]` section to your `pyproject.toml` or a + `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) +* add `versioneer` (with `[toml]` extra, if configuring in `pyproject.toml`) + to the `requires` key of the `build-system` table in `pyproject.toml`: + ```toml + [build-system] + requires = ["setuptools", "versioneer[toml]"] + build-backend = "setuptools.build_meta" + ``` +* run `versioneer install --no-vendor` in your source tree, commit the results +* verify version information with `python setup.py version` ## Version Identifiers @@ -60,7 +85,7 @@ for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has -uncommitted changes. +uncommitted changes). The version identifier is used for multiple purposes: @@ -165,7 +190,7 @@ Some situations are known to cause problems for Versioneer. This details the most significant ones. More can be found on Github -[issues page](https://github.com/warner/python-versioneer/issues). 
+[issues page](https://github.com/python-versioneer/python-versioneer/issues). ### Subprojects @@ -179,7 +204,7 @@ `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also - provide bindings to Python (and perhaps other langauges) in subdirectories. + provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs @@ -193,9 +218,9 @@ Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. -[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking this issue. The discussion in -[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. [pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve @@ -223,31 +248,20 @@ cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. -[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. -### Unicode version strings - -While Versioneer works (and is continually tested) with both Python 2 and -Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. -Newer releases probably generate unicode version strings on py2. It's not -clear that this is wrong, but it may be surprising for applications when then -write these strings to a network connection or include them in bytes-oriented -APIs like cryptographic checksums. - -[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates -this question. - ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) -* edit `setup.cfg`, if necessary, to include any new configuration settings - indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. -* re-run `versioneer install` in your source tree, to replace +* edit `setup.cfg` and `pyproject.toml`, if necessary, + to include any new configuration settings indicated by the release notes. + See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install --[no-]vendor` in your source tree, to replace `SRC/_version.py` * commit any changed files @@ -264,36 +278,70 @@ direction and include code from all supported VCS systems, reducing the number of intermediate scripts. +## Similar projects + +* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time + dependency +* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of + versioneer +* [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools + plugin ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. 
-Specifically, both are released under the Creative Commons "Public Domain -Dedication" license (CC0-1.0), as described in -https://creativecommons.org/publicdomain/zero/1.0/ . +Specifically, both are released under the "Unlicense", as described in +https://unlicense.org/. -""" +[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg +[pypi-url]: https://pypi.python.org/pypi/versioneer/ +[travis-image]: +https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg +[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer -from __future__ import print_function +""" +# pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring +# pylint:disable=missing-class-docstring,too-many-branches,too-many-statements +# pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error +# pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with +# pylint:disable=attribute-defined-outside-init,too-many-arguments -try: - import configparser -except ImportError: - import ConfigParser as configparser +import configparser import errno import json import os import re import subprocess import sys +from pathlib import Path +from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union +from typing import NoReturn +import functools + +have_tomllib = True +if sys.version_info >= (3, 11): + import tomllib +else: + try: + import tomli as tomllib + except ImportError: + have_tomllib = False class VersioneerConfig: """Container for Versioneer configuration parameters.""" + VCS: str + style: str + tag_prefix: str + versionfile_source: str + versionfile_build: Optional[str] + parentdir_prefix: Optional[str] + verbose: Optional[bool] + -def get_root(): +def get_root() -> str: """Get the project root directory. We require that all commands are run from the project root, i.e. the @@ -301,13 +349,23 @@ def get_root(): """ root = os.path.realpath(os.path.abspath(os.getcwd())) setup_py = os.path.join(root, "setup.py") + pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + if not ( + os.path.exists(setup_py) + or os.path.exists(pyproject_toml) + or os.path.exists(versioneer_py) + ): # allow 'python path/to/setup.py COMMAND' root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) setup_py = os.path.join(root, "setup.py") + pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + if not ( + os.path.exists(setup_py) + or os.path.exists(pyproject_toml) + or os.path.exists(versioneer_py) + ): err = ( "Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " @@ -323,46 +381,64 @@ def get_root(): # module-import table will cache the first one. So we can't use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. 
- me = os.path.realpath(os.path.abspath(__file__)) - me_dir = os.path.normcase(os.path.splitext(me)[0]) + my_path = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(my_path)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) - if me_dir != vsr_dir: + if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals(): print( "Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py) + % (os.path.dirname(my_path), versioneer_py) ) except NameError: pass return root -def get_config_from_root(root): +def get_config_from_root(root: str) -> VersioneerConfig: """Read the project setup.cfg file to determine Versioneer config.""" - # This might raise EnvironmentError (if setup.cfg is missing), or + # This might raise OSError (if setup.cfg is missing), or # configparser.NoSectionError (if it lacks a [versioneer] section), or # configparser.NoOptionError (if it lacks "VCS="). See the docstring at # the top of versioneer.py for instructions on writing your setup.cfg . - setup_cfg = os.path.join(root, "setup.cfg") - parser = configparser.SafeConfigParser() - with open(setup_cfg, "r") as f: - parser.readfp(f) - VCS = parser.get("versioneer", "VCS") # mandatory - - def get(parser, name): - if parser.has_option("versioneer", name): - return parser.get("versioneer", name) - return None + root_pth = Path(root) + pyproject_toml = root_pth / "pyproject.toml" + setup_cfg = root_pth / "setup.cfg" + section: Union[Dict[str, Any], configparser.SectionProxy, None] = None + if pyproject_toml.exists() and have_tomllib: + try: + with open(pyproject_toml, "rb") as fobj: + pp = tomllib.load(fobj) + section = pp["tool"]["versioneer"] + except (tomllib.TOMLDecodeError, KeyError) as e: + print(f"Failed to load config from {pyproject_toml}: {e}") + print("Try to load it from setup.cfg") + if not section: + parser = configparser.ConfigParser() + with open(setup_cfg) as cfg_file: + parser.read_file(cfg_file) + parser.get("versioneer", "VCS") # raise error if missing + + section = parser["versioneer"] + + # `cast`` really shouldn't be used, but its simplest for the + # common VersioneerConfig users at the moment. 
We verify against + # `None` values elsewhere where it matters cfg = VersioneerConfig() - cfg.VCS = VCS - cfg.style = get(parser, "style") or "" - cfg.versionfile_source = get(parser, "versionfile_source") - cfg.versionfile_build = get(parser, "versionfile_build") - cfg.tag_prefix = get(parser, "tag_prefix") - if cfg.tag_prefix in ("''", '""'): + cfg.VCS = section["VCS"] + cfg.style = section.get("style", "") + cfg.versionfile_source = cast(str, section.get("versionfile_source")) + cfg.versionfile_build = section.get("versionfile_build") + cfg.tag_prefix = cast(str, section.get("tag_prefix")) + if cfg.tag_prefix in ("''", '""', None): cfg.tag_prefix = "" - cfg.parentdir_prefix = get(parser, "parentdir_prefix") - cfg.verbose = get(parser, "verbose") + cfg.parentdir_prefix = section.get("parentdir_prefix") + if isinstance(section, configparser.SectionProxy): + # Make sure configparser translates to bool + cfg.verbose = section.getboolean("verbose") + else: + cfg.verbose = section.get("verbose") + return cfg @@ -371,41 +447,54 @@ class NotThisMethod(Exception): # these dictionaries contain VCS-specific tools -LONG_VERSION_PY = {} -HANDLERS = {} +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator + """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f + HANDLERS.setdefault(vcs, {})[method] = f return f return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) - p = None - for c in commands: + process = None + + popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: try: - dispcmd = str([c] + args) + dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen( - [c] + args, + process = subprocess.Popen( + [command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, ) break - except EnvironmentError: - e = sys.exc_info()[1] + except OSError as e: if e.errno == errno.ENOENT: continue if verbose: @@ -416,28 +505,27 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= if verbose: print("unable to find command, tried %s" % (commands,)) return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode + return None, process.returncode + return stdout, process.returncode 
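Aside: the rewritten `run_command` above folds in three Python-3-only changes: `OSError` replaces `EnvironmentError`, the captured stdout is decoded unconditionally, and on Windows a `STARTUPINFO` flag keeps a console window from flashing up under pythonw.exe. A minimal standalone sketch of that last pattern (the helper name `quiet_run` is illustrative, not part of the diff):

```python
import subprocess
import sys

def quiet_run(args):
    """Run a command, hiding the transient console window on Windows."""
    popen_kwargs = {}
    if sys.platform == "win32":
        # STARTF_USESHOWWINDOW suppresses the console window that would
        # otherwise appear when the interpreter is pythonw.exe.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        popen_kwargs["startupinfo"] = startupinfo
    process = subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **popen_kwargs
    )
    stdout, _ = process.communicate()
    return stdout.strip().decode(), process.returncode
```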
LONG_VERSION_PY[ "git" -] = ''' +] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) +# This file is released into the public domain. +# Generated by versioneer-0.29 +# https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" @@ -446,9 +534,11 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= import re import subprocess import sys +from typing import Any, Callable, Dict, List, Optional, Tuple +import functools -def get_keywords(): +def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must @@ -464,8 +554,15 @@ def get_keywords(): class VersioneerConfig: """Container for Versioneer configuration parameters.""" + VCS: str + style: str + tag_prefix: str + parentdir_prefix: str + versionfile_source: str + verbose: bool + -def get_config(): +def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py @@ -483,13 +580,13 @@ class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" -LONG_VERSION_PY = {} -HANDLERS = {} +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} @@ -498,22 +595,35 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) - p = None - for c in commands: + process = None + + popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: try: - dispcmd = str([c] + args) + dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + process = subprocess.Popen([command] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None), **popen_kwargs) break - except EnvironmentError: - e = sys.exc_info()[1] + except OSError as 
e: if e.errno == errno.ENOENT: continue if verbose: @@ -524,18 +634,20 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) - return None, p.returncode - return stdout, p.returncode + return None, process.returncode + return stdout, process.returncode -def versions_from_parentdir(parentdir_prefix, root, verbose): +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both @@ -544,15 +656,14 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): """ rootdirs = [] - for i in range(3): + for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level + rootdirs.append(root) + root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% @@ -561,41 +672,48 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): @register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
- keywords = {} + keywords: Dict[str, str] = {} try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: pass return keywords @register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because @@ -608,11 +726,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) + refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %%d @@ -620,8 +738,8 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "". - tags = set([r for r in refs if re.search(r'\d', r)]) + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: @@ -630,6 +748,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r'\d', r): + continue if verbose: print("picking %%s" %% r) return {"version": r, @@ -645,7 +768,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): +def git_pieces_from_vcs( + tag_prefix: str, + root: str, + verbose: bool, + runner: Callable = run_command +) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* @@ -656,8 +784,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) @@ -665,24 +800,57 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%%s*" %% tag_prefix], - cwd=root) + describe_out, rc = runner(GITS, [ + "describe", "--tags", "--dirty", "--always", "--long", + "--match", f"{tag_prefix}[[:digit:]]*" + ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() - pieces = {} + pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. 
+ branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out @@ -699,7 +867,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: - # unparseable. Maybe git-describe is misbehaving? + # unparsable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces @@ -724,26 +892,27 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], - cwd=root)[0].strip() + date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" -def render_pep440(pieces): +def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you @@ -768,23 +937,71 @@ def render_pep440(pieces): return rendered -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. +def render_pep440_branch(pieces: Dict[str, Any]) -> str: + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). Exceptions: - 1: no tags. 0.post.devDISTANCE + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). 
+ """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces: Dict[str, Any]) -> str: + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: if pieces["distance"]: - rendered += ".post.dev%%d" %% pieces["distance"] + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%%d.dev%%d" %% (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%%d" %% (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] else: # exception #1 - rendered = "0.post.dev%%d" %% pieces["distance"] + rendered = "0.post0.dev%%d" %% pieces["distance"] return rendered -def render_pep440_post(pieces): +def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards @@ -811,12 +1028,41 @@ def render_pep440_post(pieces): return rendered -def render_pep440_old(pieces): +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: @@ -833,7 +1079,7 @@ def render_pep440_old(pieces): return rendered -def render_git_describe(pieces): +def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. @@ -853,7 +1099,7 @@ def render_git_describe(pieces): return rendered -def render_git_describe_long(pieces): +def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. 
@@ -873,7 +1119,7 @@ def render_git_describe_long(pieces): return rendered -def render(pieces, style): +def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", @@ -887,10 +1133,14 @@ def render(pieces, style): if style == "pep440": rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": @@ -905,7 +1155,7 @@ def render(pieces, style): "date": pieces.get("date")} -def get_versions(): +def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some @@ -926,7 +1176,7 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. - for i in cfg.versionfile_source.split('/'): + for _ in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, @@ -953,41 +1203,48 @@ def get_versions(): @register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. - keywords = {} + keywords: Dict[str, str] = {} try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: pass return keywords @register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: + # Use only the last line. 
Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because @@ -1000,11 +1257,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) + refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1012,8 +1269,8 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "". - tags = set([r for r in refs if re.search(r"\d", r)]) + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1022,6 +1279,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue if verbose: print("picking %s" % r) return { @@ -1044,7 +1306,9 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): +def git_pieces_from_vcs( + tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command +) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* @@ -1055,7 +1319,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. 
+ env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1063,7 +1334,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command( + describe_out, rc = runner( GITS, [ "describe", @@ -1072,7 +1343,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): "--always", "--long", "--match", - "%s*" % tag_prefix, + f"{tag_prefix}[[:digit:]]*", ], cwd=root, ) @@ -1080,16 +1351,48 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() - pieces = {} + pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out @@ -1106,7 +1409,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: - # unparseable. Maybe git-describe is misbehaving? + # unparsable. Maybe git-describe is misbehaving? 
pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces @@ -1132,19 +1435,20 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces -def do_vcs_install(manifest_in, versionfile_source, ipy): +def do_vcs_install(versionfile_source: str, ipy: Optional[str]) -> None: """Git-specific installation logic for Versioneer. For Git, this means creating/changing .gitattributes to mark _version.py @@ -1153,36 +1457,40 @@ def do_vcs_install(manifest_in, versionfile_source, ipy): GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - files = [manifest_in, versionfile_source] + files = [versionfile_source] if ipy: files.append(ipy) - try: - me = __file__ - if me.endswith(".pyc") or me.endswith(".pyo"): - me = os.path.splitext(me)[0] + ".py" - versioneer_file = os.path.relpath(me) - except NameError: - versioneer_file = "versioneer.py" - files.append(versioneer_file) + if "VERSIONEER_PEP518" not in globals(): + try: + my_path = __file__ + if my_path.endswith((".pyc", ".pyo")): + my_path = os.path.splitext(my_path)[0] + ".py" + versioneer_file = os.path.relpath(my_path) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) present = False try: - f = open(".gitattributes", "r") - for line in f.readlines(): - if line.strip().startswith(versionfile_source): - if "export-subst" in line.strip().split()[1:]: - present = True - f.close() - except EnvironmentError: + with open(".gitattributes", "r") as fobj: + for line in fobj: + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + break + except OSError: pass if not present: - f = open(".gitattributes", "a+") - f.write("%s export-subst\n" % versionfile_source) - f.close() + with open(".gitattributes", "a+") as fobj: + fobj.write(f"{versionfile_source} export-subst\n") files.append(".gitattributes") run_command(GITS, ["add", "--"] + files) -def versions_from_parentdir(parentdir_prefix, root, verbose): +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: """Try to determine the version from the parent directory name. 
Source tarballs conventionally unpack into a directory that includes both @@ -1191,7 +1499,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): """ rootdirs = [] - for i in range(3): + for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return { @@ -1201,9 +1509,8 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): "error": None, "date": None, } - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level + rootdirs.append(root) + root = os.path.dirname(root) # up a level if verbose: print( @@ -1214,7 +1521,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): SHORT_VERSION_PY = """ -# This file was generated by 'versioneer.py' (0.18) from +# This file was generated by 'versioneer.py' (0.29) from # revision-control system data, or from the parent directory name of an # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. @@ -1231,12 +1538,12 @@ def get_versions(): """ -def versions_from_file(filename): +def versions_from_file(filename: str) -> Dict[str, Any]: """Try to determine the version from _version.py if present.""" try: with open(filename) as f: contents = f.read() - except EnvironmentError: + except OSError: raise NotThisMethod("unable to read _version.py") mo = re.search( r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S @@ -1250,9 +1557,8 @@ def versions_from_file(filename): return json.loads(mo.group(1)) -def write_to_version_file(filename, versions): +def write_to_version_file(filename: str, versions: Dict[str, Any]) -> None: """Write the given version number to the given _version.py file.""" - os.unlink(filename) contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1260,14 +1566,14 @@ def write_to_version_file(filename, versions): print("set %s to '%s'" % (filename, versions["version"])) -def plus_or_dot(pieces): +def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" -def render_pep440(pieces): +def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you @@ -1291,23 +1597,70 @@ def render_pep440(pieces): return rendered -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. +def render_pep440_branch(pieces: Dict[str, Any]) -> str: + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). Exceptions: - 1: no tags. 0.post.devDISTANCE + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: + """Split pep440 version string at the post-release segment. 
+ + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces: Dict[str, Any]) -> str: + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] else: # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] + rendered = "0.post0.dev%d" % pieces["distance"] return rendered -def render_pep440_post(pieces): +def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards @@ -1334,12 +1687,41 @@ def render_pep440_post(pieces): return rendered -def render_pep440_old(pieces): +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: @@ -1356,7 +1738,7 @@ def render_pep440_old(pieces): return rendered -def render_git_describe(pieces): +def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. @@ -1376,7 +1758,7 @@ def render_git_describe(pieces): return rendered -def render_git_describe_long(pieces): +def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. 
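For orientation before the `render()` dispatch below: given a checkout two commits past tag `1.2` on a dirty feature branch, the styles (including the two new branch-aware ones) render as follows. The values are hand-derived from the `render_*` implementations above, so treat them as illustrative:

```python
# Only the fields the render_* functions actually consume are shown.
pieces = {
    "closest-tag": "1.2", "distance": 2, "dirty": True,
    "short": "abc1234", "branch": "feature-x", "error": None,
}

expected = {
    "pep440":             "1.2+2.gabc1234.dirty",
    "pep440-branch":      "1.2.dev0+2.gabc1234.dirty",  # .dev0 = not master
    "pep440-pre":         "1.2.post0.dev2",             # ignores dirty
    "pep440-post":        "1.2.post2.dev0+gabc1234",    # .dev0 = dirty
    "pep440-post-branch": "1.2.post2.dev0+gabc1234.dirty",
    "pep440-old":         "1.2.post2.dev0",
    "git-describe":       "1.2-2-gabc1234-dirty",
}
```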
@@ -1396,7 +1778,7 @@ def render_git_describe_long(pieces): return rendered -def render(pieces, style): +def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return { @@ -1412,10 +1794,14 @@ def render(pieces, style): if style == "pep440": rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": @@ -1438,7 +1824,7 @@ class VersioneerBadRootError(Exception): """The project root directory is unknown or missing key files.""" -def get_versions(verbose=False): +def get_versions(verbose: bool = False) -> Dict[str, Any]: """Get the project version from whatever source is available. Returns dict with two keys: 'version' and 'full'. @@ -1453,7 +1839,7 @@ def get_versions(verbose=False): assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS - verbose = verbose or cfg.verbose + verbose = verbose or bool(cfg.verbose) # `bool()` used to avoid `None` assert ( cfg.versionfile_source is not None ), "please set versioneer.versionfile_source" @@ -1519,13 +1905,17 @@ def get_versions(verbose=False): } -def get_version(): +def get_version() -> str: """Get the short version string for this project.""" return get_versions()["version"] -def get_cmdclass(): - """Get the custom setuptools/distutils subclasses used by Versioneer.""" +def get_cmdclass(cmdclass: Optional[Dict[str, Any]] = None): + """Get the custom setuptools subclasses used by Versioneer. + + If the package uses a different cmdclass (e.g. one from numpy), it + should be provide as an argument. + """ if "versioneer" in sys.modules: del sys.modules["versioneer"] # this fixes the "python setup.py develop" case (also 'install' and @@ -1539,25 +1929,25 @@ def get_cmdclass(): # parent is protected against the child's "import versioneer". By # removing ourselves from sys.modules here, before the child build # happens, we protect the child from the parent's versioneer too. 
- # Also see https://github.com/warner/python-versioneer/issues/52 + # Also see https://github.com/python-versioneer/python-versioneer/issues/52 - cmds = {} + cmds = {} if cmdclass is None else cmdclass.copy() - # we add "version" to both distutils and setuptools - from distutils.core import Command + # we add "version" to setuptools + from setuptools import Command class cmd_version(Command): description = "report generated version string" - user_options = [] - boolean_options = [] + user_options: List[Tuple[str, str, str]] = [] + boolean_options: List[str] = [] - def initialize_options(self): + def initialize_options(self) -> None: pass - def finalize_options(self): + def finalize_options(self) -> None: pass - def run(self): + def run(self) -> None: vers = get_versions(verbose=True) print("Version: %s" % vers["version"]) print(" full-revisionid: %s" % vers.get("full-revisionid")) @@ -1568,7 +1958,7 @@ def run(self): cmds["version"] = cmd_version - # we override "build_py" in both distutils and setuptools + # we override "build_py" in setuptools # # most invocation pathways end up running build_py: # distutils/build -> build_py @@ -1583,18 +1973,25 @@ def run(self): # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? + # pip install -e . and setuptool/editable_wheel will invoke build_py + # but the build_py command is not expected to copy any files. + # we override different "build_py" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.build_py import build_py as _build_py + if "build_py" in cmds: + _build_py: Any = cmds["build_py"] else: - from distutils.command.build_py import build_py as _build_py + from setuptools.command.build_py import build_py as _build_py class cmd_build_py(_build_py): - def run(self): + def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) + if getattr(self, "editable_mode", False): + # During editable installs `.py` and data files are + # not copied to build_lib + return # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: @@ -1604,8 +2001,42 @@ def run(self): cmds["build_py"] = cmd_build_py + if "build_ext" in cmds: + _build_ext: Any = cmds["build_ext"] + else: + from setuptools.command.build_ext import build_ext as _build_ext + + class cmd_build_ext(_build_ext): + def run(self) -> None: + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_ext.run(self) + if self.inplace: + # build_ext --inplace will only build extensions in + # build/lib<..> dir with no _version.py to write to. + # As in place builds will already have a _version.py + # in the module dir, we do not need to write one. + return + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if not cfg.versionfile_build: + return + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + if not os.path.exists(target_versionfile): + print( + f"Warning: {target_versionfile} does not exist, skipping " + "version update. This can happen if you are running build_ext " + "without first running build_py." + ) + return + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + cmds["build_ext"] = cmd_build_ext + if "cx_Freeze" in sys.modules: # cx_freeze enabled? 
- from cx_Freeze.dist import build_exe as _build_exe + from cx_Freeze.dist import build_exe as _build_exe # type: ignore # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. @@ -1615,7 +2046,7 @@ def run(self): # ... class cmd_build_exe(_build_exe): - def run(self): + def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() @@ -1643,12 +2074,12 @@ def run(self): if "py2exe" in sys.modules: # py2exe enabled? try: - from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + from py2exe.setuptools_buildexe import py2exe as _py2exe # type: ignore except ImportError: - from py2exe.build_exe import py2exe as _py2exe # py2 + from py2exe.distutils_buildexe import py2exe as _py2exe # type: ignore class cmd_py2exe(_py2exe): - def run(self): + def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() @@ -1673,14 +2104,54 @@ def run(self): cmds["py2exe"] = cmd_py2exe + # sdist farms its file list building out to egg_info + if "egg_info" in cmds: + _egg_info: Any = cmds["egg_info"] + else: + from setuptools.command.egg_info import egg_info as _egg_info + + class cmd_egg_info(_egg_info): + def find_sources(self) -> None: + # egg_info.find_sources builds the manifest list and writes it + # in one shot + super().find_sources() + + # Modify the filelist and normalize it + root = get_root() + cfg = get_config_from_root(root) + self.filelist.append("versioneer.py") + if cfg.versionfile_source: + # There are rare cases where versionfile_source might not be + # included by default, so we must be explicit + self.filelist.append(cfg.versionfile_source) + self.filelist.sort() + self.filelist.remove_duplicates() + + # The write method is hidden in the manifest_maker instance that + # generated the filelist and was thrown away + # We will instead replicate their final normalization (to unicode, + # and POSIX-style paths) + from setuptools import unicode_utils + + normalized = [ + unicode_utils.filesys_decode(f).replace(os.sep, "/") + for f in self.filelist.files + ] + + manifest_filename = os.path.join(self.egg_info, "SOURCES.txt") + with open(manifest_filename, "w") as fobj: + fobj.write("\n".join(normalized)) + + cmds["egg_info"] = cmd_egg_info + # we override different "sdist" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.sdist import sdist as _sdist + if "sdist" in cmds: + _sdist: Any = cmds["sdist"] else: - from distutils.command.sdist import sdist as _sdist + from setuptools.command.sdist import sdist as _sdist class cmd_sdist(_sdist): - def run(self): + def run(self) -> None: versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old @@ -1688,7 +2159,7 @@ def run(self): self.distribution.metadata.version = versions["version"] return _sdist.run(self) - def make_release_tree(self, base_dir, files): + def make_release_tree(self, base_dir: str, files: List[str]) -> None: root = get_root() cfg = get_config_from_root(root) _sdist.make_release_tree(self, base_dir, files) @@ -1743,24 +2214,25 @@ def make_release_tree(self, base_dir, files): """ -INIT_PY_SNIPPET = """ +OLD_SNIPPET = """ from ._version import get_versions __version__ = get_versions()['version'] del get_versions """ +INIT_PY_SNIPPET = """ +from . 
+__version__ = {0}.get_versions()['version']
+"""
+

-def do_setup():
-    """Main VCS-independent setup function for installing Versioneer."""
+def do_setup() -> int:
+    """Do main VCS-independent setup function for installing Versioneer."""
     root = get_root()
     try:
         cfg = get_config_from_root(root)
-    except (
-        EnvironmentError,
-        configparser.NoSectionError,
-        configparser.NoOptionError,
-    ) as e:
-        if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
+    except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e:
+        if isinstance(e, (OSError, configparser.NoSectionError)):
             print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
             with open(os.path.join(root, "setup.cfg"), "a") as f:
                 f.write(SAMPLE_CONFIG)
@@ -1782,64 +2254,37 @@ def do_setup():
     )

     ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
+    maybe_ipy: Optional[str] = ipy
     if os.path.exists(ipy):
         try:
             with open(ipy, "r") as f:
                 old = f.read()
-        except EnvironmentError:
+        except OSError:
             old = ""
-        if INIT_PY_SNIPPET not in old:
+        module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0]
+        snippet = INIT_PY_SNIPPET.format(module)
+        if OLD_SNIPPET in old:
+            print(" replacing boilerplate in %s" % ipy)
+            with open(ipy, "w") as f:
+                f.write(old.replace(OLD_SNIPPET, snippet))
+        elif snippet not in old:
             print(" appending to %s" % ipy)
             with open(ipy, "a") as f:
-                f.write(INIT_PY_SNIPPET)
+                f.write(snippet)
         else:
             print(" %s unmodified" % ipy)
     else:
         print(" %s doesn't exist, ok" % ipy)
-        ipy = None
-
-    # Make sure both the top-level "versioneer.py" and versionfile_source
-    # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so
-    # they'll be copied into source distributions. Pip won't be able to
-    # install the package without this.
-    manifest_in = os.path.join(root, "MANIFEST.in")
-    simple_includes = set()
-    try:
-        with open(manifest_in, "r") as f:
-            for line in f:
-                if line.startswith("include "):
-                    for include in line.split()[1:]:
-                        simple_includes.add(include)
-    except EnvironmentError:
-        pass
-    # That doesn't cover everything MANIFEST.in can do
-    # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
-    # it might give some false negatives. Appending redundant 'include'
-    # lines is safe, though.
-    if "versioneer.py" not in simple_includes:
-        print(" appending 'versioneer.py' to MANIFEST.in")
-        with open(manifest_in, "a") as f:
-            f.write("include versioneer.py\n")
-    else:
-        print(" 'versioneer.py' already in MANIFEST.in")
-    if cfg.versionfile_source not in simple_includes:
-        print(
-            " appending versionfile_source ('%s') to MANIFEST.in"
-            % cfg.versionfile_source
-        )
-        with open(manifest_in, "a") as f:
-            f.write("include %s\n" % cfg.versionfile_source)
-    else:
-        print(" versionfile_source already in MANIFEST.in")
+        maybe_ipy = None

     # Make VCS-specific changes. For git, this means creating/changing
     # .gitattributes to mark _version.py for export-subst keyword
     # substitution.
-    do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
+    do_vcs_install(cfg.versionfile_source, maybe_ipy)
     return 0


-def scan_setup_py():
+def scan_setup_py() -> int:
     """Validate the contents of setup.py against Versioneer's expectations."""
     found = set()
     setters = False
@@ -1876,10 +2321,14 @@ def scan_setup_py():
     return errors


+def setup_command() -> NoReturn:
+    """Set up Versioneer and exit with appropriate error code."""
+    errors = do_setup()
+    errors += scan_setup_py()
+    sys.exit(1 if errors else 0)
+
+
 if __name__ == "__main__":
     cmd = sys.argv[1]
     if cmd == "setup":
-        errors = do_setup()
-        errors += scan_setup_py()
-        if errors:
-            sys.exit(1)
+        setup_command()
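
The rewritten `get_cmdclass` above now starts from a caller-supplied `cmdclass` dict (`cmds = {} if cmdclass is None else cmdclass.copy()`) instead of always building a fresh one, which is what the new `if "build_py" in cmds` / `if "sdist" in cmds` branches rely on. A minimal sketch of how a downstream `setup.py` could chain its own command through this hook, assuming the updated `get_cmdclass(cmdclass=None)` signature shown in the diff; the `my_build_py` class is hypothetical, not part of this change:

    # setup.py -- hypothetical downstream project chaining a custom command
    from setuptools import setup
    from setuptools.command.build_py import build_py

    import versioneer  # the vendored file patched above


    class my_build_py(build_py):
        """Hypothetical project-specific build step."""

        def run(self):
            print("running project-specific build hook")
            super().run()


    setup(
        version=versioneer.get_version(),
        # get_cmdclass copies this dict and subclasses my_build_py, so both
        # the project hook and the _version.py rewrite run on build_py.
        cmdclass=versioneer.get_cmdclass({"build_py": my_build_py}),
    )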
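
The `OLD_SNIPPET`/`INIT_PY_SNIPPET` split lets `do_setup` replace the legacy boilerplate in a package's `__init__.py` in place rather than appending a second copy. Because `module` is derived from the basename of `cfg.versionfile_source`, a project whose version file is, say, `mypkg/_version.py` (a hypothetical layout used only for illustration) would get the new form rendered as:

    # What INIT_PY_SNIPPET.format("_version") writes into mypkg/__init__.py,
    # replacing the OLD_SNIPPET form when it is found:
    from . import _version
    __version__ = _version.get_versions()['version']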
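
Finally, the `__main__` block now funnels through `setup_command`, so `python versioneer.py setup` exits non-zero whenever either step reports a problem. A hypothetical programmatic equivalent, using the two functions defined above (assumes `versioneer.py` is importable from the project root):

    # Hypothetical driver mirroring `python versioneer.py setup`
    import sys

    import versioneer

    errors = versioneer.do_setup()        # 0 on success, 1 on config errors
    errors += versioneer.scan_setup_py()  # count of setup.py problems found
    sys.exit(1 if errors else 0)          # collapse any error count to exit 1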