From a49493d10eb79c1d51a92612d0497ab991d55d50 Mon Sep 17 00:00:00 2001 From: Emile Sonneveld Date: Wed, 2 Oct 2024 16:16:35 +0200 Subject: [PATCH 1/5] Allow creating load_stac datacube without connection object. https://github.com/Open-EO/openeo-python-client/issues/638 --- openeo/rest/connection.py | 29 ++++++++--------------------- openeo/rest/datacube.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/openeo/rest/connection.py b/openeo/rest/connection.py index 3e132fc3f..25ba304ac 100644 --- a/openeo/rest/connection.py +++ b/openeo/rest/connection.py @@ -44,7 +44,6 @@ CollectionMetadata, SpatialDimension, TemporalDimension, - metadata_from_stac, ) from openeo.rest import ( DEFAULT_DOWNLOAD_CHUNK_SIZE, @@ -1415,26 +1414,14 @@ def load_stac( Argument ``temporal_extent``: add support for year/month shorthand notation as discussed at :ref:`date-shorthand-handling`. """ - # TODO #425 move this implementation to `DataCube` and just forward here (like with `load_collection`) - # TODO #425 detect actual metadata from URL - arguments = {"url": url} - # TODO #425 more normalization/validation of extent/band parameters - if spatial_extent: - arguments["spatial_extent"] = spatial_extent - if temporal_extent: - arguments["temporal_extent"] = DataCube._get_temporal_extent(extent=temporal_extent) - if bands: - arguments["bands"] = bands - if properties: - arguments["properties"] = { - prop: build_child_callback(pred, parent_parameters=["value"]) for prop, pred in properties.items() - } - cube = self.datacube_from_process(process_id="load_stac", **arguments) - try: - cube.metadata = metadata_from_stac(url) - except Exception: - _log.warning(f"Failed to extract cube metadata from STAC URL {url}", exc_info=True) - return cube + return DataCube.load_stac( + url=url, + spatial_extent=spatial_extent, + temporal_extent=temporal_extent, + bands=bands, + properties=properties, + connection=self, + ) def load_stac_from_job( self, diff 
--git a/openeo/rest/datacube.py b/openeo/rest/datacube.py index 430a8785d..c7bbf0b8f 100644 --- a/openeo/rest/datacube.py +++ b/openeo/rest/datacube.py @@ -40,6 +40,7 @@ CollectionMetadata, SpatialDimension, TemporalDimension, + metadata_from_stac, ) from openeo.processes import ProcessBuilder from openeo.rest import BandMathException, OpenEoClientException, OperatorException @@ -259,6 +260,36 @@ def load_disk_collection(cls, connection: Connection, file_format: str, glob_pat ) return cls(graph=pg, connection=connection) + @classmethod + def load_stac( + cls, + url: str, + spatial_extent: Union[Dict[str, float], Parameter, None] = None, + temporal_extent: Union[Sequence[InputDate], Parameter, str, None] = None, + bands: Optional[List[str]] = None, + properties: Optional[Dict[str, Union[str, PGNode, Callable]]] = None, + connection: Connection = None, + ) -> DataCube: + arguments = {"url": url} + # TODO #425 more normalization/validation of extent/band parameters + if spatial_extent: + arguments["spatial_extent"] = spatial_extent + if temporal_extent: + arguments["temporal_extent"] = DataCube._get_temporal_extent(extent=temporal_extent) + if bands: + arguments["bands"] = bands + if properties: + arguments["properties"] = { + prop: build_child_callback(pred, parent_parameters=["value"]) for prop, pred in properties.items() + } + graph = PGNode("load_stac", arguments=arguments) + try: + metadata = metadata_from_stac(url) + except Exception: + log.warning(f"Failed to extract cube metadata from STAC URL {url}", exc_info=True) + metadata = None + return cls(graph=graph, connection=connection, metadata=metadata) + @classmethod def _get_temporal_extent( cls, From 33df2b953356d1c14542894d3ff8bd712898dc95 Mon Sep 17 00:00:00 2001 From: Emile Sonneveld Date: Wed, 2 Oct 2024 16:23:29 +0200 Subject: [PATCH 2/5] Add test. 
--- tests/rest/test_connection.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/rest/test_connection.py b/tests/rest/test_connection.py index 78e8679e7..0bf2681be 100644 --- a/tests/rest/test_connection.py +++ b/tests/rest/test_connection.py @@ -2443,6 +2443,16 @@ def test_basic(self, con120): } } + def test_basic_connectionless(self, con120): + cube = openeo.DataCube.load_stac("https://provider.test/dataset") + assert cube.flat_graph() == { + "loadstac1": { + "process_id": "load_stac", + "arguments": {"url": "https://provider.test/dataset"}, + "result": True, + } + } + def test_extents(self, con120): cube = con120.load_stac( "https://provider.test/dataset", From 33e490acd1c1783b5c6305a7df6e6df2b2c6a494 Mon Sep 17 00:00:00 2001 From: Emile Sonneveld Date: Wed, 2 Oct 2024 17:02:21 +0200 Subject: [PATCH 3/5] Fixes for merge request. Copied documentation for load_stac. --- CHANGELOG.md | 2 + openeo/rest/datacube.py | 103 ++++++++++++++++++++++++++- tests/rest/datacube/test_datacube.py | 13 ++++ tests/rest/test_connection.py | 10 --- 4 files changed, 115 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52c694347..64c8dfbaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `DataCube.load_stac()` to create a datacube without needing an openeo connection. + ### Changed ### Removed diff --git a/openeo/rest/datacube.py b/openeo/rest/datacube.py index c7bbf0b8f..7be11cc3c 100644 --- a/openeo/rest/datacube.py +++ b/openeo/rest/datacube.py @@ -85,7 +85,7 @@ class DataCube(_ProcessGraphAbstraction): # TODO: set this based on back-end or user preference? 
_DEFAULT_RASTER_FORMAT = "GTiff" - def __init__(self, graph: PGNode, connection: Connection, metadata: Optional[CollectionMetadata] = None): + def __init__(self, graph: PGNode, connection: Optional[Connection], metadata: Optional[CollectionMetadata] = None): super().__init__(pgnode=graph, connection=connection) self.metadata: Optional[CollectionMetadata] = metadata @@ -138,7 +138,7 @@ def _assert_valid_dimension_name(self, name: str) -> str: def load_collection( cls, collection_id: Union[str, Parameter], - connection: Connection = None, + connection: Optional[Connection] = None, spatial_extent: Union[Dict[str, float], Parameter, None] = None, temporal_extent: Union[Sequence[InputDate], Parameter, str, None] = None, bands: Union[None, List[str], Parameter] = None, @@ -268,8 +268,105 @@ def load_stac( temporal_extent: Union[Sequence[InputDate], Parameter, str, None] = None, bands: Optional[List[str]] = None, properties: Optional[Dict[str, Union[str, PGNode, Callable]]] = None, - connection: Connection = None, + connection: Optional[Connection] = None, ) -> DataCube: + """ + Loads data from a static STAC catalog or a STAC API Collection and returns the data as a processable :py:class:`DataCube`. + A batch job result can be loaded by providing a reference to it. + + If supported by the underlying metadata and file format, the data that is added to the data cube can be + restricted with the parameters ``spatial_extent``, ``temporal_extent`` and ``bands``. + If no data is available for the given extents, a ``NoDataAvailable`` error is thrown. + + Remarks: + + * The bands (and all dimensions that specify nominal dimension labels) are expected to be ordered as + specified in the metadata if the ``bands`` parameter is set to ``None``. + * If no additional parameter is specified this would imply that the whole data set is expected to be loaded. 
+ Due to the large size of many data sets, this is not recommended and may be optimized by back-ends to only + load the data that is actually required after evaluating subsequent processes such as filters. + This means that the values should be processed only after the data has been limited to the required extent + and as a consequence also to a manageable size. + + + :param url: The URL to a static STAC catalog (STAC Item, STAC Collection, or STAC Catalog) + or a specific STAC API Collection that allows filtering items and downloading assets. + This includes batch job results, which are themselves STAC-compliant. + For external URLs, authentication details such as API keys or tokens may need to be included in the URL. + + Batch job results can be specified in two ways: + + - For batch job results at the same back-end, a URL pointing to the corresponding batch job results + endpoint should be provided. The URL usually ends with ``/jobs/{id}/results`` and ``{id}`` + is the corresponding batch job ID. + - For external results, a signed URL must be provided. Not all back-ends support signed URLs, + which are provided as a link with the link relation ``canonical`` in the batch job result metadata. + :param spatial_extent: + Limits the data to load to the specified bounding box or polygons. + + For raster data, the process loads the pixel into the data cube if the point at the pixel center intersects + with the bounding box or any of the polygons (as defined in the Simple Features standard by the OGC). + + For vector data, the process loads the geometry into the data cube if the geometry is fully within the + bounding box or any of the polygons (as defined in the Simple Features standard by the OGC). + Empty geometries may only be in the data cube if no spatial extent has been provided. 
+ + The GeoJSON can be one of the following feature types: + + * A ``Polygon`` or ``MultiPolygon`` geometry, + * a ``Feature`` with a ``Polygon`` or ``MultiPolygon`` geometry, or + * a ``FeatureCollection`` containing at least one ``Feature`` with ``Polygon`` or ``MultiPolygon`` geometries. + + Set this parameter to ``None`` to set no limit for the spatial extent. + Be careful with this when loading large datasets. It is recommended to use this parameter instead of + using ``filter_bbox()`` or ``filter_spatial()`` directly after loading unbounded data. + + :param temporal_extent: + Limits the data to load to the specified left-closed temporal interval. + Applies to all temporal dimensions. + The interval has to be specified as an array with exactly two elements: + + 1. The first element is the start of the temporal interval. + The specified instant in time is **included** in the interval. + 2. The second element is the end of the temporal interval. + The specified instant in time is **excluded** from the interval. + + The second element must always be greater/later than the first element. + Otherwise, a ``TemporalExtentEmpty`` exception is thrown. + + Also supports open intervals by setting one of the boundaries to ``None``, but never both. + + Set this parameter to ``None`` to set no limit for the temporal extent. + Be careful with this when loading large datasets. It is recommended to use this parameter instead of + using ``filter_temporal()`` directly after loading unbounded data. + + :param bands: + Only adds the specified bands into the data cube so that bands that don't match the list + of band names are not available. Applies to all dimensions of type ``bands``. + + Either the unique band name (metadata field ``name`` in bands) or one of the common band names + (metadata field ``common_name`` in bands) can be specified. + If the unique band name and the common name conflict, the unique band name has a higher priority. 
+ + The order of the specified array defines the order of the bands in the data cube. + If multiple bands match a common name, all matched bands are included in the original order. + + It is recommended to use this parameter instead of using ``filter_bands()`` directly after loading unbounded data. + + :param properties: + Limits the data by metadata properties to include only data in the data cube for which + all given conditions return ``True`` (AND operation). + + Specify key-value-pairs with the key being the name of the metadata property, + which can be retrieved with the openEO Data Discovery for Collections. + The value must be a condition (user-defined process) to be evaluated against a STAC API. + This parameter is not supported for static STAC. + + :param connection: The connection to use to connect with the back-end (optional). + + .. versionadded:: 0.33.0 + + """ arguments = {"url": url} # TODO #425 more normalization/validation of extent/band parameters if spatial_extent: diff --git a/tests/rest/datacube/test_datacube.py b/tests/rest/datacube/test_datacube.py index 46a7f6d3d..1b746beb1 100644 --- a/tests/rest/datacube/test_datacube.py +++ b/tests/rest/datacube/test_datacube.py @@ -102,6 +102,19 @@ def test_filter_temporal_basic_extent(s2cube): assert graph['arguments']['extent'] == ["2016-01-01", "2016-03-10"] +def test_load_stac_connectionless(connection): + expected_graph = { + "loadstac1": { + "process_id": "load_stac", + "arguments": {"url": "https://provider.test/dataset"}, + "result": True, + } + } + cube = DataCube.load_stac("https://provider.test/dataset") + assert cube.flat_graph() == expected_graph + cube2 = connection.load_stac("https://provider.test/dataset") + assert cube2.flat_graph() == expected_graph + @pytest.mark.parametrize( "args,kwargs,extent", [ diff --git a/tests/rest/test_connection.py b/tests/rest/test_connection.py index 0bf2681be..78e8679e7 100644 --- a/tests/rest/test_connection.py +++ b/tests/rest/test_connection.py @@ -2443,16 +2443,6 
@@ def test_basic(self, con120): } } - def test_basic_connectionless(self, con120): - cube = openeo.DataCube.load_stac("https://provider.test/dataset") - assert cube.flat_graph() == { - "loadstac1": { - "process_id": "load_stac", - "arguments": {"url": "https://provider.test/dataset"}, - "result": True, - } - } - def test_extents(self, con120): cube = con120.load_stac( "https://provider.test/dataset", From 8ddf73e8fa95063992f38ca20d22ee93dd211612 Mon Sep 17 00:00:00 2001 From: Emile Date: Wed, 2 Oct 2024 17:41:09 +0200 Subject: [PATCH 4/5] Update CHANGELOG.md Co-authored-by: Stefaan Lippens --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 64c8dfbaf..dadd889db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added `DataCube.load_stac()` to create a datacube without needing an openeo connection. +- Added `DataCube.load_stac()` to also support creating a `load_stac` based cube without a connection ([#638](https://github.com/Open-EO/openeo-python-client/issues/638)) ### Changed From 8b8c072988aaa8884932b40de7eb498d6bbd45e3 Mon Sep 17 00:00:00 2001 From: Emile Sonneveld Date: Wed, 2 Oct 2024 17:44:04 +0200 Subject: [PATCH 5/5] Move test to new class. 
--- tests/rest/datacube/test_datacube.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/rest/datacube/test_datacube.py b/tests/rest/datacube/test_datacube.py index 32f9899f3..6b0da6e8b 100644 --- a/tests/rest/datacube/test_datacube.py +++ b/tests/rest/datacube/test_datacube.py @@ -83,6 +83,19 @@ def _get_leaf_node(cube, force_flat=True) -> dict: class TestDataCube: + def test_load_stac_connectionless(self, connection): + expected_graph = { + "loadstac1": { + "process_id": "load_stac", + "arguments": {"url": "https://provider.test/dataset"}, + "result": True, + } + } + cube = DataCube.load_stac("https://provider.test/dataset") + assert cube.flat_graph() == expected_graph + cube2 = connection.load_stac("https://provider.test/dataset") + assert cube2.flat_graph() == expected_graph + def test_load_collection_connectionless_basic(self): cube = DataCube.load_collection("T3") assert cube.flat_graph() == { @@ -157,19 +170,6 @@ def test_filter_temporal_basic_extent(s2cube): assert graph['arguments']['extent'] == ["2016-01-01", "2016-03-10"] -def test_load_stac_connectionless(connection): - expected_graph = { - "loadstac1": { - "process_id": "load_stac", - "arguments": {"url": "https://provider.test/dataset"}, - "result": True, - } - } - cube = DataCube.load_stac("https://provider.test/dataset") - assert cube.flat_graph() == expected_graph - cube2 = connection.load_stac("https://provider.test/dataset") - assert cube2.flat_graph() == expected_graph - @pytest.mark.parametrize( "args,kwargs,extent", [