From 81a45940e975caa460188f77d9e3c2166829a2ad Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Tue, 14 Mar 2023 13:20:15 +0100 Subject: [PATCH 1/2] Remove ML processes from 2.0.0 (#417) * Remove ML processes for 2.0.0 #416 --- CHANGELOG.md | 5 -- meta/subtype-schemas.json | 6 -- proposals/fit_class_random_forest.json | 110 ------------------------- proposals/fit_regr_random_forest.json | 110 ------------------------- proposals/load_ml_model.json | 53 ------------ proposals/predict_random_forest.json | 42 ---------- proposals/save_ml_model.json | 44 ---------- 7 files changed, 370 deletions(-) delete mode 100644 proposals/fit_class_random_forest.json delete mode 100644 proposals/fit_regr_random_forest.json delete mode 100644 proposals/load_ml_model.json delete mode 100644 proposals/predict_random_forest.json delete mode 100644 proposals/save_ml_model.json diff --git a/CHANGELOG.md b/CHANGELOG.md index e2165c06..fec780e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New processes in proposal state: - `date_difference` - `filter_vector` - - `fit_class_random_forest` - - `fit_regr_random_forest` - `flatten_dimensions` - - `load_ml_model` - - `predict_random_forest` - - `save_ml_model` - `unflatten_dimension` - `vector_buffer` - `vector_to_random_points` diff --git a/meta/subtype-schemas.json b/meta/subtype-schemas.json index 941e6a48..498adf60 100644 --- a/meta/subtype-schemas.json +++ b/meta/subtype-schemas.json @@ -238,12 +238,6 @@ } } }, - "ml-model": { - "type": "object", - "subtype": "ml-model", - "title": "Machine Learning Model", - "description": "A machine learning model, accompanied with STAC metadata that implements the the STAC ml-model extension." 
- }, "output-format": { "type": "string", "subtype": "output-format", diff --git a/proposals/fit_class_random_forest.json b/proposals/fit_class_random_forest.json deleted file mode 100644 index 6eb874bf..00000000 --- a/proposals/fit_class_random_forest.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "id": "fit_class_random_forest", - "summary": "Train a random forest classification model", - "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).", - "categories": [ - "machine learning" - ], - "experimental": true, - "parameters": [ - { - "name": "predictors", - "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.", - "schema": [ - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "bands" - } - ] - }, - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "other" - } - ] - } - ] - }, - { - "name": "target", - "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. 
fractional forest canopy cover).", - "schema": { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - } - ] - } - }, - { - "name": "max_variables", - "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. This is often the default for classification.", - "schema": [ - { - "type": "integer", - "minimum": 1 - }, - { - "type": "string", - "enum": [ - "all", - "log2", - "onethird", - "sqrt" - ] - } - ] - }, - { - "name": "num_trees", - "description": "The number of trees build within the Random Forest classification.", - "optional": true, - "default": 100, - "schema": { - "type": "integer", - "minimum": 1 - } - }, - { - "name": "seed", - "description": "A randomization seed to use for the random sampling in training. 
If not given or `null`, no seed is used and results may differ on subsequent use.", - "optional": true, - "default": null, - "schema": { - "type": [ - "integer", - "null" - ] - } - } - ], - "returns": { - "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", - "schema": { - "type": "object", - "subtype": "ml-model" - } - }, - "links": [ - { - "href": "https://doi.org/10.1023/A:1010933404324", - "title": "Breiman (2001): Random Forests", - "type": "text/html", - "rel": "about" - } - ] -} diff --git a/proposals/fit_regr_random_forest.json b/proposals/fit_regr_random_forest.json deleted file mode 100644 index 51191fa5..00000000 --- a/proposals/fit_regr_random_forest.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "id": "fit_regr_random_forest", - "summary": "Train a random forest regression model", - "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).", - "categories": [ - "machine learning" - ], - "experimental": true, - "parameters": [ - { - "name": "predictors", - "description": "The predictors for the regression model as a vector data cube. Aggregated to the features (vectors) of the target input variable.", - "schema": [ - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "bands" - } - ] - }, - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "other" - } - ] - } - ] - }, - { - "name": "target", - "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. 
fractional forest canopy cover).", - "schema": { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - } - ] - } - }, - { - "name": "max_variables", - "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.", - "schema": [ - { - "type": "integer", - "minimum": 1 - }, - { - "type": "string", - "enum": [ - "all", - "log2", - "onethird", - "sqrt" - ] - } - ] - }, - { - "name": "num_trees", - "description": "The number of trees build within the Random Forest regression.", - "optional": true, - "default": 100, - "schema": { - "type": "integer", - "minimum": 1 - } - }, - { - "name": "seed", - "description": "A randomization seed to use for the random sampling in training. 
If not given or `null`, no seed is used and results may differ on subsequent use.", - "optional": true, - "default": null, - "schema": { - "type": [ - "integer", - "null" - ] - } - } - ], - "returns": { - "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", - "schema": { - "type": "object", - "subtype": "ml-model" - } - }, - "links": [ - { - "href": "https://doi.org/10.1023/A:1010933404324", - "title": "Breiman (2001): Random Forests", - "type": "text/html", - "rel": "about" - } - ] -} diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json deleted file mode 100644 index 151513c8..00000000 --- a/proposals/load_ml_model.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "id": "load_ml_model", - "summary": "Load a ML model", - "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``fit_regr_random_forest()`` and ``save_ml_model()``.", - "categories": [ - "machine learning", - "import" - ], - "experimental": true, - "parameters": [ - { - "name": "id", - "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.", - "schema": [ - { - "title": "URL", - "type": "string", - "format": "uri", - "subtype": "uri", - "pattern": "^https?://" - }, - { - "title": "Batch Job ID", - "description": "Loading a model by batch job ID is possible only if a single model has been saved by the job. 
Otherwise, you have to load a specific model from a batch job by URL.", - "type": "string", - "subtype": "job-id", - "pattern": "^[\\w\\-\\.~]+$" - }, - { - "title": "User-uploaded File", - "type": "string", - "subtype": "file-path", - "pattern": "^[^\r\n\\:'\"]+$" - } - ] - } - ], - "returns": { - "description": "A machine learning model to be used with machine learning processes such as ``predict_random_forest()``.", - "schema": { - "type": "object", - "subtype": "ml-model" - } - }, - "links": [ - { - "href": "https://github.com/stac-extensions/ml-model", - "title": "STAC ml-model extension", - "type": "text/html", - "rel": "about" - } - ] -} diff --git a/proposals/predict_random_forest.json b/proposals/predict_random_forest.json deleted file mode 100644 index 62c54e9f..00000000 --- a/proposals/predict_random_forest.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "id": "predict_random_forest", - "summary": "Predict values based on a Random Forest model", - "description": "Applies a Random Forest machine learning model to an array and predict a value for it.", - "categories": [ - "machine learning", - "reducer" - ], - "experimental": true, - "parameters": [ - { - "name": "data", - "description": "An array of numbers.", - "schema": { - "type": "array", - "items": { - "type": [ - "number", - "null" - ] - } - } - }, - { - "name": "model", - "description": "A model object that can be trained with the processes ``fit_regr_random_forest()`` (regression) and ``fit_class_random_forest()`` (classification).", - "schema": { - "type": "object", - "subtype": "ml-model" - } - } - ], - "returns": { - "description": "The predicted value. 
Returns `null` if any of the given values in the array is a no-data value.", - "schema": { - "type": [ - "number", - "null" - ] - } - } -} diff --git a/proposals/save_ml_model.json b/proposals/save_ml_model.json deleted file mode 100644 index 5e9ea8b0..00000000 --- a/proposals/save_ml_model.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "id": "save_ml_model", - "summary": "Save a ML model", - "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).", - "categories": [ - "machine learning", - "import" - ], - "experimental": true, - "parameters": [ - { - "name": "data", - "description": "The data to store as a machine learning model.", - "schema": { - "type": "object", - "subtype": "ml-model" - } - }, - { - "name": "options", - "description": "Additional parameters to create the file(s).", - "schema": { - "type": "object", - "additionalParameters": false - }, - "default": {}, - "optional": true - } - ], - "returns": { - "description": "Returns `false` if the process failed to store the model, `true` otherwise.", - "schema": { - "type": "boolean" - } - }, - "links": [ - { - "href": "https://github.com/stac-extensions/ml-model", - "title": "STAC ml-model extension", - "type": "text/html", - "rel": "about" - } - ] -} \ No newline at end of file From 08cb18d9fcea4b2987632788901235cb5e506e42 Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Tue, 14 Mar 2023 13:51:11 +0100 Subject: [PATCH 2/2] Implicit resampling for spatial dimensions in mask and merge_cubes + clarifications (#405) * `mask` and `merge_cubes`: The spatial dimensions `x` and `y` can now be resampled implicitly instead of throwing an error. 
#402 * Clarify descriptions #379 * Improve wording as suggested by @soxofaan * Update merge_cubes.json * Slim down description * Default parameters of resample_cube_spatial apply --- CHANGELOG.md | 2 ++ mask.json | 2 +- merge_cubes.json | 10 +++++----- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fec780e3..3d74a5d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Require at least one band if not set to `null`. [#372](https://github.com/Open-EO/openeo-processes/issues/372) - Added a `NoDataAvailable` exception - `inspect`: The parameter `message` has been moved to be the second argument. [#369](https://github.com/Open-EO/openeo-processes/issues/369) +- `mask` and `merge_cubes`: The spatial dimensions `x` and `y` can now be resampled implicitly instead of throwing an error. [#402](https://github.com/Open-EO/openeo-processes/issues/402) - `save_result`: Added a more concrete `DataCubeEmpty` exception. - The comparison processes `eq`, `neq`, `lt`, `lte`, `gt`, `gte` don't support temporal comparison any longer. Instead explicitly use `date_difference`. - New definition for `aggregate_spatial`: @@ -64,6 +65,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `array_interpolate_linear`: Return value was incorrectly specified as `number` or `null`. It must return an array instead. [#333](https://github.com/Open-EO/openeo-processes/issues/333) - `is_nan`: Fixed a wrong description of the return value and simplified/clarified the process descriptions overall. [#360](https://github.com/Open-EO/openeo-processes/issues/360) - `is_nodata`: Clarified that `NaN` can be considered as a no-data value only if it is explicitly specified as no-data value. [#361](https://github.com/Open-EO/openeo-processes/issues/361) +- `merge_cubes`: Clarified descriptions to better describe when a merge is possible. 
[#379](https://github.com/Open-EO/openeo-processes/issues/379) - `rename_labels`: Clarified that the `LabelsNotEnumerated` exception is thrown if `source` is empty instead of if `target` is empty. [#321](https://github.com/Open-EO/openeo-processes/issues/321) - `round`: Clarify that the rounding for ties applies not only for integers. [#326](https://github.com/Open-EO/openeo-processes/issues/326) diff --git a/mask.json b/mask.json index d5940b25..0381d220 100644 --- a/mask.json +++ b/mask.json @@ -1,7 +1,7 @@ { "id": "mask", "summary": "Apply a raster mask", - "description": "Applies a mask to a raster data cube. To apply a polygon as a mask, use ``mask_polygon()``.\n\nA mask is a raster data cube for which corresponding pixels among `data` and `mask` are compared and those pixels in `data` are replaced whose pixels in `mask` are non-zero (for numbers) or `true` (for boolean values). The pixel values are replaced with the value specified for `replacement`, which defaults to `null` (no data).\n\nThe data cubes have to be compatible so that each dimension in the mask must also be available in the raster data cube with the same name, type, reference system, resolution and labels. Dimensions can be missing in the mask with the result that the mask is applied to each label of the dimension in `data` that is missing in the data cube of the mask. The process fails if there's an incompatibility found between the raster data cube and the mask.", + "description": "Applies a mask to a raster data cube. To apply a polygon as a mask, use ``mask_polygon()``.\n\nA mask is a raster data cube for which corresponding pixels among `data` and `mask` are compared and those pixels in `data` are replaced whose pixels in `mask` are non-zero (for numbers) or `true` (for boolean values). 
The pixel values are replaced with the value specified for `replacement`, which defaults to `null` (no data).\n\nThe data cubes have to be compatible except that the horizontal spatial dimensions (axes `x` and `y`) will be aligned implicitly by ``resample_cube_spatial()``. `data` is the target data cube for resampling and the default parameters of ``resample_cube_spatial()`` apply. All other dimensions in the mask must also be available in the raster data cube with the same name, type, reference system, resolution and labels. Dimensions can be missing in the mask with the result that the mask is applied to each label of the dimension in `data` that is missing in the data cube of the mask. The process fails if there's an incompatibility found between the raster data cube and the mask.", "categories": [ "cubes", "masks" diff --git a/merge_cubes.json b/merge_cubes.json index e41d5f2e..c22421c2 100644 --- a/merge_cubes.json +++ b/merge_cubes.json @@ -1,14 +1,14 @@ { "id": "merge_cubes", "summary": "Merge two data cubes", - "description": "The process performs the join on overlapping dimensions. The data cubes have to be compatible. A merge operation without overlap should be reversible with (a set of) filter operations for each of the two cubes. As such it is not possible to merge a vector and a raster data cube. It is also not possible to merge vector data cubes that contain different base geometry types (points, lines/line strings, polygons). The base geometry types can be merged with their corresponding multi geometry types. In case of such a conflict, the `IncompatibleGeometryTypes` exception is thrown.\n\nOverlapping dimensions have the same name, type, reference system and resolution, but can have different labels. One of the dimensions can have different labels, for all other dimensions the labels must be equal. Equality for geometries follows the definition in the Simple Features standard by the OGC. 
If data overlaps, the parameter `overlap_resolver` must be specified to resolve the overlap.\n\n**Examples for merging two data cubes:**\n\n1. Data cubes with the dimensions (`x`, `y`, `t`, `bands`) have the same dimension labels in `x`, `y` and `t`, but the labels for the dimension `bands` are `B1` and `B2` for the first cube and `B3` and `B4`. An overlap resolver is *not needed*. The merged data cube has the dimensions `x`, `y`, `t` and `bands` and the dimension `bands` has four dimension labels: `B1`, `B2`, `B3`, `B4`.\n2. Data cubes with the dimensions (`x`, `y`, `t`, `bands`) have the same dimension labels in `x`, `y` and `t`, but the labels for the dimension `bands` are `B1` and `B2` for the first data cube and `B2` and `B3` for the second. An overlap resolver is *required* to resolve overlap in band `B2`. The merged data cube has the dimensions `x`, `y`, `t` and `bands` and the dimension `bands` has three dimension labels: `B1`, `B2`, `B3`.\n3. Data cubes with the dimensions (`x`, `y`, `t`) have the same dimension labels in `x`, `y` and `t`. There are two options:\n 1. Keep the overlapping values separately in the merged data cube: An overlap resolver is *not needed*, but for each data cube you need to add a new dimension using ``add_dimension()``. The new dimensions must be equal, except that the labels for the new dimensions must differ by name. The merged data cube has the same dimensions and labels as the original data cubes, plus the dimension added with ``add_dimension()``, which has the two dimension labels after the merge.\n 2. Combine the overlapping values into a single value: An overlap resolver is *required* to resolve the overlap for all values. The merged data cube has the same dimensions and labels as the original data cubes, but all values have been processed by the overlap resolver.\n4. 
A data cube with dimensions (`x`, `y`, `t` / `bands`) or (`x`, `y`, `t`, `bands`) and another data cube with dimensions (`x`, `y`) have the same dimension labels in `x` and `y`. Merging them will join dimensions `x` and `y`, so the lower dimension cube is merged with each time step and band available in the higher dimensional cube. This can for instance be used to apply a digital elevation model to a spatio-temporal data cube. An overlap resolver is *required* to resolve the overlap for all pixels.\n\nAfter the merge, the dimensions with a natural/inherent label order (with a reference system this is each spatial and temporal dimensions) still have all dimension labels sorted. For other dimensions where there is no inherent order, including bands, the dimension labels keep the order in which they are present in the original data cubes and the dimension labels of `cube2` are appended to the dimension labels of `cube1`.", + "description": "The process merges two 'compatible' data cubes.\n\nThe data cubes have to be compatible, which means that they must share a common subset of equal dimensions. To conveniently get to such a subset of equal dimensions, the process tries to align the horizontal spatial dimensions (axes `x` and `y`) implicitly with ``resample_cube_spatial()`` if required. `cube1` is the target data cube for resampling and the default parameters of ``resample_cube_spatial()`` apply. The equality for geometries follows the definition in the Simple Features standard by the OGC.\n\nAll dimensions share the same properties, such as name, type, reference system, and resolution. Dimensions can have disjoint or overlapping labels. If there is any overlap between the dimension labels, the parameter `overlap_resolver` must be specified to combine the two values for these overlapping labels. 
A merge operation without overlap should be reversible with (a set of) filter operations for each of the two cubes, if no implicit resampling was applied.\n\nIt is not possible to merge a vector and a raster data cube. Merging vector data cubes with different base geometry types (points, lines/line strings, polygons) is not possible and throws the `IncompatibleGeometryTypes` exception. The base geometry types can be merged with their corresponding multi geometry types.\n\nAfter the merge, the dimensions with a natural/inherent label order (with a reference system, this is each spatial and temporal dimension) still have all dimension labels sorted. For other dimensions without inherent order, including bands, the dimension labels keep the order in which they are present in the original data cubes, and the dimension labels of `cube2` get appended to the dimension labels of `cube1`.\n\n**Examples for merging two data cubes:**\n\n1. Data cubes with the dimensions (`x`, `y`, `t`, `bands`) have the same dimension labels in `x`, `y` and `t`, but the labels for the dimension `bands` are `B1` and `B2` for the base data cube and `B3` and `B4` for the other. An overlap resolver is *not needed*. The merged data cube has the dimensions `x`, `y`, `t`, `bands`, and the dimension `bands` has four dimension labels: `B1`, `B2`, `B3`, `B4`.\n2. Data cubes with the dimensions (`x`, `y`, `t`, `bands`) have the same dimension labels in `x`, `y` and `t`, but the labels for the dimension `bands` are `B1` and `B2` for the base data cube and `B2` and `B3` for the other. An overlap resolver is *required* to resolve overlap in band `B2`. The merged data cube has the dimensions `x`, `y`, `t` and `bands` and the dimension `bands` has three dimension labels: `B1`, `B2`, `B3`.\n3. Data cubes with the dimensions (`x`, `y`, `t`) have the same dimension labels in `x`, `y` and `t`. There are two options:\n 1. 
Keep the overlapping values separately in the merged data cube: An overlap resolver is *not needed*, but for each data cube you need to add a new dimension using ``add_dimension()``. The new dimensions must be equal, except that the labels for the new dimensions must differ. The merged data cube has the same dimensions and labels as the original data cubes, plus the dimension added with ``add_dimension()``, which has the two dimension labels after the merge.\n 2. Combine the overlapping values into a single value: An overlap resolver is *required* to resolve the overlap for all values. The merged data cube has the same dimensions and labels as the original data cubes, but all values have been processed by the overlap resolver.\n4. A data cube with dimensions (`x`, `y`, `t` / `bands`) or (`x`, `y`, `t`, `bands`) and another data cube with dimensions (`x`, `y`) have the same dimension labels in `x` and `y`. Merging them will join dimensions `x` and `y`, so the lower dimensional cube is merged with each time step and band available in the higher dimensional cube. A use case for this is applying a digital elevation model to a spatio-temporal data cube. An overlap resolver is *required* to resolve the overlap for all pixels.", "categories": [ "cubes" ], "parameters": [ { "name": "cube1", - "description": "The first data cube.", + "description": "The base data cube.", "schema": { "type": "object", "subtype": "datacube" @@ -16,7 +16,7 @@ }, { "name": "cube2", - "description": "The second data cube.", + "description": "The other data cube to be merged with the base data cube.", "schema": { "type": "object", "subtype": "datacube" @@ -31,14 +31,14 @@ "parameters": [ { "name": "x", - "description": "The overlapping value from the first data cube `cube1`.", + "description": "The overlapping value from the base data cube `cube1`.", "schema": { "description": "Any data type." 
} }, { "name": "y", - "description": "The overlapping value from the second data cube `cube2`.", + "description": "The overlapping value from the other data cube `cube2`.", "schema": { "description": "Any data type." }