diff --git a/openeo_driver/ProcessGraphDeserializer.py b/openeo_driver/ProcessGraphDeserializer.py
index 6a173a93..1d1224f0 100644
--- a/openeo_driver/ProcessGraphDeserializer.py
+++ b/openeo_driver/ProcessGraphDeserializer.py
@@ -491,7 +491,6 @@ def extract_arg(args: ProcessArgs, name: str, process_id="n/a"):
     return _as_process_args(args, process_id=process_id).get_required(name=name)
 
 
-
 def _align_extent(extent,collection_id,env):
     metadata = None
     try:
@@ -939,7 +938,7 @@ def apply_polygon(args: ProcessArgs, env: EvalEnv) -> DriverDataCube:
 
 @process_registry_100.add_function(spec=read_spec("openeo-processes/experimental/fit_class_random_forest.json"))
 @process_registry_2xx.add_function(spec=read_spec("openeo-processes/experimental/fit_class_random_forest.json"))
-def fit_class_random_forest(args: dict, env: EvalEnv) -> DriverMlModel:
+def fit_class_random_forest(args: ProcessArgs, env: EvalEnv) -> DriverMlModel:
     # Keep it simple for dry run
     if env.get(ENV_DRY_RUN_TRACER):
         return DriverMlModel()
@@ -953,14 +952,11 @@ def fit_class_random_forest(args: dict, env: EvalEnv) -> DriverMlModel:
             reason=f"should be non-temporal vector-cube, but got {type(predictors)}.",
         )
 
-    target = extract_arg(args, "target")
+    target: Union[dict, DriverVectorCube] = extract_arg(args, "target")
     if isinstance(target, DriverVectorCube):
-        pass
-    elif isinstance(target, dict) and target.get("type") == "FeatureCollection":
-        # TODO: convert to vector cube, e.g.:
-        #       target = env.backend_implementation.vector_cube_cls.from_geojson(target)
-        pass
-    else:
+        # Convert target to geojson feature collection.
+        target: dict = shapely.geometry.mapping(target.get_geometries())
+    if not (isinstance(target, dict) and target.get("type") == "FeatureCollection"):
         raise ProcessParameterInvalidException(
             parameter="target",
             process="fit_class_random_forest",
@@ -987,6 +983,52 @@ def fit_class_random_forest(args: dict, env: EvalEnv) -> DriverMlModel:
     )
 
 
+
+@process_registry_100.add_function(spec=read_spec("openeo-processes/experimental/fit_class_catboost.json"))
+@process_registry_2xx.add_function(spec=read_spec("openeo-processes/experimental/fit_class_catboost.json"))
+def fit_class_catboost(args: ProcessArgs, env: EvalEnv) -> DriverMlModel:
+    process = "fit_class_catboost"
+    if env.get(ENV_DRY_RUN_TRACER):
+        return DriverMlModel()
+
+    predictors = extract_arg(args, "predictors")
+    if not isinstance(predictors, (AggregatePolygonSpatialResult, DriverVectorCube)):
+        raise ProcessParameterInvalidException(
+            parameter="predictors",
+            process=process,
+            reason=f"should be non-temporal vector-cube, but got {type(predictors)}.",
+        )
+
+    target: Union[dict, DriverVectorCube] = extract_arg(args, "target")
+    if isinstance(target, DriverVectorCube):
+        # Convert target to geojson feature collection.
+        target: dict = shapely.geometry.mapping(target.get_geometries())
+    if not (isinstance(target, dict) and target.get("type") == "FeatureCollection"):
+        raise ProcessParameterInvalidException(
+            parameter="target",
+            process=process,
+            reason=f"expected feature collection or vector-cube value, but got {type(target)}.",
+        )
+
+    # TODO: get defaults from process spec?
+    # TODO: do parameter checks automatically based on process spec?
+    def get_validated_parameter(args, param_name, default_value, expected_type, min_value=1, max_value=1000):
+        return args.get_optional(
+            param_name,
+            default=default_value,
+            expected_type=expected_type,
+            validator=ProcessArgs.validator_generic(
+                lambda v: v >= min_value and v <= max_value,
+                error_message=f"The `{param_name}` parameter should be an integer between {min_value} and {max_value}.",
+            ),
+        )
+
+    iterations = get_validated_parameter(args, "iterations", 5, int, 1, 500)
+    depth = get_validated_parameter(args, "depth", 5, int, 1, 16)
+    seed = get_validated_parameter(args, "seed", 0, int, 0, 2**31 - 1)
+
+    return predictors.fit_class_catboost(target=target, iterations=iterations, depth=depth, seed=seed)
+
 @process_registry_100.add_function(spec=read_spec("openeo-processes/experimental/predict_random_forest.json"))
 @process_registry_2xx.add_function(spec=read_spec("openeo-processes/experimental/predict_random_forest.json"))
 def predict_random_forest(args: dict, env: EvalEnv):
diff --git a/openeo_driver/datacube.py b/openeo_driver/datacube.py
index dd9b688b..300549ae 100644
--- a/openeo_driver/datacube.py
+++ b/openeo_driver/datacube.py
@@ -17,7 +17,7 @@ import shapely.geometry.base
 import shapely.ops
 import xarray
 
-from geopandas import GeoDataFrame
+from geopandas import GeoDataFrame, GeoSeries
 from openeo.metadata import CollectionMetadata
 from openeo.util import ensure_dir, str_truncate
 from pyproj import CRS
@@ -679,7 +679,7 @@ def geometry_count(self) -> int:
         """Size of the geometry dimension"""
         return len(self._geometries.index)
 
-    def get_geometries(self) -> Sequence[shapely.geometry.base.BaseGeometry]:
+    def get_geometries(self) -> GeoSeries[shapely.geometry.base.BaseGeometry]:
         return self._geometries.geometry
 
     def get_cube(self) -> Optional[xarray.DataArray]:
diff --git a/openeo_driver/dry_run.py b/openeo_driver/dry_run.py
index d6e8d48c..d3e407a6 100644
--- a/openeo_driver/dry_run.py
+++ b/openeo_driver/dry_run.py
@@ -789,9 +789,6 @@ def _nop(self, *args, **kwargs) -> 'DryRunDataCube':
         """No Operation: do nothing"""
         return self
 
-    def fit_class_random_forest(self, predictors, target, training, num_trees, mtry):
-        return self
-
     # TODO: some methods need metadata manipulation?
 
 
diff --git a/openeo_driver/dummy/dummy_backend.py b/openeo_driver/dummy/dummy_backend.py
index 8884f8da..147abc47 100644
--- a/openeo_driver/dummy/dummy_backend.py
+++ b/openeo_driver/dummy/dummy_backend.py
@@ -335,6 +335,25 @@ def fit_class_random_forest(
             seed=seed,
         )
 
+    def fit_class_catboost(
+        self,
+        target: DriverVectorCube,
+        iterations: int = 5,
+        depth=5,
+        border_count=254,
+        seed=0,
+    ) -> "DriverMlModel":
+        return DummyMlModel(
+            process_id="fit_class_catboost",
+            # TODO: handle `to_geojson` in `DummyMlModel.write_assets` instead of here?
+            data=self.to_geojson(),
+            target=target,
+            iterations=iterations,
+            depth=depth,
+            border_count=border_count,
+            seed=seed,
+        )
+
 
 class DummyMlModel(DriverMlModel):
 
diff --git a/openeo_driver/save_result.py b/openeo_driver/save_result.py
index fe7bbaa6..35be603c 100644
--- a/openeo_driver/save_result.py
+++ b/openeo_driver/save_result.py
@@ -802,11 +802,22 @@ def write_assets(self, directory: Union[str, Path]) -> Dict[str, StacAsset]:
         return {str(Path(filename).name): asset}
 
     def fit_class_random_forest(
-            self,
-            target: dict,
-            num_trees: int = 100,
-            max_variables: Optional[Union[int, str]] = None,
-            seed: Optional[int] = None
+        self,
+        target: dict,
+        num_trees: int = 100,
+        max_variables: Optional[Union[int, str]] = None,
+        seed: Optional[int] = None,
+    ) -> DriverMlModel:
+        # TODO: this method belongs eventually under DriverVectorCube
+        raise NotImplementedError
+
+    def fit_class_catboost(
+        self,
+        target: dict,
+        iterations: int = 5,
+        depth=5,
+        border_count=254,
+        seed=0,
     ) -> DriverMlModel:
         # TODO: this method belongs eventually under DriverVectorCube
         raise NotImplementedError
diff --git a/openeo_driver/specs/openeo-processes/experimental/fit_class_catboost.json b/openeo_driver/specs/openeo-processes/experimental/fit_class_catboost.json
new file mode 100644
index 00000000..d66cc85e
--- /dev/null
+++ b/openeo_driver/specs/openeo-processes/experimental/fit_class_catboost.json
@@ -0,0 +1,69 @@
+{
+    "id": "fit_class_catboost",
+    "summary": "Train a CatBoost classification model",
+    "description": "Executes the fit of a CatBoost classification based on training data. The process does not include a separate split of the data into test, validation and training data.",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
+            "schema": {
+                "type": "object",
+                "subtype": "vector-cube"
+            }
+        },
+        {
+            "name": "target",
+            "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the CatBoost model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+            "schema": {
+                "type": "object",
+                "subtype": "vector-cube"
+            }
+        },
+        {
+            "name": "iterations",
+            "description": "The maximum number of trees that can be built during the training process.",
+            "optional": true,
+            "default": 5,
+            "schema": {
+                "type": "integer",
+                "minimum": 1,
+                "maximum": 500
+            }
+        },
+        {
+            "name": "depth",
+            "description": "Depth of the trees.",
+            "optional": true,
+            "default": 5,
+            "schema": {
+                "type": "integer",
+                "minimum": 1,
+                "maximum": 16
+            }
+        },
+        {
+            "name": "seed",
+            "description": "The random seed used for training, for reproducibility.",
+            "optional": true,
+            "default": 0,
+            "schema": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 2147483647
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+    ]
+}
diff --git a/openeo_driver/specs/openeo-processes/experimental/fit_class_random_forest.json b/openeo_driver/specs/openeo-processes/experimental/fit_class_random_forest.json
index a9a549d9..6d5f2020 100644
--- a/openeo_driver/specs/openeo-processes/experimental/fit_class_random_forest.json
+++ b/openeo_driver/specs/openeo-processes/experimental/fit_class_random_forest.json
@@ -17,7 +17,7 @@
         },
         {
             "name": "target",
-            "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).",
+            "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
             "schema": {
                 "type": "object",
                 "subtype": "vector-cube"
diff --git a/tests/data/pg/1.0/fit_class_catboost.json b/tests/data/pg/1.0/fit_class_catboost.json
new file mode 100644
index 00000000..8d30f4d9
--- /dev/null
+++ b/tests/data/pg/1.0/fit_class_catboost.json
@@ -0,0 +1,215 @@
+{
+  "loadcollection1": {
+    "process_id": "load_collection",
+    "arguments": {
+      "id": "S2_FOOBAR",
+      "spatial_extent": null,
+      "temporal_extent": null
+    }
+  },
+  "reducedimension1": {
+    "process_id": "reduce_dimension",
+    "arguments": {
+      "data": {
+        "from_node": "loadcollection1"
+      },
+      "dimension": "t",
+      "reducer": {
+        "process_graph": {
+          "mean1": {
+            "process_id": "mean",
+            "arguments": {
+              "data": {
+                "from_parameter": "data"
+              }
+            },
+            "result": true
+          }
+        }
+      }
+    }
+  },
+  "aggregatespatial1": {
+    "process_id": "aggregate_spatial",
+    "arguments": {
+      "data": {
+        "from_node": "reducedimension1"
+      },
+      "geometries": {
+        "type": "FeatureCollection",
+        "features": [
+          {
+            "type": "Feature",
+            "properties": {
+              "target": 0
+            },
+            "geometry": {
+              "type": "Polygon",
+              "coordinates": [
+                [
+                  [
+                    3,
+                    5
+                  ],
+                  [
+                    4,
+                    5
+                  ],
+                  [
+                    4,
+                    6
+                  ],
+                  [
+                    3,
+                    6
+                  ],
+                  [
+                    3,
+                    5
+                  ]
+                ]
+              ]
+            }
+          },
+          {
+            "type": "Feature",
+            "properties": {
+              "target": 1
+            },
+            "geometry": {
+              "type": "Polygon",
+              "coordinates": [
+                [
+                  [
+                    8,
+                    1
+                  ],
+                  [
+                    9,
+                    1
+                  ],
+                  [
+                    9,
+                    2
+                  ],
+                  [
+                    8,
+                    2
+                  ],
+                  [
+                    8,
+                    1
+                  ]
+                ]
+              ]
+            }
+          }
+        ]
+      },
+      "reducer": {
+        "process_graph": {
+          "mean1": {
+            "process_id": "mean",
+            "arguments": {
+              "data": {
+                "from_parameter": "data"
+              }
+            },
+            "result": true
+          }
+        }
+      }
+    }
+  },
+  "fitclasscatboost1": {
+    "process_id": "fit_class_catboost",
+    "arguments": {
+      "predictors": {
+        "from_node": "aggregatespatial1"
+      },
+      "target": {
+        "type": "FeatureCollection",
+        "features": [
+          {
+            "type": "Feature",
+            "properties": {
+              "target": 0
+            },
+            "geometry": {
+              "type": "Polygon",
+              "coordinates": [
+                [
+                  [
+                    3,
+                    5
+                  ],
+                  [
+                    4,
+                    5
+                  ],
+                  [
+                    4,
+                    6
+                  ],
+                  [
+                    3,
+                    6
+                  ],
+                  [
+                    3,
+                    5
+                  ]
+                ]
+              ]
+            }
+          },
+          {
+            "type": "Feature",
+            "properties": {
+              "target": 1
+            },
+            "geometry": {
+              "type": "Polygon",
+              "coordinates": [
+                [
+                  [
+                    8,
+                    1
+                  ],
+                  [
+                    9,
+                    1
+                  ],
+                  [
+                    9,
+                    2
+                  ],
+                  [
+                    8,
+                    2
+                  ],
+                  [
+                    8,
+                    1
+                  ]
+                ]
+              ]
+            }
+          }
+        ]
+      },
+      "iterations": 10,
+      "depth": 16,
+      "seed": 8
+    }
+  },
+  "savemlmodel1": {
+    "process_id": "save_ml_model",
+    "arguments": {
+      "data": {
+        "from_node": "fitclasscatboost1"
+      }
+    },
+    "result": true
+  }
+}
diff --git a/tests/test_views_execute.py b/tests/test_views_execute.py
index 776f732b..bb3a5a68 100644
--- a/tests/test_views_execute.py
+++ b/tests/test_views_execute.py
@@ -3883,6 +3883,87 @@ def test_fit_class_random_forest(api):
     )
 
 
+
+def test_fit_class_catboost(api):
+    res = api.check_result("fit_class_catboost.json")
+
+    geom1 = {
+        "type": "Polygon",
+        "coordinates": [[[3.0, 5.0], [4.0, 5.0], [4.0, 6.0], [3.0, 6.0], [3.0, 5.0]]],
+    }
+    geom2 = {
+        "type": "Polygon",
+        "coordinates": [[[8.0, 1.0], [9.0, 1.0], [9.0, 2.0], [8.0, 2.0], [8.0, 1.0]]],
+    }
+    assert res.json == DictSubSet(
+        {
+            "type": "DummyMlModel",
+            "creation_data": {
+                "process_id": "fit_class_catboost",
+                "data": DictSubSet(
+                    {
+                        "type": "FeatureCollection",
+                        "features": [
+                            DictSubSet(
+                                {
+                                    "type": "Feature",
+                                    "id": "0",
+                                    "geometry": geom1,
+                                    "properties": {
+                                        "B02": 2.345,
+                                        "B03": None,
+                                        "B04": 2.0,
+                                        "B08": 3.0,
+                                        "target": 0,
+                                    },
+                                }
+                            ),
+                            DictSubSet(
+                                {
+                                    "type": "Feature",
+                                    "id": "1",
+                                    "geometry": geom2,
+                                    "properties": {
+                                        "B02": 4.0,
+                                        "B03": 5.0,
+                                        "B04": 6.0,
+                                        "B08": 7.0,
+                                        "target": 1,
+                                    },
+                                }
+                            ),
+                        ],
+                    }
+                ),
+                "target": DictSubSet(
+                    {
+                        "type": "FeatureCollection",
+                        "features": [
+                            DictSubSet(
+                                {
+                                    "type": "Feature",
+                                    "geometry": geom1,
+                                    "properties": {"target": 0},
+                                }
+                            ),
+                            DictSubSet(
+                                {
+                                    "type": "Feature",
+                                    "geometry": geom2,
+                                    "properties": {"target": 1},
+                                }
+                            ),
+                        ],
+                    }
+                ),
+                "iterations": 10,
+                "depth": 16,
+                "seed": 8,
+                "border_count": 254,
+            },
+        }
+    )
+
 def test_if_merge_cubes(api):
     api.check_result({
         "loadcollection1": {