Skip to content

Commit

Permalink
Merge pull request #304 from Open-EO/832-catboost-training-process
Browse files Browse the repository at this point in the history
832 catboost training process
  • Loading branch information
JeroenVerstraelen authored Aug 5, 2024
2 parents c63b66d + eb4b647 commit b744ec2
Show file tree
Hide file tree
Showing 9 changed files with 454 additions and 20 deletions.
60 changes: 51 additions & 9 deletions openeo_driver/ProcessGraphDeserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,6 @@ def extract_arg(args: ProcessArgs, name: str, process_id="n/a"):
return _as_process_args(args, process_id=process_id).get_required(name=name)



def _align_extent(extent,collection_id,env):
metadata = None
try:
Expand Down Expand Up @@ -939,7 +938,7 @@ def apply_polygon(args: ProcessArgs, env: EvalEnv) -> DriverDataCube:

@process_registry_100.add_function(spec=read_spec("openeo-processes/experimental/fit_class_random_forest.json"))
@process_registry_2xx.add_function(spec=read_spec("openeo-processes/experimental/fit_class_random_forest.json"))
def fit_class_random_forest(args: dict, env: EvalEnv) -> DriverMlModel:
def fit_class_random_forest(args: ProcessArgs, env: EvalEnv) -> DriverMlModel:
# Keep it simple for dry run
if env.get(ENV_DRY_RUN_TRACER):
return DriverMlModel()
Expand All @@ -953,14 +952,11 @@ def fit_class_random_forest(args: dict, env: EvalEnv) -> DriverMlModel:
reason=f"should be non-temporal vector-cube, but got {type(predictors)}.",
)

target = extract_arg(args, "target")
target: Union[dict, DriverVectorCube] = extract_arg(args, "target")
if isinstance(target, DriverVectorCube):
pass
elif isinstance(target, dict) and target.get("type") == "FeatureCollection":
# TODO: convert to vector cube, e.g.:
# target = env.backend_implementation.vector_cube_cls.from_geojson(target)
pass
else:
# Convert target to geojson feature collection.
target: dict = shapely.geometry.mapping(target.get_geometries())
if not (isinstance(target, dict) and target.get("type") == "FeatureCollection"):
raise ProcessParameterInvalidException(
parameter="target",
process="fit_class_random_forest",
Expand All @@ -987,6 +983,52 @@ def fit_class_random_forest(args: dict, env: EvalEnv) -> DriverMlModel:
)


@process_registry_100.add_function(spec=read_spec("openeo-processes/experimental/fit_class_catboost.json"))
@process_registry_2xx.add_function(spec=read_spec("openeo-processes/experimental/fit_class_catboost.json"))
def fit_class_catboost(args: ProcessArgs, env: EvalEnv) -> DriverMlModel:
    """
    Process implementation of `fit_class_catboost`.

    Validates the `predictors` and `target` arguments, reads the optional
    integer hyperparameters (`iterations`, `depth`, `seed`) and delegates the
    actual training to `predictors.fit_class_catboost(...)`.

    :param args: process arguments (`predictors`, `target` and the optional
        `iterations`, `depth`, `seed`)
    :param env: evaluation environment; when it carries a dry-run tracer,
        a bare `DriverMlModel` is returned and no argument is inspected
    :return: the model returned by `predictors.fit_class_catboost`
    :raises ProcessParameterInvalidException: if `predictors` is neither an
        `AggregatePolygonSpatialResult` nor a `DriverVectorCube`, or if `target`
        is not (convertible to) a GeoJSON FeatureCollection dict
    """
    # Keep it simple for dry run
    if env.get(ENV_DRY_RUN_TRACER):
        return DriverMlModel()

    process_id = "fit_class_catboost"

    predictors = extract_arg(args, "predictors")
    if not isinstance(predictors, (AggregatePolygonSpatialResult, DriverVectorCube)):
        raise ProcessParameterInvalidException(
            parameter="predictors",
            process=process_id,
            reason=f"should be non-temporal vector-cube, but got {type(predictors)}.",
        )

    target = extract_arg(args, "target")
    if isinstance(target, DriverVectorCube):
        # Convert target to geojson feature collection.
        target = shapely.geometry.mapping(target.get_geometries())
    is_feature_collection = isinstance(target, dict) and target.get("type") == "FeatureCollection"
    if not is_feature_collection:
        raise ProcessParameterInvalidException(
            parameter="target",
            process=process_id,
            reason=f"expected feature collection or vector-cube value, but got {type(target)}.",
        )

    # TODO: get defaults from process spec?
    # TODO: do parameter checks automatically based on process spec?
    def bounded_int(param_name, default_value, min_value, max_value):
        """Fetch optional integer argument `param_name`, checked against [min_value, max_value]."""

        def within_bounds(v):
            return min_value <= v <= max_value

        return args.get_optional(
            param_name,
            default=default_value,
            expected_type=int,
            validator=ProcessArgs.validator_generic(
                within_bounds,
                error_message=f"The `{param_name}` parameter should be an integer between {min_value} and {max_value}.",
            ),
        )

    # Dict literal keeps the original evaluation order: iterations, depth, seed.
    hyperparameters = {
        "iterations": bounded_int("iterations", 5, 1, 500),
        "depth": bounded_int("depth", 5, 1, 16),
        "seed": bounded_int("seed", 0, 0, 2**31 - 1),
    }

    return predictors.fit_class_catboost(target=target, **hyperparameters)


@process_registry_100.add_function(spec=read_spec("openeo-processes/experimental/predict_random_forest.json"))
@process_registry_2xx.add_function(spec=read_spec("openeo-processes/experimental/predict_random_forest.json"))
def predict_random_forest(args: dict, env: EvalEnv):
Expand Down
4 changes: 2 additions & 2 deletions openeo_driver/datacube.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import shapely.geometry.base
import shapely.ops
import xarray
from geopandas import GeoDataFrame
from geopandas import GeoDataFrame, GeoSeries
from openeo.metadata import CollectionMetadata
from openeo.util import ensure_dir, str_truncate
from pyproj import CRS
Expand Down Expand Up @@ -679,7 +679,7 @@ def geometry_count(self) -> int:
"""Size of the geometry dimension"""
return len(self._geometries.index)

def get_geometries(self) -> Sequence[shapely.geometry.base.BaseGeometry]:
def get_geometries(self) -> GeoSeries[shapely.geometry.base.BaseGeometry]:
return self._geometries.geometry

def get_cube(self) -> Optional[xarray.DataArray]:
Expand Down
3 changes: 0 additions & 3 deletions openeo_driver/dry_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,9 +789,6 @@ def _nop(self, *args, **kwargs) -> 'DryRunDataCube':
"""No Operation: do nothing"""
return self

def fit_class_random_forest(self, predictors, target, training, num_trees, mtry):
return self

# TODO: some methods need metadata manipulation?


Expand Down
19 changes: 19 additions & 0 deletions openeo_driver/dummy/dummy_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,25 @@ def fit_class_random_forest(
seed=seed,
)

def fit_class_catboost(
    self,
    target: DriverVectorCube,
    iterations: int = 5,
    depth: int = 5,
    border_count: int = 254,
    seed: int = 0,
) -> "DriverMlModel":
    """
    Dummy counterpart of CatBoost training: wrap the inputs in a `DummyMlModel`.

    All arguments are handed unchanged to `DummyMlModel`, together with
    the GeoJSON form of this object (`self.to_geojson()`) as `data` and
    `"fit_class_catboost"` as `process_id`.
    """
    # TODO: handle `to_geojson` in `DummyMlModel.write_assets` instead of here?
    model_kwargs = dict(
        process_id="fit_class_catboost",
        data=self.to_geojson(),
        target=target,
        iterations=iterations,
        depth=depth,
        border_count=border_count,
        seed=seed,
    )
    return DummyMlModel(**model_kwargs)


class DummyMlModel(DriverMlModel):

Expand Down
21 changes: 16 additions & 5 deletions openeo_driver/save_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,11 +802,22 @@ def write_assets(self, directory: Union[str, Path]) -> Dict[str, StacAsset]:
return {str(Path(filename).name): asset}

def fit_class_random_forest(
self,
target: dict,
num_trees: int = 100,
max_variables: Optional[Union[int, str]] = None,
seed: Optional[int] = None
self,
target: dict,
num_trees: int = 100,
max_variables: Optional[Union[int, str]] = None,
seed: Optional[int] = None,
) -> DriverMlModel:
# TODO: this method belongs eventually under DriverVectorCube
raise NotImplementedError

def fit_class_catboost(
    self,
    target: dict,
    iterations: int = 5,
    depth: int = 5,
    border_count: int = 254,
    seed: int = 0,
) -> DriverMlModel:
    """
    Placeholder for CatBoost classification training; always raises.

    :raises NotImplementedError: unconditionally, this base implementation
        provides no training logic.
    """
    # TODO: this method belongs eventually under DriverVectorCube
    raise NotImplementedError()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
{
"id": "fit_class_catboost",
"summary": "Train a CatBoost classification model",
"description": "Fits a CatBoost classification model to the training data. The process does not split the data into separate test, validation and training sets.",
"categories": [
"machine learning"
],
"experimental": true,
"parameters": [
{
"name": "predictors",
"description": "The predictors for the classification model as a vector data cube, aggregated to the features (vectors) of the target input variable.",
"schema": {
"type": "object",
"subtype": "vector-cube"
}
},
{
"name": "target",
"description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the CatBoost model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
"schema": {
"type": "object",
"subtype": "vector-cube"
}
},
{
"name": "iterations",
"description": "The maximum number of trees that can be built during the training process.",
"optional": true,
"default": 5,
"schema": {
"type": "integer",
"minimum": 1,
"maximum": 500
}
},
{
"name": "depth",
"description": "Depth of the trees.",
"optional": true,
"default": 5,
"schema": {
"type": "integer",
"minimum": 1,
"maximum": 16
}
},
{
"name": "seed",
"description": "The random seed used for training, for reproducibility.",
"optional": true,
"default": 0,
"schema": {
"type": "integer",
"minimum": 0,
"maximum": 2147483647
}
}
],
"returns": {
"description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
"schema": {
"type": "object",
"subtype": "ml-model"
}
},
"links": [
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
},
{
"name": "target",
"description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).",
"description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
"schema": {
"type": "object",
"subtype": "vector-cube"
Expand Down
Loading

0 comments on commit b744ec2

Please sign in to comment.