From 158c2444c7a6614b3b0682427641e69211174c0b Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Mon, 15 May 2023 13:58:05 +0200 Subject: [PATCH] Re-add ML processes for 2.1.0 #416 --- meta/subtype-schemas.json | 6 ++ proposals/fit_class_random_forest.json | 110 +++++++++++++++++++++++++ proposals/fit_regr_random_forest.json | 110 +++++++++++++++++++++++++ proposals/load_ml_model.json | 53 ++++++++++++ proposals/predict_random_forest.json | 42 ++++++++++ proposals/save_ml_model.json | 44 ++++++++++ 6 files changed, 365 insertions(+) create mode 100644 proposals/fit_class_random_forest.json create mode 100644 proposals/fit_regr_random_forest.json create mode 100644 proposals/load_ml_model.json create mode 100644 proposals/predict_random_forest.json create mode 100644 proposals/save_ml_model.json diff --git a/meta/subtype-schemas.json b/meta/subtype-schemas.json index b44cb8dc..6d0030fc 100644 --- a/meta/subtype-schemas.json +++ b/meta/subtype-schemas.json @@ -232,6 +232,12 @@ } } }, + "ml-model": { + "type": "object", + "subtype": "ml-model", + "title": "Machine Learning Model", + "description": "A machine learning model, accompanied by STAC metadata that implements the STAC ml-model extension." + }, "output-format": { "type": "string", "subtype": "output-format", diff --git a/proposals/fit_class_random_forest.json b/proposals/fit_class_random_forest.json new file mode 100644 index 00000000..6eb874bf --- /dev/null +++ b/proposals/fit_class_random_forest.json @@ -0,0 +1,110 @@ +{ + "id": "fit_class_random_forest", + "summary": "Train a random forest classification model", + "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data in test, validation and training data. 
The Random Forest classification model is based on the approach by Breiman (2001).", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "predictors", + "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + } + ] + }, + { + "name": "target", + "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + } + ] + } + }, + { + "name": "max_variables", + "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. 
This is often the default for classification.", + "schema": [ + { + "type": "integer", + "minimum": 1 + }, + { + "type": "string", + "enum": [ + "all", + "log2", + "onethird", + "sqrt" + ] + } + ] + }, + { + "name": "num_trees", + "description": "The number of trees built within the Random Forest classification.", + "optional": true, + "default": 100, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.1023/A:1010933404324", + "title": "Breiman (2001): Random Forests", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/fit_regr_random_forest.json b/proposals/fit_regr_random_forest.json new file mode 100644 index 00000000..51191fa5 --- /dev/null +++ b/proposals/fit_regr_random_forest.json @@ -0,0 +1,110 @@ +{ + "id": "fit_regr_random_forest", + "summary": "Train a random forest regression model", + "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "predictors", + "description": "The predictors for the regression model as a vector data cube. 
Aggregated to the features (vectors) of the target input variable.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + } + ] + }, + { + "name": "target", + "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + } + ] + } + }, + { + "name": "max_variables", + "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.", + "schema": [ + { + "type": "integer", + "minimum": 1 + }, + { + "type": "string", + "enum": [ + "all", + "log2", + "onethird", + "sqrt" + ] + } + ] + }, + { + "name": "num_trees", + "description": "The number of trees built within the Random Forest regression.", + "optional": true, + "default": 100, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for the random sampling in training. 
If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.1023/A:1010933404324", + "title": "Breiman (2001): Random Forests", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json new file mode 100644 index 00000000..151513c8 --- /dev/null +++ b/proposals/load_ml_model.json @@ -0,0 +1,53 @@ +{ + "id": "load_ml_model", + "summary": "Load a ML model", + "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``fit_regr_random_forest()`` and ``save_ml_model()``.", + "categories": [ + "machine learning", + "import" + ], + "experimental": true, + "parameters": [ + { + "name": "id", + "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.", + "schema": [ + { + "title": "URL", + "type": "string", + "format": "uri", + "subtype": "uri", + "pattern": "^https?://" + }, + { + "title": "Batch Job ID", + "description": "Loading a model by batch job ID is possible only if a single model has been saved by the job. 
Otherwise, you have to load a specific model from a batch job by URL.", + "type": "string", + "subtype": "job-id", + "pattern": "^[\\w\\-\\.~]+$" + }, + { + "title": "User-uploaded File", + "type": "string", + "subtype": "file-path", + "pattern": "^[^\r\n\\:'\"]+$" + } + ] + } + ], + "returns": { + "description": "A machine learning model to be used with machine learning processes such as ``predict_random_forest()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://github.com/stac-extensions/ml-model", + "title": "STAC ml-model extension", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/predict_random_forest.json b/proposals/predict_random_forest.json new file mode 100644 index 00000000..62c54e9f --- /dev/null +++ b/proposals/predict_random_forest.json @@ -0,0 +1,42 @@ +{ + "id": "predict_random_forest", + "summary": "Predict values based on a Random Forest model", + "description": "Applies a Random Forest machine learning model to an array and predicts a value for it.", + "categories": [ + "machine learning", + "reducer" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "An array of numbers.", + "schema": { + "type": "array", + "items": { + "type": [ + "number", + "null" + ] + } + } + }, + { + "name": "model", + "description": "A model object that can be trained with the processes ``fit_regr_random_forest()`` (regression) and ``fit_class_random_forest()`` (classification).", + "schema": { + "type": "object", + "subtype": "ml-model" + } + } + ], + "returns": { + "description": "The predicted value. 
Returns `null` if any of the given values in the array is a no-data value.", + "schema": { + "type": [ + "number", + "null" + ] + } + } +} diff --git a/proposals/save_ml_model.json b/proposals/save_ml_model.json new file mode 100644 index 00000000..5e9ea8b0 --- /dev/null +++ b/proposals/save_ml_model.json @@ -0,0 +1,44 @@ +{ + "id": "save_ml_model", + "summary": "Save a ML model", + "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).", + "categories": [ + "machine learning", + "export" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "The data to store as a machine learning model.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + { + "name": "options", + "description": "Additional parameters to create the file(s).", + "schema": { + "type": "object", + "additionalProperties": false + }, + "default": {}, + "optional": true + } + ], + "returns": { + "description": "Returns `false` if the process failed to store the model, `true` otherwise.", + "schema": { + "type": "boolean" + } + }, + "links": [ + { + "href": "https://github.com/stac-extensions/ml-model", + "title": "STAC ml-model extension", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file