From 158c2444c7a6614b3b0682427641e69211174c0b Mon Sep 17 00:00:00 2001
From: Matthias Mohr <m.mohr@uni-muenster.de>
Date: Mon, 15 May 2023 13:58:05 +0200
Subject: [PATCH] Re-add ML processes for 2.1.0 #416

---
 meta/subtype-schemas.json              |   6 ++
 proposals/fit_class_random_forest.json | 110 +++++++++++++++++++++++++
 proposals/fit_regr_random_forest.json  | 110 +++++++++++++++++++++++++
 proposals/load_ml_model.json           |  53 ++++++++++++
 proposals/predict_random_forest.json   |  42 ++++++++++
 proposals/save_ml_model.json           |  44 ++++++++++
 6 files changed, 365 insertions(+)
 create mode 100644 proposals/fit_class_random_forest.json
 create mode 100644 proposals/fit_regr_random_forest.json
 create mode 100644 proposals/load_ml_model.json
 create mode 100644 proposals/predict_random_forest.json
 create mode 100644 proposals/save_ml_model.json

diff --git a/meta/subtype-schemas.json b/meta/subtype-schemas.json
index b44cb8dc..6d0030fc 100644
--- a/meta/subtype-schemas.json
+++ b/meta/subtype-schemas.json
@@ -232,6 +232,12 @@
                 }
             }
         },
+        "ml-model": {
+            "type": "object",
+            "subtype": "ml-model",
+            "title": "Machine Learning Model",
+            "description": "A machine learning model, accompanied by STAC metadata that implements the STAC ml-model extension."
+        },
         "output-format": {
             "type": "string",
             "subtype": "output-format",
diff --git a/proposals/fit_class_random_forest.json b/proposals/fit_class_random_forest.json
new file mode 100644
index 00000000..6eb874bf
--- /dev/null
+++ b/proposals/fit_class_random_forest.json
@@ -0,0 +1,110 @@
+{
+    "id": "fit_class_random_forest",
+    "summary": "Train a random forest classification model",
+    "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
+            "schema": [
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "bands"
+                        }
+                    ]
+                },
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "other"
+                        }
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "target",
+            "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "max_variables",
+            "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. This is often the default for classification.",
+            "schema": [
+                {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                {
+                    "type": "string",
+                    "enum": [
+                        "all",
+                        "log2",
+                        "onethird",
+                        "sqrt"
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "num_trees",
+            "description": "The number of trees built within the Random Forest classification.",
+            "optional": true,
+            "default": 100,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://doi.org/10.1023/A:1010933404324",
+            "title": "Breiman (2001): Random Forests",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/fit_regr_random_forest.json b/proposals/fit_regr_random_forest.json
new file mode 100644
index 00000000..51191fa5
--- /dev/null
+++ b/proposals/fit_regr_random_forest.json
@@ -0,0 +1,110 @@
+{
+    "id": "fit_regr_random_forest",
+    "summary": "Train a random forest regression model",
+    "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the regression model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
+            "schema": [
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "bands"
+                        }
+                    ]
+                },
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "other"
+                        }
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "target",
+            "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "max_variables",
+            "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.",
+            "schema": [
+                {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                {
+                    "type": "string",
+                    "enum": [
+                        "all",
+                        "log2",
+                        "onethird",
+                        "sqrt"
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "num_trees",
+            "description": "The number of trees built within the Random Forest regression.",
+            "optional": true,
+            "default": 100,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://doi.org/10.1023/A:1010933404324",
+            "title": "Breiman (2001): Random Forests",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json
new file mode 100644
index 00000000..151513c8
--- /dev/null
+++ b/proposals/load_ml_model.json
@@ -0,0 +1,53 @@
+{
+    "id": "load_ml_model",
+    "summary": "Load a ML model",
+    "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``fit_regr_random_forest()`` and ``save_ml_model()``.",
+    "categories": [
+        "machine learning",
+        "import"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "id",
+            "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.",
+            "schema": [
+                {
+                    "title": "URL",
+                    "type": "string",
+                    "format": "uri",
+                    "subtype": "uri",
+                    "pattern": "^https?://"
+                },
+                {
+                    "title": "Batch Job ID",
+                    "description": "Loading a model by batch job ID is possible only if a single model has been saved by the job. Otherwise, you have to load a specific model from a batch job by URL.",
+                    "type": "string",
+                    "subtype": "job-id",
+                    "pattern": "^[\\w\\-\\.~]+$"
+                },
+                {
+                    "title": "User-uploaded File",
+                    "type": "string",
+                    "subtype": "file-path",
+                    "pattern": "^[^\r\n\\:'\"]+$"
+                }
+            ]
+        }
+    ],
+    "returns": {
+        "description": "A machine learning model to be used with machine learning processes such as ``predict_random_forest()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://github.com/stac-extensions/ml-model",
+            "title": "STAC ml-model extension",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/predict_random_forest.json b/proposals/predict_random_forest.json
new file mode 100644
index 00000000..62c54e9f
--- /dev/null
+++ b/proposals/predict_random_forest.json
@@ -0,0 +1,42 @@
+{
+    "id": "predict_random_forest",
+    "summary": "Predict values based on a Random Forest model",
+    "description": "Applies a Random Forest machine learning model to an array and predicts a value for it.",
+    "categories": [
+        "machine learning",
+        "reducer"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "data",
+            "description": "An array of numbers.",
+            "schema": {
+                "type": "array",
+                "items": {
+                    "type": [
+                        "number",
+                        "null"
+                    ]
+                }
+            }
+        },
+        {
+            "name": "model",
+            "description": "A model object that can be trained with the processes ``fit_regr_random_forest()`` (regression) and ``fit_class_random_forest()`` (classification).",
+            "schema": {
+                "type": "object",
+                "subtype": "ml-model"
+            }
+        }
+    ],
+    "returns": {
+        "description": "The predicted value. Returns `null` if any of the given values in the array is a no-data value.",
+        "schema": {
+            "type": [
+                "number",
+                "null"
+            ]
+        }
+    }
+}
diff --git a/proposals/save_ml_model.json b/proposals/save_ml_model.json
new file mode 100644
index 00000000..5e9ea8b0
--- /dev/null
+++ b/proposals/save_ml_model.json
@@ -0,0 +1,44 @@
+{
+    "id": "save_ml_model",
+    "summary": "Save a ML model",
+    "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).",
+    "categories": [
+        "machine learning",
+        "export"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "data",
+            "description": "The data to store as a machine learning model.",
+            "schema": {
+                "type": "object",
+                "subtype": "ml-model"
+            }
+        },
+        {
+            "name": "options",
+            "description": "Additional parameters to create the file(s).",
+            "schema": {
+                "type": "object",
+                "additionalProperties": false
+            },
+            "default": {},
+            "optional": true
+        }
+    ],
+    "returns": {
+        "description": "Returns `false` if the process failed to store the model, `true` otherwise.",
+        "schema": {
+            "type": "boolean"
+        }
+    },
+    "links": [
+        {
+            "href": "https://github.com/stac-extensions/ml-model",
+            "title": "STAC ml-model extension",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
\ No newline at end of file