diff --git a/CHANGELOG.md b/CHANGELOG.md index f0b68043..636226d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `load_ml_model` - `load_url` - `ml_fit_class_random_forest` + - `ml_fit_class_xgboost` - `ml_fit_regr_random_forest` - `ml_predict` - `save_ml_model` diff --git a/proposals/ml_fit_class_xgboost.json b/proposals/ml_fit_class_xgboost.json new file mode 100644 index 00000000..e9e2d713 --- /dev/null +++ b/proposals/ml_fit_class_xgboost.json @@ -0,0 +1,115 @@ +{ + "id": "ml_fit_class_xgboost", + "summary": "Train an XGBoost classification model", + "description": "Fit an XGBoost classification model to training data. XGBoost is a high-performance, flexible, and portable distributed gradient boosting library. It implements machine learning algorithms within the Gradient Boosting framework, featuring parallel tree boosting for efficiency.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "predictors", + "description": "The predictors for the XGBoost classification model as a vector data cube. They are the independent variables that the XGBoost algorithm analyses to learn patterns and relationships within the data.", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + } + }, + { + "name": "target", + "description": "Labeled data for XGBoost classification, aligning with predictor values based on a shared geometry dimension. 
This ensures a clear connection between predictor rows and labels.", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + } + ] + } + }, + { + "name": "learning_rate", + "description": "Step size shrinkage used in each update to prevent overfitting.", + "schema": { + "type": "number", + "minimum": 0, + "default": 0.15 + } + }, + { + "name": "max_depth", + "description": "Maximum depth of a tree.", + "schema": { + "type": "integer", + "minimum": 1, + "default": 5 + } + }, + { + "name": "min_child_weight", + "description": "Minimum sum of instance weight (hessian) needed in a child.", + "schema": { + "type": "number", + "minimum": 0, + "default": 1 + } + }, + { + "name": "subsample", + "description": "Subsample ratio of the training instances.", + "optional": true, + "default": 0.8, + "schema": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + { + "name": "min_split_loss", + "description": "Minimum loss reduction required to make a further partition on a leaf node of the tree.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for the random sampling in training. 
If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be saved with `save_ml_model()` and restored with `load_ml_model()`.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://dl.acm.org/doi/10.1145/2939672.2939785", + "title": "Chen and Guestrin (2016), XGBoost: A Scalable Tree Boosting System", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/tests/.words b/tests/.words index a50285ba..17960777 100644 --- a/tests/.words +++ b/tests/.words @@ -47,3 +47,9 @@ Hyndman date1 date2 favor +XGBoost +Chen +Guestrin +Subsample +hessian +overfitting