From 526055717426f9337bd2d0a07d3a67689d9f105a Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 15 Nov 2017 16:23:27 +0100 Subject: [PATCH 01/32] Going back --- online_forest.py | 193 ++++++ setup.py | 6 +- tick/inference/__init__.py | 5 +- tick/inference/online_forest_regressor.py | 178 +++++ tick/inference/src/CMakeLists.txt | 3 +- .../inference/src/online_forest_regressor.cpp | 655 ++++++++++++++++++ tick/inference/src/online_forest_regressor.h | 332 +++++++++ tick/inference/swig/inference_module.i | 4 +- tick/inference/swig/online_forest_regressor.i | 41 ++ 9 files changed, 1412 insertions(+), 5 deletions(-) create mode 100644 online_forest.py create mode 100644 tick/inference/online_forest_regressor.py create mode 100644 tick/inference/src/online_forest_regressor.cpp create mode 100644 tick/inference/src/online_forest_regressor.h create mode 100644 tick/inference/swig/online_forest_regressor.i diff --git a/online_forest.py b/online_forest.py new file mode 100644 index 000000000..794b97e02 --- /dev/null +++ b/online_forest.py @@ -0,0 +1,193 @@ +from tick.simulation import SimuLinReg, weights_sparse_gauss +from sklearn.model_selection import train_test_split +import numpy as np +from tick.inference import OnlineForestRegressor +from matplotlib.colors import ListedColormap + +from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor +import matplotlib.pyplot as plt + +from time import time + +n_samples = 2000 +n_features = 2 +seed = 123 + +np.set_printoptions(precision=2) + + +w0 = weights_sparse_gauss(n_features, nnz=2) +X, y = SimuLinReg(w0, -1., n_samples=n_samples, seed=seed).simulate() + +# X_train, X_test, y_train, y_test = train_test_split(X, y) + + +def plot_decisions(clfs, datasets, names, use_aggregation=None): + i = 1 + h = .02 + fig = plt.figure(figsize=(4 * (len(clfs) + 1), 4 * len(datasets))) + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + X, y = ds + # X = StandardScaler().fit_transform(X) + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + + # just plot the dataset first + cm = plt.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + # Plot the training points + # plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=10, cmap=cm) + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=25, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=25, + alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + + # iterate over classifiers + for name, clf in zip(names, clfs): + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + + t1 = time() + clf.fit(X_train, y_train) + t2 = time() + + mse = np.linalg.norm(y_test - clf.predict(X_test)) + # score = clf.score(X_test, y_test) + + Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T) + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + + # Plot also the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm, s=15) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, + s=15, alpha=0.6) + + ax.set_xlim(xx.min(), xx.max()) + 
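# Note: the panel annotation added below reports the quantity computed
+            # as ``mse`` above, i.e. np.linalg.norm(y_test - clf.predict(X_test)):
+            # the l2 norm of the test residuals (a scaled root-MSE rather than
+            # a true mean squared error), together with the fit time in seconds.
+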
ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + + ax.text(xx.max() - .3, yy.min() + .3, ('%.2f (%.2f)' % (mse, t2-t1)).lstrip('0'), + size=15, horizontalalignment='right') + i += 1 + + plt.tight_layout() + # plt.show() + + +# def plot_decision_regions(clfs, X_test, y_test, n_iter=None, use_aggregation=None, +# title=None): +# from matplotlib.colors import ListedColormap +# +# cm = plt.cm.RdBu +# cmap = ListedColormap(['red', 'white', 'blue']) +# fig = plt.figure(figsize=(8, 5)) +# +# ax = plt.subplot(1, 1, 1) +# # plot the decision surface +# x1_min, x1_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1 +# x2_min, x2_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1 +# +# xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, 0.02), +# np.arange(x2_min, x2_max, 0.02)) +# +# plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=10, cmap=cm) +# +# if use_aggregation is None: +# Z = clf.predict(np.array([xx1.ravel(), xx2.ravel()]).T) +# else: +# Z = clf.predict(np.array([xx1.ravel(), xx2.ravel()]).T, use_aggregation) +# Z = Z.reshape(xx1.shape) +# ct = plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cm) +# plt.colorbar(ct) +# plt.xlim(xx1.min(), xx1.max()) +# plt.ylim(xx2.min(), xx2.max()) +# +# plt.xlabel('x1', fontsize=16) +# plt.ylabel('x2', fontsize=16) +# if title is not None: +# plt.title(title) +# plt.legend(loc='upper left') +# plt.tight_layout() + + +# clf = OnlineForestRegressor(n_trees=1, seed=123) +# print(clf.predict(X)) + +# plot_decision_regions(clf, X, y, use_aggregation=False) + +path = '/Users/stephane.gaiffas/Downloads/' + +import os + +# plt.savefig(os.path.join(path, 'online1.pdf')) + +n_trees = 10 + +datasets = [ + (X, y) +] + +clfs = [ + OnlineForestRegressor(n_trees=n_trees, seed=123, step=0.25), + ExtraTreesRegressor(n_estimators=n_trees), + RandomForestRegressor(n_estimators=n_trees) +] + +names = [ + "Online forest", + "Extra trees", + "Breiman RF" +] + +plot_decisions(clfs, datasets, names) +plt.show() + +# plt.savefig(os.path.join(path, 'decisions.pdf')) + + +# plot_decision_regions(clf, X, y, use_aggregation=True) + +# plt.savefig(os.path.join(path, 'online2.pdf')) + +# clf.print() + +# plt.show() + + +# clf.fit(X, y) + +# print(y) +# print(clf.predict(X)) +# clf.print() + + +# plot_decision_regions(clf, X, y, n_iter=None, use_aggregation=True) +# plt.show() + +# exit(0) +# forest = OnlineForestRegressor(n_trees=100, min_samples_split=50) + +# plot_decision_regions(clf, X, y, n_samples) + + +# plt.savefig('/Users/stephane.gaiffas/Downloads/online-forest.pdf') diff --git a/setup.py b/setup.py index c811c7d80..8538cd196 100644 --- a/setup.py +++ b/setup.py @@ -655,10 +655,12 @@ def add_dir_name(dir_name, filenames): inference_extension_info = { "cpp_files": ["hawkes_conditional_law.cpp", "hawkes_em.cpp", "hawkes_adm4.cpp", "hawkes_basis_kernels.cpp", - "hawkes_sumgaussians.cpp"], + "hawkes_sumgaussians.cpp", + "online_forest_regressor.cpp"], "h_files": ["hawkes_conditional_law.h", "hawkes_em.h", "hawkes_adm4.h", "hawkes_basis_kernels.h", - "hawkes_sumgaussians.h"], + "hawkes_sumgaussians.h", + "online_forest_regressor.h"], "swig_files": ["inference_module.i"], "module_dir": "./tick/inference/", "extension_name": "inference", diff --git a/tick/inference/__init__.py b/tick/inference/__init__.py index f8b0b5ca5..b1c980256 100644 --- a/tick/inference/__init__.py +++ b/tick/inference/__init__.py @@ -17,6 +17,8 @@ from .survival import kaplan_meier, nelson_aalen from .robust import std_iqr, std_mad +from 
.online_forest_regressor import OnlineForestRegressor
+
 __all__ = [
 "LinearRegression",
 "RobustLinearRegression",
@@ -29,7 +31,8 @@
 "HawkesEM",
 "HawkesADM4",
 "HawkesBasisKernels",
- "HawkesSumGaussians,"
+ "HawkesSumGaussians",
+ "OnlineForestRegressor",
 "kaplan_meier",
 "nelson_aalen"
 ]
diff --git a/tick/inference/online_forest_regressor.py b/tick/inference/online_forest_regressor.py
new file mode 100644
index 000000000..b8a387996
--- /dev/null
+++ b/tick/inference/online_forest_regressor.py
@@ -0,0 +1,178 @@
+# License: BSD 3 clause
+
+from abc import ABC
+
+from tick.base import Base
+from tick.base import actual_kwargs
+
+from .build.inference import OnlineForestRegressor as _OnlineForestRegressor
+from tick.preprocessing.utils import safe_array
+
+from .build.inference import Criterion_unif as unif
+from .build.inference import Criterion_mse as mse
+
+
+class OnlineForestRegressor(ABC, Base):
+    """Truly online random forest for regression (continuous labels). BLABLA
+
+    Parameters
+    ----------
+    n_trees : `int`, default=10
+        Number of trees to grow in the forest. Cannot be changed after the
+        first call to ``fit``.
+
+    criterion : {'unif', 'mse'}, default='unif'
+        The criterion used to select a split. Supported criteria are:
+        * 'unif': splits are sampled uniformly in the range of the features,
+          and the feature to be split is chosen uniformly at random
+        * 'mse': the split and feature leading to the best variance reduction
+          are selected
+        This cannot be changed after the first call to ``fit``
+
+    max_depth : `int`, default=-1
+        The maximum depth of a tree. If <= 0, nodes are split with no limit
+        on the depth of the tree
+
+    min_samples_split : `int`, default=50
+        A node waits to contain at least `min_samples_split` samples before
+        splitting.
+
+    n_threads : `int`, default=1
+        The number of threads used to grow trees in parallel during training.
+        If n_threads < 0, then all available cores will be used.
+
+    seed : `int`, default=-1
+        If seed >= 0, this is used to seed the random number generators of
+        the forest.
+
+    verbose : `bool`, default=True
+        If True, verbose output is produced during training
+
+    warm_start : `bool`, default=True
+        If True, then successive calls to ``fit`` will continue to grow
+        existing trees. Otherwise, we start from empty trees
+
+    n_splits : `int`, default=10
+        Number of potential splits to consider for a feature. BLABLA ???
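+
+    step : `float`, default=1.
+        Step-size used in the exponential aggregation weights: each time a
+        sample goes through a node, the node's weight is multiplied by
+        ``exp(-step * loss)``, where ``loss`` is the node's prediction loss
+        on that sample (see ``update_weight`` in the C++ code of this patch).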
+ + Attributes + ---------- + n_samples : `int` + Number of samples seen during training + + n_features : int + The number of features from the training dataset (passed to ``fit``) + """ + + _attrinfos = { + '_actual_kwargs': {'writable': False}, + '_fitted': {'writable': False}, + '_forest': {'writable': False}, + '_criterion': {'writable': False, 'cpp_setter': 'set_criterion'}, + 'n_trees': {'writable': True, 'cpp_setter': 'set_n_trees'}, + 'max_depth': {'writable': True, 'cpp_setter': 'set_max_depth'}, + 'min_samples_split': {'writable': True, + 'cpp_setter': 'set_min_samples_split'}, + 'n_threads': {'writable': True, 'cpp_setter': 'set_n_threads'}, + 'seed': {'writable': True, 'cpp_setter': 'set_seed'}, + 'verbose': {'writable': True, 'cpp_setter': 'set_verbose'}, + 'warm_start': {'writable': True, 'cpp_setter': 'set_warm_start'}, + 'n_splits': {'writable': True, 'cpp_setter': 'set_n_splits'}, + } + + _cpp_obj_name = "_forest" + + @actual_kwargs + def __init__(self, n_trees: int = 10, step: float = 1., + criterion: str = 'unif', + max_depth: int = -1, min_samples_split: int = 50, + n_threads: int = 1, seed: int = -1, verbose: bool = True, + warm_start: bool = True, n_splits: int = 10): + Base.__init__(self) + if not hasattr(self, "_actual_kwargs"): + self._actual_kwargs = {} + self._fitted = False + self.n_trees = n_trees + self.step = step + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.n_threads = n_threads + self.seed = seed + self.verbose = verbose + self.warm_start = warm_start + self.n_splits = n_splits + self._forest = _OnlineForestRegressor(n_trees, + step, + self._criterion, + #max_depth, + # min_samples_split, + n_threads, + seed, + verbose) + #warm_start, n_splits) + + def set_data(self, X, y): + X = safe_array(X) + y = safe_array(y) + self._forest.set_data(X, y) + + def fit(self, X, y): + X = safe_array(X) + y = safe_array(y) + self._set("_fitted", True) + self._forest.fit(X, y) + return self + + def apply(self, X): + """Make the samples from X follow the trees from the forest, and return + the indices of the leaves + """ + raise NotImplementedError() + + def predict(self, X, use_aggregation: bool=True): + """Predict class for given samples + + Parameters + ---------- + X : `np.ndarray` or `scipy.sparse.csr_matrix`, shape=(n_samples, n_features) + Features matrix to predict for. + + Returns + ------- + output : `np.array`, shape=(n_samples,) + Returns predicted values. 
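+
+        Examples
+        --------
+        A minimal sketch, assuming ``clf`` is an already fitted
+        ``OnlineForestRegressor`` and ``X_new`` is any hypothetical array
+        with the same number of features as the training data:
+
+        >>> y_pred = clf.predict(X_new)  # doctest: +SKIP
+        >>> y_pred.shape == (X_new.shape[0],)  # doctest: +SKIP
+        True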
+ """ + import numpy as np + y_pred = np.empty(X.shape[0]) + if not self._fitted: + raise ValueError("You must call ``fit`` before") + else: + X = safe_array(X) + self._forest.predict(X, y_pred, True) + return y_pred + + def score(self, X, y): + from sklearn.metrics import r2_score + + def print(self): + self._forest._print() + + # TODO: property for splits + + @property + def criterion(self): + if self._criterion == unif: + return 'unif' + else: + return 'mse' + + @criterion.setter + def criterion(self, value): + if value == 'unif': + self._set('_criterion', unif) + # self._forest.set_criterion(unif) + elif value == 'mse': + self._set('_criterion', mse) + # self._forest.set_criterion(mse) + else: + raise ValueError("``criterion`` must be either 'unif' or 'mse'.") diff --git a/tick/inference/src/CMakeLists.txt b/tick/inference/src/CMakeLists.txt index acbb54d80..706bc50bd 100644 --- a/tick/inference/src/CMakeLists.txt +++ b/tick/inference/src/CMakeLists.txt @@ -3,7 +3,8 @@ add_library(tick_inference EXCLUDE_FROM_ALL hawkes_em.cpp hawkes_em.h hawkes_adm4.h hawkes_adm4.cpp hawkes_basis_kernels.cpp hawkes_basis_kernels.h - hawkes_sumgaussians.h hawkes_sumgaussians.cpp) + hawkes_sumgaussians.h hawkes_sumgaussians.cpp + online_forest_regressor.h online_forest_regressor.cpp) target_link_libraries(tick_inference diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp new file mode 100644 index 000000000..68104f690 --- /dev/null +++ b/tick/inference/src/online_forest_regressor.cpp @@ -0,0 +1,655 @@ + +// License: BSD 3 clause + +#include "online_forest_regressor.h" + +/********************************************************************************* + * Node methods + *********************************************************************************/ + +template +Node::Node(Tree &tree, ulong parent) + : _tree(tree) { + _n_samples = 0; + _is_leaf = true; + _left = 0; + _right = 0; + _weight = 1; + _weight_tree = 1; + this->_parent = parent; +} + +template +Node::Node(const Node &node) + : _tree(node._tree), + _left(node._left), _right(node._right), _parent(node._parent), + _feature(node._feature), _threshold(node._threshold), + _n_samples(node._n_samples), + _x_t(node._x_t), + _y_t(node._y_t), + _weight(node._weight), _weight_tree(node._weight_tree), + _is_leaf(node._is_leaf) {} + +template +Node::Node(const Node &&node) : _tree(_tree) { + _left = node._left; + _right = node._right; + _parent = node._parent; + _feature = node._feature; + _threshold = node._threshold; + _n_samples = node._n_samples; + _weight = node._weight; + _weight_tree = node._weight_tree; + _is_leaf = node._is_leaf; + _x_t = node._x_t; +} + +template +Node::~Node() {} + +template +void Node::update_downwards(const ArrayDouble &x_t, double y_t) { + _n_samples++; + // TODO: Make compute loss virtual insteal + update_weight(y_t); + update_predict(y_t); +} + +template +void Node::update_weight(const double y_t) { + _weight *= exp(-step() * loss(y_t)); +} + +template +inline Tree &Node::tree() const { + return _tree; +} + +template +inline NodeType &Node::node(ulong index) const { + return _tree.node(index); +} + +template +ulong Node::n_features() const { + return _tree.n_features(); +} + +template +inline double Node::step() const { + return _tree.step(); +} + +template +void Node::print() { + std::cout // << "Node(i: " << _index << ", p: " << _parent + // << ", f: " << _feature + // << ", th: " << _threshold + << ", l: " << _left + << ", r: " << _right + // << ", d: " << _depth 
+ // << ", n: " << n_samples() + // << ", i: " << _is_leaf + // << ", avg: " << std::setprecision(2) << _labels_average + // << ", feat_min=[" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) + // << _features_min[1] << "]" + // << ", feat_max=[" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) + // << _features_max[1] << "]" + << ")\n"; +} + +template +inline ulong Node::parent() const { + return _parent; +} + +template +inline ulong Node::left() const { + return _left; +} + +template +inline Node &Node::set_left(ulong left) { + _left = left; + return *this; +} + +template +inline ulong Node::right() const { + return _right; +} + +template +inline Node &Node::set_right(ulong right) { + _right = right; + return *this; +} + +template +inline bool Node::is_leaf() const { + return _is_leaf; +} + +template +inline Node &Node::set_is_leaf(bool is_leaf) { + _is_leaf = is_leaf; + return *this; +} + +template +inline ulong Node::feature() const { + return _feature; +} + +template +inline Node &Node::set_feature(ulong feature) { + _feature = feature; + return *this; +} + +template +inline double Node::threshold() const { + return _threshold; +} + +template +inline Node &Node::set_threshold(double threshold) { + _threshold = threshold; + return *this; +} + +template +inline ulong Node::n_samples() const { + return _n_samples; +} + +template +inline Node &Node::set_n_samples(ulong n_samples) { + _n_samples = n_samples; + return *this; +} + +template +inline double Node::weight() const { + return _weight; +} + +template +inline Node &Node::set_weight(double weight) { + _weight = weight; + return *this; +} + +template +inline double Node::weight_tree() const { + return _weight_tree; +} + +template +inline Node &Node::set_weight_tree(double weight_tree) { + _weight_tree = weight_tree; + return *this; +} + +template +inline const ArrayDouble &Node::x_t() const { + return _x_t; +} + +template +inline Node &Node::set_x_t(const ArrayDouble &x_t) { + _x_t = x_t; + return *this; +} + +template +inline double Node::y_t() const { + return _y_t; +} + +template +inline Node &Node::set_y_t(const double y_t) { + _y_t = y_t; + return *this; +} + +/********************************************************************************* + * NodeRegressor methods + *********************************************************************************/ + +NodeRegressor::NodeRegressor(Tree &tree, ulong parent) + : Node(tree, parent) { + _predict = 0; +} + +NodeRegressor::NodeRegressor(const NodeRegressor &node) + : Node(node), _predict(node._predict), _y_t(node._y_t) {} + +NodeRegressor::NodeRegressor(const NodeRegressor &&node) + : Node(node) { + _predict = node._predict; + _y_t = node._y_t; +} + +NodeRegressor::~NodeRegressor() {} + +inline double NodeRegressor::predict() const { + return _predict; +} + +void NodeRegressor::update_predict(double y_t) { + // When a node is updated, it necessarily contains already a sample + _predict = ((_n_samples - 1) * _predict + y_t) / _n_samples; +} + +double NodeRegressor::loss(const double y_t) { + double diff = _predict - y_t; + return diff * diff / 2; +} + +void NodeRegressor::print() { + std::cout // << "Node(idx: " << _index << ", parent: " << _parent + // << ", f: " << _feature + // << ", th: " << _threshold + << ", left: " << _left + << ", right: " << _right + // << ", d: " << _depth + // << ", n: " << n_samples() + // << ", i: " << _is_leaf + << ", thresh: " << _threshold + << ", y_hat: " << _predict + << ", sample: "; + // << ", 
has_sample:" << _has_sample; + if (_is_leaf) { + std::cout << "[" << std::setprecision(2) << _x_t[0] << ", " << std::setprecision(2) << _x_t[1] + << "]"; + } else { + std::cout << "null"; + } + std::cout << ", weight: " << _weight; + std::cout << ", weight_tree: " << _weight_tree; + std::cout << ")\n"; +} + +/********************************************************************************* + * Tree methods + *********************************************************************************/ + +template +Tree::Tree(const Tree &tree) + : nodes(tree.nodes), forest(tree.forest) { +} + +template +Tree::Tree(const Tree &&tree) : nodes(tree.nodes), forest(tree.forest) { +} + +//template +//ulong Node::n_features() const { +// return _tree.n_features(); +//} + +template +Tree::Tree(OnlineForestRegressor &forest) : forest(forest) { + // TODO: pre-allocate the vector to make things faster ? + add_node(0, 0); +} + +template +ulong Tree::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) { + // std::cout << "Splitting node " << index << std::endl; + ulong left = add_node(index, iteration); + ulong right = add_node(index, iteration); + node(index).set_left(left).set_right(right).set_is_leaf(false); + + // TODO: better feature sampling + ulong feature = forest.sample_feature(); + + double x1_tj = x_t[feature]; + double x2_tj = node(index).x_t()[feature]; + double threshold; + + // The leaf that contains the passed sample (x_t, y_t) + ulong data_leaf; + ulong other_leaf; + + // std::cout << "x1_tj= " << x1_tj << " x2_tj= " << x2_tj << " threshold= " << threshold << std::endl; + // TODO: what if x1_tj == x2_tj. Must be taken care of by sample_feature() + if (x1_tj < x2_tj) { + threshold = forest.sample_threshold(x1_tj, x2_tj); + data_leaf = left; + other_leaf = right; + } else { + threshold = forest.sample_threshold(x2_tj, x1_tj); + data_leaf = right; + other_leaf = left; + } + // TODO: code a move_sample + + node(index).set_feature(feature).set_threshold(threshold); + + // We pass the sample to the new leaves, and initialize the _label_average with the value + node(data_leaf).set_x_t(x_t).set_y_t(y_t); + node(other_leaf).set_x_t(node(index).x_t()).set_y_t(node(index).y_t()); + + // Update downwards of v' + node(other_leaf).update_downwards(node(index).x_t(), node(index).y_t()); + // Update upwards of v': it's a leaf + node(other_leaf).set_weight_tree(node(other_leaf).weight()); + // Update downwards of v'' + node(data_leaf).update_downwards(x_t, y_t); + // Note: the update_up of v'' is done in the go_up method, called in fit() + + return data_leaf; +} + +template +ulong Tree::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { + // Find the leaf that contains the sample + // Start at the root. Index of the root is always 0 + // If predict == true, this call to find_leaf is for + // prediction only, so that no leaf update and splits can be done + ulong index_current_node = 0; + bool is_leaf = false; + while (!is_leaf) { + // Get the current node + Node ¤t_node = node(index_current_node); + if (!predict) { + current_node.update_downwards(x_t, y_t); + } + // Is the node a leaf ? 
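+    // When fitting (predict == false), the node has just absorbed the sample
+    // in update_downwards above: _n_samples is incremented, its aggregation
+    // weight is multiplied by exp(-step * loss(y_t)) and its running mean
+    // prediction is updated. The sample is then routed to the left child if
+    // its value on the split feature is at most the threshold, and to the
+    // right child otherwise.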
+ is_leaf = current_node.is_leaf(); + if (!is_leaf) { + if (x_t[current_node.feature()] <= current_node.threshold()) { + index_current_node = current_node.left(); + } else { + index_current_node = current_node.right(); + } + } + } + return index_current_node; +} + +template +void Tree::go_upwards(ulong leaf_index) { + + ulong current = leaf_index; + + while (true) { + // TODO: use a node::update_upward + Node ¤t_node = node(current); + if (current_node.is_leaf()) { + current_node.set_weight_tree(current_node.weight()); + } else { + double w = current_node.weight(); + double w0 = node(current_node.left()).weight_tree(); + double w1 = node(current_node.right()).weight_tree(); + current_node.set_weight_tree((w + w0 * w1) / 2); +// double a = current_node.weight(); +// double b = weight_tree_left + weight_tree_right; +// double toto; +// if(a > b) { +// toto = a + log(1 + exp(b - a)) - log(2); +// } else { +// toto = b + log(1 + exp(a - b)) - log(2); +// } + } + if (current == 0) { + break; + } + // We must update the root node + current = node(current).parent(); + } +} + +template +inline ulong Tree::n_nodes() const { + return _n_nodes; +} + +template +void Tree::fit(const ArrayDouble &x_t, double y_t) { + // TODO: Test that the size does not change within successive calls to fit + if (iteration == 0) { + nodes[0].set_x_t(x_t).set_y_t(y_t); + iteration++; + return; + } + + ulong leaf = go_downwards(x_t, y_t, false); + ulong new_leaf = split_leaf(leaf, x_t, y_t); + +// for(ulong j=0; j < n_features(); ++j) { +// double delta = std::abs(x_t[j] - node(leaf).sample().first[j]); +// if (delta > 0.) { +// new_leaf = split_node(leaf, x_t, y_t); +// break; +// } +// } + go_upwards(new_leaf); + iteration++; +} + +/********************************************************************************* +* TreeRegressor methods +*********************************************************************************/ + +TreeRegressor::TreeRegressor(OnlineForestRegressor &forest) + : Tree(forest) {} + +TreeRegressor::TreeRegressor(const TreeRegressor &tree) + : Tree(forest) {} + +TreeRegressor::TreeRegressor(const TreeRegressor &&tree) + : Tree(forest) {} + +double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { + ulong leaf = go_downwards(x_t, 0., true); + if (!use_aggregation) { + return node(leaf).y_t(); + } + ulong current = leaf; + // The child of the current node that does not contain the data + ulong other; + ulong parent; + double weight; + while (true) { + NodeRegressor ¤t_node = node(current); + if (current_node.is_leaf()) { + weight = current_node.weight() * current_node.predict(); + // weight = std::exp(current_node.weight()) * current_node.labels_average(); + } else { + weight = 0.5 * current_node.weight() * current_node.predict() + + 0.5 * node(other).weight_tree() * weight; +// weight = 0.5 * std::exp(current_node.weight()) * current_node.labels_average() +// + 0.5 * std::exp(node(other).weight_tree() + weight); + } + parent = node(current).parent(); + if (node(parent).left() == current) { + other = node(parent).right(); + } else { + other = node(parent).left(); + } + // Root must be updated as well + if (current == 0) { + break; + } + current = parent; + } + return weight / nodes[0].weight_tree(); + // return weight / std::exp(nodes[0].weight_tree()); +} + +template +ulong Tree::add_node(ulong parent, ulong creation_time) { + nodes.emplace_back(*this, parent); + return _n_nodes++; +} + +template +inline ulong Tree::n_features() const { + return forest.n_features(); +} + +template 
+inline double Tree::step() const { + return forest.step(); +} + +template +inline Criterion Tree::criterion() const { + return forest.criterion(); +} + +/********************************************************************************* + * OnlineForestRegressor methods + *********************************************************************************/ + +OnlineForestRegressor::OnlineForestRegressor(uint32_t n_trees, + double step, + Criterion criterion, + int32_t n_threads, + int seed, + bool verbose) + : // _n_trees(n_trees), + _n_threads(n_threads), _criterion(criterion), _step(step), _verbose(verbose), trees() { + // No iteration so far + _n_trees = 10; + _iteration = 0; + create_trees(); + // Seed the random number generators + set_seed(seed); +} + +OnlineForestRegressor::~OnlineForestRegressor() {} + +void OnlineForestRegressor::create_trees() { + // Just in case... + trees.clear(); + trees.reserve(_n_trees); + for (uint32_t i = 0; i < _n_trees; ++i) { + trees.emplace_back(*this); + } +} + +void OnlineForestRegressor::fit(const SArrayDouble2dPtr features, + const SArrayDoublePtr labels) { + ulong n_samples = features->n_rows(); + ulong n_features = features->n_cols(); + set_n_features(n_features); + for (ulong i = 0; i < n_samples; ++i) { + for (TreeRegressor &tree : trees) { + // Fit the tree online using the new data point + tree.fit(view_row(*features, i), (*labels)[i]); + } + _iteration++; + } +} + +void OnlineForestRegressor::predict(const SArrayDouble2dPtr features, + SArrayDoublePtr predictions, + bool use_aggregation) { + if (_iteration > 0) { + ulong n_samples = features->n_rows(); + for (ulong i = 0; i < n_samples; ++i) { + // The prediction is simply the average of the predictions + double y_pred = 0; + for (TreeRegressor &tree : trees) { + y_pred += tree.predict(view_row(*features, i), use_aggregation); + } + (*predictions)[i] = y_pred / _n_trees; + } + } else { + TICK_ERROR("You must call ``fit`` before ``predict``.") + } +} + +inline ulong OnlineForestRegressor::sample_feature() { + return rand.uniform_int(0L, n_features() - 1); +} + +inline double OnlineForestRegressor::sample_threshold(double left, double right) { + return rand.uniform(left, right); +} + +//inline double OnlineForestRegressor::step() const { +// return _step; +//} +// +//void OnlineForestRegressor::print() { +// for (Tree &tree: trees) { +// tree.print(); +// } +//} +// +//inline ulong OnlineForestRegressor::n_samples() const { +// if (_iteration > 0) { +// return _iteration; +// } else { +// TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.") +// } +//} + +//inline ulong OnlineForestRegressor::n_features() const { +// if (_iteration > 0) { +// return _n_features; +// } else { +// TICK_ERROR("You must call ``fit`` before asking for ``n_features``.") +// } +//} + +//inline OnlineForestRegressor &OnlineForestRegressor::set_n_features(ulong n_features) { +// if (_iteration == 0) { +// _n_features = n_features; +// } else { +// TICK_ERROR("OnlineForest::set_n_features can be called only once !") +// } +// return *this; +//} + +//inline uint32_t OnlineForestRegressor::n_trees() const { +// return _n_trees; +//} + + +//inline OnlineForestRegressor &OnlineForestRegressor::set_n_trees(uint32_t n_trees) { +// _n_trees = n_trees; +// return *this; +//} + +//inline int32_t OnlineForestRegressor::n_threads() const { +// return _n_threads; +//} + +//OnlineForestRegressor &OnlineForestRegressor::set_n_threads(int32_t n_threads) { +// _n_threads = n_threads; +// return *this; +//} + +//inline 
Criterion OnlineForestRegressor::criterion() const {
+//  return _criterion;
+//}

+//inline OnlineForestRegressor &OnlineForestRegressor::set_criterion(Criterion criterion) {
+//  _criterion = criterion;
+//  return *this;
+//}
+//
+//inline int OnlineForestRegressor::seed() const {
+//  return _seed;
+//}

+//inline OnlineForestRegressor &OnlineForestRegressor::set_seed(int seed) {
+//  _seed = seed;
+//  rand.reseed(seed);
+//  return *this;
+//}

+//inline bool OnlineForestRegressor::verbose() const {
+//  return _verbose;
+//}
+//
+//inline OnlineForestRegressor &OnlineForestRegressor::set_verbose(bool verbose) {
+//  _verbose = verbose;
+//  return *this;
+//}
diff --git a/tick/inference/src/online_forest_regressor.h b/tick/inference/src/online_forest_regressor.h
new file mode 100644
index 000000000..77ad35649
--- /dev/null
+++ b/tick/inference/src/online_forest_regressor.h
@@ -0,0 +1,332 @@
+
+#ifndef TICK_ONLINEFOREST_H
+#define TICK_ONLINEFOREST_H
+
+// License: BSD 3 clause
+
+#include "base.h"
+#include <iomanip>
+#include "../../random/src/rand.h"
+
+
+// TODO: be very careful with binary features: if the range is 0 on all coordinates, do nothing
+// TODO: code a classifier
+
+// TODO: pick the feature proportionally to the ratio of the side length over the perimeter. This assumes that we
+// record the true dimensions of the cell, and the threshold is then also sampled within it
+// TODO: pick the feature proportionally to the ratio of the feature ranges, but beware of the case of discrete
+// features
+// TODO: an option to create an empty cell, that is, to forget the data in a cell once it has been split
+
+// TODO: feature choice based on the labels
+
+// TODO: a fit_online that takes a mini-batch and updates the forest; in that case we only put one point per
+// cell, so there is no need to record the sample indices or the points. This assumes that min_samples_split == 1
+
+// TODO: for regression, we use the average of the y values
+// TODO: for classification, we do not use raw frequencies, we use regularized frequencies with a Dirichlet prior: p_c = (n_c + 0.5) / (\sum_c n_c + C / 2). Make this an option
+
+// TODO: check that not using reserve in the forest works as well...
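+
+// The classes below implement the exponential-weights aggregation used in
+// online_forest_regressor.cpp: a node weight is multiplied by
+// exp(-step * loss(y_t)) each time a sample goes through the node, and the
+// subtree weights satisfy weight_tree(v) = weight(v) at a leaf, and
+// weight_tree(v) = (weight(v) + weight_tree(left) * weight_tree(right)) / 2
+// otherwise. The commented-out code in go_upwards hints at a log-domain
+// version of this recursion to avoid underflow of the weights. The helper
+// below is only a sketch of that variant: log_sum_exp is hypothetical (it is
+// used nowhere else in this file) and assumes <cmath> is reachable through
+// "base.h".
+
+// Computes log(exp(a) + exp(b)) while avoiding overflow and underflow
+inline double log_sum_exp(double a, double b) {
+  if (a > b) return a + std::log1p(std::exp(b - a));
+  return b + std::log1p(std::exp(a - b));
+}
+
+// With weights kept in log scale, the upward update would then read:
+//   log_weight_tree = log_sum_exp(log_weight, log_weight_tree_left + log_weight_tree_right) - std::log(2);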
+ + +enum class Criterion { + unif = 0, + mse +}; + +typedef uint32_t index_t; + +template +class Tree; + +/********************************************************************************* + * Node + *********************************************************************************/ + +template +class Node { + protected: + // Tree containing the node + Tree &_tree; + // Index of the left child + ulong _left; + // Index of the right child + ulong _right; + // Index of the parent + ulong _parent; + // Index of the feature used for the split + ulong _feature; + // Threshold used for the split + double _threshold; + // Number of samples in the node + ulong _n_samples; + // The features of the sample saved in the node + // TODO: use a unique_ptr on x_t + ArrayDouble _x_t; + // The label of the sample saved in the node + double _y_t; + // Aggregation weight for the node + double _weight; + // Aggregation weight for the sub-tree starting at this node + double _weight_tree; + // true if the node is a leaf + bool _is_leaf; + + public: + Node(Tree &tree, ulong parent); + Node(const Node &node); + Node(const Node &&node); + Node &operator=(const Node &) = delete; + Node &operator=(const Node &&) = delete; + virtual ~Node(); + + // Update to apply to a node when going forward in the tree (towards leaves) + virtual void update_downwards(const ArrayDouble &x_t, double y_t); + // Update of the aggregation weights + virtual void update_weight(const double y_t) final; + // Update the prediction of the label + virtual void update_predict(double y_t) = 0; + // Loss function used for aggregation + virtual double loss(const double y_t) = 0; + + inline Tree &tree() const; + inline NodeType &node(ulong index) const; + ulong n_features() const; + inline double step() const; + + virtual void print(); + + inline ulong parent() const; + inline ulong left() const; + inline Node &set_left(ulong left); + inline ulong right() const; + inline Node &set_right(ulong right); + inline bool is_leaf() const; + inline Node &set_is_leaf(bool is_leaf); + inline ulong feature() const; + inline Node &set_feature(ulong feature); + inline double threshold() const; + inline Node &set_threshold(double threshold); + inline ulong n_samples() const; + inline Node &set_n_samples(ulong n_samples); + inline double weight() const; + inline Node &set_weight(double weight); + inline double weight_tree() const; + inline Node &set_weight_tree(double weight); + inline const ArrayDouble &x_t() const; + inline Node &set_x_t(const ArrayDouble &x_t); + inline double y_t() const; + inline Node& set_y_t(const double y_t); +}; + +/********************************************************************************* + * NodeRegressor + *********************************************************************************/ + +class NodeRegressor : public Node { + private: + // Average of the labels in the node (regression only for now) + double _predict = 0; + // Label of the stored sample point + double _y_t; + + public: + NodeRegressor(Tree &tree, ulong parent); + NodeRegressor(const NodeRegressor &node); + NodeRegressor(const NodeRegressor &&node); + NodeRegressor &operator=(const NodeRegressor &) = delete; + NodeRegressor &operator=(const NodeRegressor &&) = delete; + virtual ~NodeRegressor(); + + inline double predict() const; + virtual void update_predict(double y_t); + virtual double loss(const double y_t); + virtual void print(); + + +}; + +class OnlineForestRegressor; + +/********************************************************************************* + * 
Tree + *********************************************************************************/ + +template +class Tree { + protected: + // The forest of the tree + OnlineForestRegressor &forest; + // Number of nodes in the tree + ulong _n_nodes = 0; + // Iteration counter + ulong iteration = 0; + // Nodes of the tree + std::vector nodes = std::vector(); + // Split the node at given index + ulong split_leaf(ulong index, const ArrayDouble &x_t, double y_t); + // Add nodes in the tree + virtual ulong add_node(ulong parent, ulong creation_time); + + ulong go_downwards(const ArrayDouble &x_t, double y_t, bool predict); + void go_upwards(ulong leaf_index); + + public: + Tree(OnlineForestRegressor &forest); + Tree(const Tree &tree); + Tree(const Tree &&tree); + Tree &operator=(const Tree &) = delete; + Tree &operator=(const Tree &&) = delete; + ~Tree() {} + + void fit(const ArrayDouble &x_t, double y_t); + + // double predict(const ArrayDouble& x_t); + + inline ulong n_features() const; + inline ulong n_nodes() const; + inline double step() const; + + void print() { + for (NodeType &node : nodes) { + node.print(); + } + } + + inline Criterion criterion() const; + + NodeType &node(ulong index) { + return nodes[index]; + } +}; + +/********************************************************************************* + * TreeRegressor + *********************************************************************************/ + +class TreeRegressor : public Tree { + public: + TreeRegressor(OnlineForestRegressor &forest); + TreeRegressor(const TreeRegressor &tree); + TreeRegressor(const TreeRegressor &&tree); + TreeRegressor &operator=(const TreeRegressor &) = delete; + TreeRegressor &operator=(const TreeRegressor &&) = delete; + + double predict(const ArrayDouble &x_t, bool use_aggregation); +}; + +/********************************************************************************* + * OnlineForestRegressor + *********************************************************************************/ + +class OnlineForestRegressor { + private: + // Number of Trees in the forest + uint32_t _n_trees; + // Number of threads to use for parallel growing of trees + int32_t _n_threads; + // Criterion used for splitting (not used for now) + Criterion _criterion; + // Step-size used for aggregation + double _step; + // Number of features. 
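+  // (set once, at the first call to fit(): set_n_features below raises an
+  // error if called after the first iteration)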
+ ulong _n_features; + // Seed for random number generation + int _seed; + // Verbose things or not + bool _verbose; + // Iteration counter + ulong _iteration; + // The list of trees in the forest + std::vector trees; + // Random number generator for feature and threshold sampling + Rand rand; + // Create trees + void create_trees(); + + public: + OnlineForestRegressor(uint32_t n_trees, double step, Criterion criterion, int32_t n_threads, int seed, bool verbose); + virtual ~OnlineForestRegressor(); + + void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); + void predict(const SArrayDouble2dPtr features, SArrayDoublePtr predictions, bool use_aggregation); + + inline ulong sample_feature(); + inline double sample_threshold(double left, double right); + + inline double step() const { + return _step; + } + + void print() { + for (Tree &tree: trees) { + tree.print(); + } + } + + inline ulong n_samples() const { + if (_iteration > 0) { + return _iteration; + } else { + TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.") + } + } + + inline ulong n_features() const { + if (_iteration > 0) { + return _n_features; + } else { + TICK_ERROR("You must call ``fit`` before asking for ``n_features``.") + } + } + + inline OnlineForestRegressor &set_n_features(ulong n_features) { + if (_iteration == 0) { + _n_features = n_features; + } else { + TICK_ERROR("OnlineForest::set_n_features can be called only once !") + } + return *this; + } + + inline uint32_t n_trees() const { + return _n_trees; + } + + inline OnlineForestRegressor &set_n_trees(uint32_t n_trees) { + _n_trees = n_trees; + return *this; + } + + inline int32_t n_threads() const { + return _n_threads; + } + + inline OnlineForestRegressor &set_n_threads(int32_t n_threads) { + _n_threads = n_threads; + return *this; + } + + inline Criterion criterion() const { + return _criterion; + } + + inline OnlineForestRegressor &set_criterion(Criterion criterion) { + _criterion = criterion; + return *this; + } + + inline int seed() const { + return _seed; + } + + inline OnlineForestRegressor &set_seed(int seed) { + _seed = seed; + rand.reseed(seed); + return *this; + } +// inline bool verbose() const; +// inline OnlineForestRegressor &set_verbose(bool verbose); +}; + +#endif //TICK_ONLINEFOREST_H diff --git a/tick/inference/swig/inference_module.i b/tick/inference/swig/inference_module.i index 964c9d72f..dca630e91 100644 --- a/tick/inference/swig/inference_module.i +++ b/tick/inference/swig/inference_module.i @@ -22,4 +22,6 @@ %include hawkes_em.i %include hawkes_adm4.i %include hawkes_basis_kernels.i -%include hawkes_sumgaussians.i \ No newline at end of file +%include hawkes_sumgaussians.i + +%include online_forest_regressor.i diff --git a/tick/inference/swig/online_forest_regressor.i b/tick/inference/swig/online_forest_regressor.i new file mode 100644 index 000000000..4fbe4c36d --- /dev/null +++ b/tick/inference/swig/online_forest_regressor.i @@ -0,0 +1,41 @@ +// License: BSD 3 clause + +%include std_shared_ptr.i +%shared_ptr(OnlineForestRegressor); + +%{ +#include "online_forest_regressor.h" +%} + + +enum class Criterion { + unif = 0, + mse +}; + +class OnlineForestRegressor { + public: + OnlineForestRegressor(uint32_t n_trees, double step, Criterion criterion, int32_t n_threads, int seed, bool verbose); + + void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); + void predict(const SArrayDouble2dPtr features, SArrayDoublePtr predictions, bool use_aggregation); + + inline double step() const; + void 
print(); + + ulong n_samples() const; + ulong n_features() const; + OnlineForestRegressor &set_n_features(ulong n_features); + + // uint32_t n_trees() const; + // OnlineForestRegressor &set_n_trees(uint32_t n_trees); + + int32_t n_threads() const; + OnlineForestRegressor &set_n_threads(int32_t n_threads); + Criterion criterion() const; + OnlineForestRegressor &set_criterion(Criterion criterion); + int seed() const; + OnlineForestRegressor &set_seed(int seed); + // bool verbose() const; + // OnlineForestRegressor &set_verbose(bool verbose); +}; From 0b04c2e4ca4db97ffcfd135e65d671f4b6135831 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 15 Nov 2017 17:52:55 +0100 Subject: [PATCH 02/32] ... --- .../inference/src/online_forest_regressor.cpp | 200 +++++------------- tick/inference/src/online_forest_regressor.h | 128 ++++------- 2 files changed, 96 insertions(+), 232 deletions(-) diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp index 68104f690..e2328da9e 100644 --- a/tick/inference/src/online_forest_regressor.cpp +++ b/tick/inference/src/online_forest_regressor.cpp @@ -4,11 +4,10 @@ #include "online_forest_regressor.h" /********************************************************************************* - * Node methods + * NodeRegressor methods *********************************************************************************/ -template -Node::Node(Tree &tree, ulong parent) +NodeRegressor::NodeRegressor(TreeRegressor &tree, ulong parent) : _tree(tree) { _n_samples = 0; _is_leaf = true; @@ -17,10 +16,10 @@ Node::Node(Tree &tree, ulong parent) _weight = 1; _weight_tree = 1; this->_parent = parent; + _predict = 0; } -template -Node::Node(const Node &node) +NodeRegressor::NodeRegressor(const NodeRegressor &node) : _tree(node._tree), _left(node._left), _right(node._right), _parent(node._parent), _feature(node._feature), _threshold(node._threshold), @@ -28,10 +27,10 @@ Node::Node(const Node &node) _x_t(node._x_t), _y_t(node._y_t), _weight(node._weight), _weight_tree(node._weight_tree), - _is_leaf(node._is_leaf) {} + _is_leaf(node._is_leaf), + _predict(node._predict) { } -template -Node::Node(const Node &&node) : _tree(_tree) { +NodeRegressor::NodeRegressor(const NodeRegressor &&node) : _tree(_tree) { _left = node._left; _right = node._right; _parent = node._parent; @@ -42,197 +41,127 @@ Node::Node(const Node &&node) : _tree(_tree) { _weight_tree = node._weight_tree; _is_leaf = node._is_leaf; _x_t = node._x_t; + _y_t = node._y_t; } -template -Node::~Node() {} - -template -void Node::update_downwards(const ArrayDouble &x_t, double y_t) { +void NodeRegressor::update_downwards(const ArrayDouble &x_t, double y_t) { _n_samples++; // TODO: Make compute loss virtual insteal update_weight(y_t); update_predict(y_t); } -template -void Node::update_weight(const double y_t) { +void NodeRegressor::update_weight(const double y_t) { _weight *= exp(-step() * loss(y_t)); } -template -inline Tree &Node::tree() const { - return _tree; -} - -template -inline NodeType &Node::node(ulong index) const { +inline NodeRegressor &NodeRegressor::node(ulong index) const { return _tree.node(index); } -template -ulong Node::n_features() const { +ulong NodeRegressor::n_features() const { return _tree.n_features(); } -template -inline double Node::step() const { - return _tree.step(); -} -template -void Node::print() { - std::cout // << "Node(i: " << _index << ", p: " << _parent - // << ", f: " << _feature - // << ", th: " << _threshold - << ", l: " << _left - << ", 
r: " << _right - // << ", d: " << _depth - // << ", n: " << n_samples() - // << ", i: " << _is_leaf - // << ", avg: " << std::setprecision(2) << _labels_average - // << ", feat_min=[" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) - // << _features_min[1] << "]" - // << ", feat_max=[" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) - // << _features_max[1] << "]" - << ")\n"; +inline double NodeRegressor::step() const { + return _tree.step(); } -template -inline ulong Node::parent() const { +inline ulong NodeRegressor::parent() const { return _parent; } -template -inline ulong Node::left() const { +inline ulong NodeRegressor::left() const { return _left; } -template -inline Node &Node::set_left(ulong left) { +inline NodeRegressor &NodeRegressor::set_left(ulong left) { _left = left; return *this; } -template -inline ulong Node::right() const { +inline ulong NodeRegressor::right() const { return _right; } -template -inline Node &Node::set_right(ulong right) { +inline NodeRegressor &NodeRegressor::set_right(ulong right) { _right = right; return *this; } -template -inline bool Node::is_leaf() const { +inline bool NodeRegressor::is_leaf() const { return _is_leaf; } -template -inline Node &Node::set_is_leaf(bool is_leaf) { +inline NodeRegressor &NodeRegressor::set_is_leaf(bool is_leaf) { _is_leaf = is_leaf; return *this; } -template -inline ulong Node::feature() const { +inline ulong NodeRegressor::feature() const { return _feature; } -template -inline Node &Node::set_feature(ulong feature) { +inline NodeRegressor &NodeRegressor::set_feature(ulong feature) { _feature = feature; return *this; } -template -inline double Node::threshold() const { +inline double NodeRegressor::threshold() const { return _threshold; } -template -inline Node &Node::set_threshold(double threshold) { +inline NodeRegressor &NodeRegressor::set_threshold(double threshold) { _threshold = threshold; return *this; } -template -inline ulong Node::n_samples() const { +inline ulong NodeRegressor::n_samples() const { return _n_samples; } -template -inline Node &Node::set_n_samples(ulong n_samples) { +inline NodeRegressor &NodeRegressor::set_n_samples(ulong n_samples) { _n_samples = n_samples; return *this; } -template -inline double Node::weight() const { +inline double NodeRegressor::weight() const { return _weight; } -template -inline Node &Node::set_weight(double weight) { +inline NodeRegressor &NodeRegressor::set_weight(double weight) { _weight = weight; return *this; } -template -inline double Node::weight_tree() const { +inline double NodeRegressor::weight_tree() const { return _weight_tree; } -template -inline Node &Node::set_weight_tree(double weight_tree) { +inline NodeRegressor &NodeRegressor::set_weight_tree(double weight_tree) { _weight_tree = weight_tree; return *this; } -template -inline const ArrayDouble &Node::x_t() const { +inline const ArrayDouble &NodeRegressor::x_t() const { return _x_t; } -template -inline Node &Node::set_x_t(const ArrayDouble &x_t) { +inline NodeRegressor &NodeRegressor::set_x_t(const ArrayDouble &x_t) { _x_t = x_t; return *this; } -template -inline double Node::y_t() const { +inline double NodeRegressor::y_t() const { return _y_t; } -template -inline Node &Node::set_y_t(const double y_t) { +inline NodeRegressor & NodeRegressor::set_y_t(const double y_t) { _y_t = y_t; return *this; } -/********************************************************************************* - * NodeRegressor methods - 
*********************************************************************************/ - -NodeRegressor::NodeRegressor(Tree &tree, ulong parent) - : Node(tree, parent) { - _predict = 0; -} - -NodeRegressor::NodeRegressor(const NodeRegressor &node) - : Node(node), _predict(node._predict), _y_t(node._y_t) {} - -NodeRegressor::NodeRegressor(const NodeRegressor &&node) - : Node(node) { - _predict = node._predict; - _y_t = node._y_t; -} - -NodeRegressor::~NodeRegressor() {} - inline double NodeRegressor::predict() const { return _predict; } @@ -272,16 +201,14 @@ void NodeRegressor::print() { } /********************************************************************************* - * Tree methods - *********************************************************************************/ +* TreeRegressor methods +*********************************************************************************/ -template -Tree::Tree(const Tree &tree) +TreeRegressor::TreeRegressor(const TreeRegressor &tree) : nodes(tree.nodes), forest(tree.forest) { } -template -Tree::Tree(const Tree &&tree) : nodes(tree.nodes), forest(tree.forest) { +TreeRegressor::TreeRegressor(const TreeRegressor &&tree) : nodes(tree.nodes), forest(tree.forest) { } //template @@ -289,14 +216,13 @@ Tree::Tree(const Tree &&tree) : nodes(tree.nodes), forest(tr // return _tree.n_features(); //} -template -Tree::Tree(OnlineForestRegressor &forest) : forest(forest) { + +TreeRegressor::TreeRegressor(OnlineForestRegressor &forest) : forest(forest) { // TODO: pre-allocate the vector to make things faster ? add_node(0, 0); } -template -ulong Tree::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) { +ulong TreeRegressor::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) { // std::cout << "Splitting node " << index << std::endl; ulong left = add_node(index, iteration); ulong right = add_node(index, iteration); @@ -343,8 +269,7 @@ ulong Tree::split_leaf(ulong index, const ArrayDouble &x_t, double y_t return data_leaf; } -template -ulong Tree::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { +ulong TreeRegressor::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { // Find the leaf that contains the sample // Start at the root. 
Index of the root is always 0 // If predict == true, this call to find_leaf is for @@ -353,7 +278,7 @@ ulong Tree::go_downwards(const ArrayDouble &x_t, double y_t, bool pred bool is_leaf = false; while (!is_leaf) { // Get the current node - Node ¤t_node = node(index_current_node); + NodeRegressor ¤t_node = node(index_current_node); if (!predict) { current_node.update_downwards(x_t, y_t); } @@ -370,14 +295,13 @@ ulong Tree::go_downwards(const ArrayDouble &x_t, double y_t, bool pred return index_current_node; } -template -void Tree::go_upwards(ulong leaf_index) { +void TreeRegressor::go_upwards(ulong leaf_index) { ulong current = leaf_index; while (true) { // TODO: use a node::update_upward - Node ¤t_node = node(current); + NodeRegressor ¤t_node = node(current); if (current_node.is_leaf()) { current_node.set_weight_tree(current_node.weight()); } else { @@ -402,13 +326,12 @@ void Tree::go_upwards(ulong leaf_index) { } } -template -inline ulong Tree::n_nodes() const { + +inline ulong TreeRegressor::n_nodes() const { return _n_nodes; } -template -void Tree::fit(const ArrayDouble &x_t, double y_t) { +void TreeRegressor::fit(const ArrayDouble &x_t, double y_t) { // TODO: Test that the size does not change within successive calls to fit if (iteration == 0) { nodes[0].set_x_t(x_t).set_y_t(y_t); @@ -430,19 +353,6 @@ void Tree::fit(const ArrayDouble &x_t, double y_t) { iteration++; } -/********************************************************************************* -* TreeRegressor methods -*********************************************************************************/ - -TreeRegressor::TreeRegressor(OnlineForestRegressor &forest) - : Tree(forest) {} - -TreeRegressor::TreeRegressor(const TreeRegressor &tree) - : Tree(forest) {} - -TreeRegressor::TreeRegressor(const TreeRegressor &&tree) - : Tree(forest) {} - double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { ulong leaf = go_downwards(x_t, 0., true); if (!use_aggregation) { @@ -480,24 +390,20 @@ double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { // return weight / std::exp(nodes[0].weight_tree()); } -template -ulong Tree::add_node(ulong parent, ulong creation_time) { +ulong TreeRegressor::add_node(ulong parent, ulong creation_time) { nodes.emplace_back(*this, parent); return _n_nodes++; } -template -inline ulong Tree::n_features() const { +inline ulong TreeRegressor::n_features() const { return forest.n_features(); } -template -inline double Tree::step() const { +inline double TreeRegressor::step() const { return forest.step(); } -template -inline Criterion Tree::criterion() const { +inline Criterion TreeRegressor::criterion() const { return forest.criterion(); } diff --git a/tick/inference/src/online_forest_regressor.h b/tick/inference/src/online_forest_regressor.h index 77ad35649..3f454c69b 100644 --- a/tick/inference/src/online_forest_regressor.h +++ b/tick/inference/src/online_forest_regressor.h @@ -34,20 +34,16 @@ enum class Criterion { mse }; -typedef uint32_t index_t; - -template -class Tree; +class TreeRegressor; /********************************************************************************* - * Node + * NodeRegressor *********************************************************************************/ -template -class Node { +class NodeRegressor { protected: // Tree containing the node - Tree &_tree; + TreeRegressor &_tree; // Index of the left child ulong _left; // Index of the right child @@ -71,89 +67,64 @@ class Node { double _weight_tree; // true if the node is a leaf bool 
_is_leaf; + // Average of the labels in the node (regression only for now) + double _predict = 0; public: - Node(Tree &tree, ulong parent); - Node(const Node &node); - Node(const Node &&node); - Node &operator=(const Node &) = delete; - Node &operator=(const Node &&) = delete; - virtual ~Node(); + NodeRegressor(TreeRegressor &tree, ulong parent); + NodeRegressor(const NodeRegressor &node); + NodeRegressor(const NodeRegressor &&node); + NodeRegressor &operator=(const NodeRegressor &) = delete; + NodeRegressor &operator=(const NodeRegressor &&) = delete; // Update to apply to a node when going forward in the tree (towards leaves) - virtual void update_downwards(const ArrayDouble &x_t, double y_t); + void update_downwards(const ArrayDouble &x_t, double y_t); // Update of the aggregation weights - virtual void update_weight(const double y_t) final; + void update_weight(const double y_t); // Update the prediction of the label - virtual void update_predict(double y_t) = 0; + void update_predict(double y_t); // Loss function used for aggregation - virtual double loss(const double y_t) = 0; + double loss(const double y_t); - inline Tree &tree() const; - inline NodeType &node(ulong index) const; + // inline TreeRegressor &tree() const; + inline NodeRegressor &node(ulong index) const; ulong n_features() const; inline double step() const; - virtual void print(); + void print(); + + double predict() const; inline ulong parent() const; inline ulong left() const; - inline Node &set_left(ulong left); + inline NodeRegressor &set_left(ulong left); inline ulong right() const; - inline Node &set_right(ulong right); + inline NodeRegressor &set_right(ulong right); inline bool is_leaf() const; - inline Node &set_is_leaf(bool is_leaf); + inline NodeRegressor &set_is_leaf(bool is_leaf); inline ulong feature() const; - inline Node &set_feature(ulong feature); + inline NodeRegressor &set_feature(ulong feature); inline double threshold() const; - inline Node &set_threshold(double threshold); + inline NodeRegressor &set_threshold(double threshold); inline ulong n_samples() const; - inline Node &set_n_samples(ulong n_samples); + inline NodeRegressor &set_n_samples(ulong n_samples); inline double weight() const; - inline Node &set_weight(double weight); + inline NodeRegressor &set_weight(double weight); inline double weight_tree() const; - inline Node &set_weight_tree(double weight); + inline NodeRegressor &set_weight_tree(double weight); inline const ArrayDouble &x_t() const; - inline Node &set_x_t(const ArrayDouble &x_t); + inline NodeRegressor &set_x_t(const ArrayDouble &x_t); inline double y_t() const; - inline Node& set_y_t(const double y_t); -}; - -/********************************************************************************* - * NodeRegressor - *********************************************************************************/ - -class NodeRegressor : public Node { - private: - // Average of the labels in the node (regression only for now) - double _predict = 0; - // Label of the stored sample point - double _y_t; - - public: - NodeRegressor(Tree &tree, ulong parent); - NodeRegressor(const NodeRegressor &node); - NodeRegressor(const NodeRegressor &&node); - NodeRegressor &operator=(const NodeRegressor &) = delete; - NodeRegressor &operator=(const NodeRegressor &&) = delete; - virtual ~NodeRegressor(); - - inline double predict() const; - virtual void update_predict(double y_t); - virtual double loss(const double y_t); - virtual void print(); - - + inline NodeRegressor &set_y_t(const double y_t); }; class 
OnlineForestRegressor; /********************************************************************************* - * Tree + * TreeRegressor *********************************************************************************/ -template -class Tree { +class TreeRegressor { protected: // The forest of the tree OnlineForestRegressor &forest; @@ -162,22 +133,21 @@ class Tree { // Iteration counter ulong iteration = 0; // Nodes of the tree - std::vector nodes = std::vector(); + std::vector nodes = std::vector(); // Split the node at given index ulong split_leaf(ulong index, const ArrayDouble &x_t, double y_t); // Add nodes in the tree - virtual ulong add_node(ulong parent, ulong creation_time); + ulong add_node(ulong parent, ulong creation_time); ulong go_downwards(const ArrayDouble &x_t, double y_t, bool predict); void go_upwards(ulong leaf_index); public: - Tree(OnlineForestRegressor &forest); - Tree(const Tree &tree); - Tree(const Tree &&tree); - Tree &operator=(const Tree &) = delete; - Tree &operator=(const Tree &&) = delete; - ~Tree() {} + TreeRegressor(OnlineForestRegressor &forest); + TreeRegressor(const TreeRegressor &tree); + TreeRegressor(const TreeRegressor &&tree); + TreeRegressor &operator=(const TreeRegressor &) = delete; + TreeRegressor &operator=(const TreeRegressor &&) = delete; void fit(const ArrayDouble &x_t, double y_t); @@ -188,33 +158,21 @@ class Tree { inline double step() const; void print() { - for (NodeType &node : nodes) { + for (NodeRegressor &node : nodes) { node.print(); } } inline Criterion criterion() const; - NodeType &node(ulong index) { + NodeRegressor &node(ulong index) { return nodes[index]; } -}; - -/********************************************************************************* - * TreeRegressor - *********************************************************************************/ - -class TreeRegressor : public Tree { - public: - TreeRegressor(OnlineForestRegressor &forest); - TreeRegressor(const TreeRegressor &tree); - TreeRegressor(const TreeRegressor &&tree); - TreeRegressor &operator=(const TreeRegressor &) = delete; - TreeRegressor &operator=(const TreeRegressor &&) = delete; double predict(const ArrayDouble &x_t, bool use_aggregation); }; + /********************************************************************************* * OnlineForestRegressor *********************************************************************************/ @@ -259,7 +217,7 @@ class OnlineForestRegressor { } void print() { - for (Tree &tree: trees) { + for (TreeRegressor &tree: trees) { tree.print(); } } From 4582c01aaf2224484c90ec785c7eb5dab0820692 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 15 Nov 2017 18:22:18 +0100 Subject: [PATCH 03/32] ... 
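The aggregation scheme this patch converges on is an exponential-weights average over the prunings of the tree: each node keeps a weight that decays exponentially with the loss of its own predictions, and a subtree weight that mixes the node with the product of its children's subtree weights. A standalone Python sketch of the two updates on a toy node record (illustrative only, not tick's API; the dict fields, the step value and the loss argument are assumptions):

    import math

    def update_downwards(node, loss_t, step=1.0):
        # Exponential-weights update applied to every node on the path to the leaf.
        node["n_samples"] += 1
        node["weight"] *= math.exp(-step * loss_t)

    def update_upwards(node, left=None, right=None):
        # A leaf trusts itself; an inner node averages "trust this node"
        # with "trust the aggregate of both children".
        if node["is_leaf"]:
            node["weight_tree"] = node["weight"]
        else:
            node["weight_tree"] = 0.5 * (node["weight"]
                                         + left["weight_tree"] * right["weight_tree"])

The product form for inner nodes is what makes the recursion equivalent to a sum over all prunings, at the cost of weights that shrink geometrically; the next patch deals with that by moving everything to the log domain.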
--- .../inference/src/online_forest_regressor.cpp | 105 +++++++++--------- tick/inference/src/online_forest_regressor.h | 32 +++--- 2 files changed, 68 insertions(+), 69 deletions(-) diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp index e2328da9e..db2fcdfd5 100644 --- a/tick/inference/src/online_forest_regressor.cpp +++ b/tick/inference/src/online_forest_regressor.cpp @@ -9,13 +9,14 @@ NodeRegressor::NodeRegressor(TreeRegressor &tree, ulong parent) : _tree(tree) { - _n_samples = 0; - _is_leaf = true; + _parent = parent; _left = 0; _right = 0; + _n_samples = 0; + _is_leaf = true; _weight = 1; _weight_tree = 1; - this->_parent = parent; + _predict = 0; } @@ -28,7 +29,7 @@ NodeRegressor::NodeRegressor(const NodeRegressor &node) _y_t(node._y_t), _weight(node._weight), _weight_tree(node._weight_tree), _is_leaf(node._is_leaf), - _predict(node._predict) { } + _predict(node._predict) {} NodeRegressor::NodeRegressor(const NodeRegressor &&node) : _tree(_tree) { _left = node._left; @@ -44,15 +45,28 @@ NodeRegressor::NodeRegressor(const NodeRegressor &&node) : _tree(_tree) { _y_t = node._y_t; } -void NodeRegressor::update_downwards(const ArrayDouble &x_t, double y_t) { +void NodeRegressor::update_downwards(const ArrayDouble &x_t, const double y_t) { _n_samples++; - // TODO: Make compute loss virtual insteal - update_weight(y_t); + _weight *= exp(-step() * loss(y_t)); update_predict(y_t); } -void NodeRegressor::update_weight(const double y_t) { - _weight *= exp(-step() * loss(y_t)); +void NodeRegressor::update_upwards() { + if (_is_leaf) { + _weight_tree = _weight; + } else { + _weight_tree = (_weight + node(_left).weight_tree() * node(_right).weight_tree()) / 2; + } +} + +void NodeRegressor::update_predict(const double y_t) { + // When a node is updated, it necessarily contains already a sample + _predict = ((_n_samples - 1) * _predict + y_t) / _n_samples; +} + +double NodeRegressor::loss(const double y_t) { + double diff = _predict - y_t; + return diff * diff / 2; } inline NodeRegressor &NodeRegressor::node(ulong index) const { @@ -63,7 +77,6 @@ ulong NodeRegressor::n_features() const { return _tree.n_features(); } - inline double NodeRegressor::step() const { return _tree.step(); } @@ -157,7 +170,7 @@ inline double NodeRegressor::y_t() const { return _y_t; } -inline NodeRegressor & NodeRegressor::set_y_t(const double y_t) { +inline NodeRegressor &NodeRegressor::set_y_t(const double y_t) { _y_t = y_t; return *this; } @@ -166,16 +179,6 @@ inline double NodeRegressor::predict() const { return _predict; } -void NodeRegressor::update_predict(double y_t) { - // When a node is updated, it necessarily contains already a sample - _predict = ((_n_samples - 1) * _predict + y_t) / _n_samples; -} - -double NodeRegressor::loss(const double y_t) { - double diff = _predict - y_t; - return diff * diff / 2; -} - void NodeRegressor::print() { std::cout // << "Node(idx: " << _index << ", parent: " << _parent // << ", f: " << _feature @@ -205,27 +208,20 @@ void NodeRegressor::print() { *********************************************************************************/ TreeRegressor::TreeRegressor(const TreeRegressor &tree) - : nodes(tree.nodes), forest(tree.forest) { -} - -TreeRegressor::TreeRegressor(const TreeRegressor &&tree) : nodes(tree.nodes), forest(tree.forest) { -} - -//template -//ulong Node::n_features() const { -// return _tree.n_features(); -//} + : nodes(tree.nodes), forest(tree.forest) {} +TreeRegressor::TreeRegressor(const TreeRegressor &&tree) 
+ : nodes(tree.nodes), forest(tree.forest) {} TreeRegressor::TreeRegressor(OnlineForestRegressor &forest) : forest(forest) { // TODO: pre-allocate the vector to make things faster ? - add_node(0, 0); + add_node(0); } ulong TreeRegressor::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) { // std::cout << "Splitting node " << index << std::endl; - ulong left = add_node(index, iteration); - ulong right = add_node(index, iteration); + ulong left = add_node(index); + ulong right = add_node(index); node(index).set_left(left).set_right(right).set_is_leaf(false); // TODO: better feature sampling @@ -261,7 +257,8 @@ ulong TreeRegressor::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) // Update downwards of v' node(other_leaf).update_downwards(node(index).x_t(), node(index).y_t()); // Update upwards of v': it's a leaf - node(other_leaf).set_weight_tree(node(other_leaf).weight()); + node(other_leaf).update_upwards(); + // node(other_leaf).set_weight_tree(node(other_leaf).weight()); // Update downwards of v'' node(data_leaf).update_downwards(x_t, y_t); // Note: the update_up of v'' is done in the go_up method, called in fit() @@ -302,22 +299,23 @@ void TreeRegressor::go_upwards(ulong leaf_index) { while (true) { // TODO: use a node::update_upward NodeRegressor ¤t_node = node(current); - if (current_node.is_leaf()) { - current_node.set_weight_tree(current_node.weight()); - } else { - double w = current_node.weight(); - double w0 = node(current_node.left()).weight_tree(); - double w1 = node(current_node.right()).weight_tree(); - current_node.set_weight_tree((w + w0 * w1) / 2); -// double a = current_node.weight(); -// double b = weight_tree_left + weight_tree_right; -// double toto; -// if(a > b) { -// toto = a + log(1 + exp(b - a)) - log(2); -// } else { -// toto = b + log(1 + exp(a - b)) - log(2); -// } - } + current_node.update_upwards(); +// if (current_node.is_leaf()) { +// current_node.set_weight_tree(current_node.weight()); +// } else { +// double w = current_node.weight(); +// double w0 = node(current_node.left()).weight_tree(); +// double w1 = node(current_node.right()).weight_tree(); +// current_node.set_weight_tree((w + w0 * w1) / 2); +//// double a = current_node.weight(); +//// double b = weight_tree_left + weight_tree_right; +//// double toto; +//// if(a > b) { +//// toto = a + log(1 + exp(b - a)) - log(2); +//// } else { +//// toto = b + log(1 + exp(a - b)) - log(2); +//// } +// } if (current == 0) { break; } @@ -326,7 +324,6 @@ void TreeRegressor::go_upwards(ulong leaf_index) { } } - inline ulong TreeRegressor::n_nodes() const { return _n_nodes; } @@ -390,7 +387,7 @@ double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { // return weight / std::exp(nodes[0].weight_tree()); } -ulong TreeRegressor::add_node(ulong parent, ulong creation_time) { +ulong TreeRegressor::add_node(ulong parent) { nodes.emplace_back(*this, parent); return _n_nodes++; } @@ -418,7 +415,7 @@ OnlineForestRegressor::OnlineForestRegressor(uint32_t n_trees, int seed, bool verbose) : // _n_trees(n_trees), - _n_threads(n_threads), _criterion(criterion), _step(step), _verbose(verbose), trees() { + _n_threads(n_threads), _criterion(criterion), _step(step), _verbose(verbose), trees() { // No iteration so far _n_trees = 10; _iteration = 0; diff --git a/tick/inference/src/online_forest_regressor.h b/tick/inference/src/online_forest_regressor.h index 3f454c69b..aedc26ca2 100644 --- a/tick/inference/src/online_forest_regressor.h +++ b/tick/inference/src/online_forest_regressor.h @@ 
-44,12 +44,12 @@ class NodeRegressor { protected: // Tree containing the node TreeRegressor &_tree; + // Index of the parent + ulong _parent; // Index of the left child ulong _left; // Index of the right child ulong _right; - // Index of the parent - ulong _parent; // Index of the feature used for the split ulong _feature; // Threshold used for the split @@ -78,23 +78,24 @@ class NodeRegressor { NodeRegressor &operator=(const NodeRegressor &&) = delete; // Update to apply to a node when going forward in the tree (towards leaves) - void update_downwards(const ArrayDouble &x_t, double y_t); - // Update of the aggregation weights - void update_weight(const double y_t); + void update_downwards(const ArrayDouble &x_t, const double y_t); + // Update to apply to a node when going upward in the tree (towards the root) + void update_upwards(); // Update the prediction of the label - void update_predict(double y_t); + void update_predict(const double y_t); + // Predict function (average of the labels of samples that passed through the node) + double predict() const; // Loss function used for aggregation double loss(const double y_t); - - // inline TreeRegressor &tree() const; + // Get node at index in the tree inline NodeRegressor &node(ulong index) const; - ulong n_features() const; + // Get number of features + inline ulong n_features() const; + // Step to use for aggrgation inline double step() const; - + // Print of the node void print(); - double predict() const; - inline ulong parent() const; inline ulong left() const; inline NodeRegressor &set_left(ulong left); @@ -137,11 +138,13 @@ class TreeRegressor { // Split the node at given index ulong split_leaf(ulong index, const ArrayDouble &x_t, double y_t); // Add nodes in the tree - ulong add_node(ulong parent, ulong creation_time); + ulong add_node(ulong parent); ulong go_downwards(const ArrayDouble &x_t, double y_t, bool predict); void go_upwards(ulong leaf_index); + double predict(const ArrayDouble &x_t, bool use_aggregation); + public: TreeRegressor(OnlineForestRegressor &forest); TreeRegressor(const TreeRegressor &tree); @@ -169,9 +172,8 @@ class TreeRegressor { return nodes[index]; } - double predict(const ArrayDouble &x_t, bool use_aggregation); -}; +}; /********************************************************************************* * OnlineForestRegressor From 9e9f2cfdec1de0ad91e85e7f4e53221138a275b1 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 15 Nov 2017 23:18:50 +0100 Subject: [PATCH 04/32] was working before new predict --- .../inference/src/online_forest_regressor.cpp | 116 +++++++----------- tick/inference/src/online_forest_regressor.h | 22 ++-- 2 files changed, 61 insertions(+), 77 deletions(-) diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp index db2fcdfd5..bbb99cf3f 100644 --- a/tick/inference/src/online_forest_regressor.cpp +++ b/tick/inference/src/online_forest_regressor.cpp @@ -14,15 +14,14 @@ NodeRegressor::NodeRegressor(TreeRegressor &tree, ulong parent) _right = 0; _n_samples = 0; _is_leaf = true; - _weight = 1; - _weight_tree = 1; - + _weight = 0; + _weight_tree = 0; _predict = 0; } NodeRegressor::NodeRegressor(const NodeRegressor &node) : _tree(node._tree), - _left(node._left), _right(node._right), _parent(node._parent), + _parent(node._parent), _left(node._left), _right(node._right), _feature(node._feature), _threshold(node._threshold), _n_samples(node._n_samples), _x_t(node._x_t), @@ -47,7 +46,7 @@ NodeRegressor::NodeRegressor(const 
NodeRegressor &&node) : _tree(_tree) { void NodeRegressor::update_downwards(const ArrayDouble &x_t, const double y_t) { _n_samples++; - _weight *= exp(-step() * loss(y_t)); + _weight -= step() * loss(y_t); update_predict(y_t); } @@ -55,7 +54,7 @@ void NodeRegressor::update_upwards() { if (_is_leaf) { _weight_tree = _weight; } else { - _weight_tree = (_weight + node(_left).weight_tree() * node(_right).weight_tree()) / 2; + _weight_tree = log_sum_2_exp(_weight, node(_left).weight_tree() + node(_right).weight_tree()); } } @@ -180,27 +179,17 @@ inline double NodeRegressor::predict() const { } void NodeRegressor::print() { - std::cout // << "Node(idx: " << _index << ", parent: " << _parent - // << ", f: " << _feature - // << ", th: " << _threshold + std::cout << "Node(parent: " << _parent << ", left: " << _left << ", right: " << _right - // << ", d: " << _depth - // << ", n: " << n_samples() - // << ", i: " << _is_leaf + << ", n_samples: " << _n_samples + << ", is_leaf: " << _is_leaf + << ", feature: " << _feature << ", thresh: " << _threshold - << ", y_hat: " << _predict - << ", sample: "; - // << ", has_sample:" << _has_sample; - if (_is_leaf) { - std::cout << "[" << std::setprecision(2) << _x_t[0] << ", " << std::setprecision(2) << _x_t[1] - << "]"; - } else { - std::cout << "null"; - } - std::cout << ", weight: " << _weight; - std::cout << ", weight_tree: " << _weight_tree; - std::cout << ")\n"; + << ", predict: " << _predict + << ", weight: " << _weight + << ", weight_tree: " << _weight_tree + << ")\n"; } /********************************************************************************* @@ -208,10 +197,10 @@ void NodeRegressor::print() { *********************************************************************************/ TreeRegressor::TreeRegressor(const TreeRegressor &tree) - : nodes(tree.nodes), forest(tree.forest) {} + : forest(tree.forest), nodes(tree.nodes) {} TreeRegressor::TreeRegressor(const TreeRegressor &&tree) - : nodes(tree.nodes), forest(tree.forest) {} + : forest(tree.forest), nodes(tree.nodes) {} TreeRegressor::TreeRegressor(OnlineForestRegressor &forest) : forest(forest) { // TODO: pre-allocate the vector to make things faster ? 
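Both weights are now stored in the log domain, since the earlier multiplicative update drives them to zero after a few hundred samples. The log_sum_2_exp helper used above evaluates log((e^a + e^b) / 2) by factoring out the larger exponent, so the remaining exponential is at most 1 and cannot overflow. A quick standalone check in Python (illustrative, not the C++ code in this patch):

    import math

    def log_sum_2_exp(a, b):
        # Factor out max(a, b): the remaining exponential is at most 1.
        if a > b:
            return a + math.log((1.0 + math.exp(b - a)) / 2.0)
        return b + math.log((1.0 + math.exp(a - b)) / 2.0)

    # The naive form math.log((math.exp(800) + math.exp(799)) / 2) overflows,
    # while the stable form returns about 799.62.
    print(log_sum_2_exp(800.0, 799.0))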
@@ -247,22 +236,21 @@ ulong TreeRegressor::split_leaf(ulong index, const ArrayDouble &x_t, double y_t)
     other_leaf = left;
   }
   // TODO: code a move_sample
-
-  node(index).set_feature(feature).set_threshold(threshold);
-
+  NodeRegressor & current_node = node(index);
+  NodeRegressor & data_node = node(data_leaf);
+  NodeRegressor & other_node = node(other_leaf);
+  current_node.set_feature(feature).set_threshold(threshold);
   // We pass the sample to the new leaves, and initialize the _label_average with the value
-  node(data_leaf).set_x_t(x_t).set_y_t(y_t);
-  node(other_leaf).set_x_t(node(index).x_t()).set_y_t(node(index).y_t());
-
+  data_node.set_x_t(x_t).set_y_t(y_t);
+  other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t());
   // Update downwards of v'
-  node(other_leaf).update_downwards(node(index).x_t(), node(index).y_t());
+  other_node.update_downwards(current_node.x_t(), current_node.y_t());
   // Update upwards of v': it's a leaf
-  node(other_leaf).update_upwards();
+  other_node.update_upwards();
   // node(other_leaf).set_weight_tree(node(other_leaf).weight());
   // Update downwards of v''
-  node(data_leaf).update_downwards(x_t, y_t);
+  data_node.update_downwards(x_t, y_t);
   // Note: the update_up of v'' is done in the go_up method, called in fit()
-
   return data_leaf;
 }
 
@@ -293,29 +281,10 @@ ulong TreeRegressor::go_downwards(const ArrayDouble &x_t, double y_t, bool predi
 }
 
 void TreeRegressor::go_upwards(ulong leaf_index) {
-
   ulong current = leaf_index;
-
   while (true) {
-    // TODO: use a node::update_upward
     NodeRegressor &current_node = node(current);
     current_node.update_upwards();
-//    if (current_node.is_leaf()) {
-//      current_node.set_weight_tree(current_node.weight());
-//    } else {
-//      double w = current_node.weight();
-//      double w0 = node(current_node.left()).weight_tree();
-//      double w1 = node(current_node.right()).weight_tree();
-//      current_node.set_weight_tree((w + w0 * w1) / 2);
-////      double a = current_node.weight();
-////      double b = weight_tree_left + weight_tree_right;
-////      double toto;
-////      if(a > b) {
-////        toto = a + log(1 + exp(b - a)) - log(2);
-////      } else {
-////        toto = b + log(1 + exp(a - b)) - log(2);
-////      }
-//    }
     if (current == 0) {
       break;
     }
@@ -335,10 +304,8 @@ void TreeRegressor::fit(const ArrayDouble &x_t, double y_t) {
     iteration++;
     return;
   }
-
   ulong leaf = go_downwards(x_t, y_t, false);
   ulong new_leaf = split_leaf(leaf, x_t, y_t);
-
 //  for(ulong j=0; j < n_features(); ++j) {
 //    double delta = std::abs(x_t[j] - node(leaf).sample().first[j]);
 //    if (delta > 0.) {
@@ -357,33 +324,44 @@ double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) {
   }
   ulong current = leaf;
   // The child of the current node that does not contain the data
-  ulong other;
+  ulong other_index;
   ulong parent;
-  double weight;
+  ulong data_index;
+  double pred;
   while (true) {
     NodeRegressor &current_node = node(current);
     if (current_node.is_leaf()) {
-      weight = current_node.weight() * current_node.predict();
+      pred = current_node.predict();
+      // weight = current_node.weight() * current_node.predict();
       // weight = std::exp(current_node.weight()) * current_node.labels_average();
     } else {
+      pred = 0.5 * std::exp(current_node.weight() - current_node.weight_tree()) * current_node.predict();
+      parent = current_node.parent();
+      if (node(parent).left() == current) {
+        // other_index = node(parent).right();
+        data_index = node(parent).left();
+
+      } else {
+        // other_index = node(parent).left();
+        data_index = node(parent).right();
+      }
+      NodeRegressor& data_node = node(data_index);
+
+      pred += (1 - 0.5 * std::exp(current_node.weight() - current_node.weight_tree())) * data_node.predict();
+
+//      weight = 0.5 * current_node.weight() * current_node.predict()
 //          + 0.5 * node(other).weight_tree() * weight;
       // weight = 0.5 * std::exp(current_node.weight()) * current_node.labels_average()
       //     + 0.5 * std::exp(node(other).weight_tree() + weight);
     }
-    parent = node(current).parent();
-    if (node(parent).left() == current) {
-      other = node(parent).right();
-    } else {
-      other = node(parent).left();
-    }
     // Root must be updated as well
     if (current == 0) {
       break;
     }
     current = parent;
   }
-  return weight / nodes[0].weight_tree();
+  // return weight / nodes[0].weight_tree();
+  return pred;
   // return weight / std::exp(nodes[0].weight_tree());
 }
 
diff --git a/tick/inference/src/online_forest_regressor.h b/tick/inference/src/online_forest_regressor.h
index aedc26ca2..a368ca80a 100644
--- a/tick/inference/src/online_forest_regressor.h
+++ b/tick/inference/src/online_forest_regressor.h
@@ -34,6 +34,17 @@ enum class Criterion {
   mse
 };
 
+
+// Computation of log( (e^a + e^b) / 2) in an overflow-proof way
+inline double log_sum_2_exp(const double a, const double b) {
+  // TODO if |a - b| > 50 skip
+  if (a > b) {
+    return a + std::log((1 + std::exp(b - a)) / 2);
+  } else {
+    return b + std::log((1 + std::exp(a - b)) / 2);
+  }
+}
+
 class TreeRegressor;
 
 /*********************************************************************************
@@ -61,9 +72,9 @@ class NodeRegressor {
   ArrayDouble _x_t;
   // The label of the sample saved in the node
   double _y_t;
-  // Aggregation weight for the node
+  // Logarithm of the aggregation weight for the node
   double _weight;
-  // Aggregation weight for the sub-tree starting at this node
+  // Logarithm of the aggregation weight for the sub-tree starting at this node
   double _weight_tree;
   // true if the node is a leaf
   bool _is_leaf;
@@ -143,8 +154,6 @@ class TreeRegressor {
   ulong go_downwards(const ArrayDouble &x_t, double y_t, bool predict);
   void go_upwards(ulong leaf_index);
 
-  double predict(const ArrayDouble &x_t, bool use_aggregation);
-
  public:
   TreeRegressor(OnlineForestRegressor &forest);
   TreeRegressor(const TreeRegressor &tree);
@@ -153,8 +162,7 @@ class TreeRegressor {
   TreeRegressor &operator=(const TreeRegressor &) = delete;
   TreeRegressor &operator=(const TreeRegressor &&) = delete;
 
   void fit(const ArrayDouble &x_t, double y_t);
-
-  // double predict(const ArrayDouble& x_t);
+  double predict(const ArrayDouble &x_t, bool
use_aggregation); inline ulong n_features() const; inline ulong n_nodes() const; @@ -171,8 +179,6 @@ class TreeRegressor { NodeRegressor &node(ulong index) { return nodes[index]; } - - }; /********************************************************************************* From fb481c7c13f07f65728c399072c6af4af33f3071 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Thu, 16 Nov 2017 08:38:50 +0100 Subject: [PATCH 05/32] Overflow proof. --- online_forest.py | 8 +++++++- tick/inference/src/online_forest_regressor.cpp | 10 +++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/online_forest.py b/online_forest.py index 794b97e02..08dda1ee3 100644 --- a/online_forest.py +++ b/online_forest.py @@ -9,7 +9,7 @@ from time import time -n_samples = 2000 +n_samples = 1000 n_features = 2 seed = 123 @@ -162,6 +162,12 @@ def plot_decisions(clfs, datasets, names, use_aggregation=None): plot_decisions(clfs, datasets, names) plt.show() +# forest = OnlineForestRegressor(n_trees=n_trees, seed=123, step=0.25) +# +# forest.fit(X, y) +# +# forest.predict(X) + # plt.savefig(os.path.join(path, 'decisions.pdf')) diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp index bbb99cf3f..0f2ac77a1 100644 --- a/tick/inference/src/online_forest_regressor.cpp +++ b/tick/inference/src/online_forest_regressor.cpp @@ -318,7 +318,9 @@ void TreeRegressor::fit(const ArrayDouble &x_t, double y_t) { } double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { + // std::cout << "Going downwards" << std::endl; ulong leaf = go_downwards(x_t, 0., true); + // std::cout << "Done." << std::endl; if (!use_aggregation) { return node(leaf).y_t(); } @@ -329,13 +331,13 @@ double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { ulong data_index; double pred; while (true) { + // std::cout << "node: " << current << std::endl; NodeRegressor ¤t_node = node(current); if (current_node.is_leaf()) { pred = current_node.predict(); // weight = current_node.weight() * current_node.predict(); // weight = std::exp(current_node.weight()) * current_node.labels_average(); } else { - pred = 0.5 * std::exp(current_node.weight() - current_node.weight_tree()) * current_node.predict(); parent = current_node.parent(); if (node(parent).left() == current) { // other_index = node(parent).right(); @@ -347,18 +349,20 @@ double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { } NodeRegressor& data_node = node(data_index); - pred += (1 - 0.5 * std::exp(current_node.weight() - current_node.weight_tree())) * data_node.predict(); + double a = 0.5 * std::exp(current_node.weight() - current_node.weight_tree()) * current_node.predict(); + pred = a + (1 - 0.5 * std::exp(current_node.weight() - current_node.weight_tree())) * pred; // weight = 0.5 * current_node.weight() * current_node.predict() // + 0.5 * node(other).weight_tree() * weight; // weight = 0.5 * std::exp(current_node.weight()) * current_node.labels_average() // + 0.5 * std::exp(node(other).weight_tree() + weight); } + // std::cout << "pred: " << pred << std::endl; // Root must be updated as well if (current == 0) { break; } - current = parent; + current = current_node.parent(); } // return weight / nodes[0].weight_tree(); return pred; From f13ccb80f410a0eaa5417db9f05c7e002a0e31c4 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Thu, 16 Nov 2017 10:44:08 +0100 Subject: [PATCH 06/32] Starting classification --- online_forest.py | 4 +- setup.py | 6 +- tick/inference/__init__.py | 
2 +
 tick/inference/online_forest_classifier.py    | 174 ++++++
 tick/inference/online_forest_regressor.py     |   4 +-
 tick/inference/src/CMakeLists.txt             |   3 +-
 .../inference/src/online_forest_regressor.cpp | 113 ++----------
 tick/inference/src/online_forest_regressor.h  |  59 +++---
 tick/inference/swig/inference_module.i        |   1 +
 tick/inference/swig/online_forest_regressor.i |   9 +-
 10 files changed, 240 insertions(+), 135 deletions(-)
 create mode 100644 tick/inference/online_forest_classifier.py

diff --git a/online_forest.py b/online_forest.py
index 08dda1ee3..085aedc7b 100644
--- a/online_forest.py
+++ b/online_forest.py
@@ -1,7 +1,7 @@
 from tick.simulation import SimuLinReg, weights_sparse_gauss
 from sklearn.model_selection import train_test_split
 import numpy as np
-from tick.inference import OnlineForestRegressor
+from tick.inference import OnlineForestRegressor, OnlineForestClassifier
 from matplotlib.colors import ListedColormap
 
 from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
@@ -148,7 +148,7 @@ def plot_decisions(clfs, datasets, names, use_aggregation=None):
 ]
 
 clfs = [
-    OnlineForestRegressor(n_trees=n_trees, seed=123, step=0.25),
+    OnlineForestClassifier(n_trees=n_trees, seed=123, step=0.25),
     ExtraTreesRegressor(n_estimators=n_trees),
     RandomForestRegressor(n_estimators=n_trees)
 ]
diff --git a/setup.py b/setup.py
index 8538cd196..f61e97157 100644
--- a/setup.py
+++ b/setup.py
@@ -656,11 +656,13 @@ def add_dir_name(dir_name, filenames):
     "cpp_files": ["hawkes_conditional_law.cpp", "hawkes_em.cpp",
                   "hawkes_adm4.cpp", "hawkes_basis_kernels.cpp",
                   "hawkes_sumgaussians.cpp",
-                  "online_forest_regressor.cpp"],
+                  "online_forest_regressor.cpp",
+                  "online_forest_classifier.cpp"],
     "h_files": ["hawkes_conditional_law.h", "hawkes_em.h",
                 "hawkes_adm4.h", "hawkes_basis_kernels.h",
                 "hawkes_sumgaussians.h",
-                "online_forest_regressor.h"],
+                "online_forest_regressor.h",
+                "online_forest_classifier.h"],
     "swig_files": ["inference_module.i"],
     "module_dir": "./tick/inference/",
     "extension_name": "inference",
diff --git a/tick/inference/__init__.py b/tick/inference/__init__.py
index b1c980256..5e3f06e96 100644
--- a/tick/inference/__init__.py
+++ b/tick/inference/__init__.py
@@ -18,6 +18,7 @@ from .robust import std_iqr, std_mad
 
 from .online_forest_regressor import OnlineForestRegressor
+from .online_forest_classifier import OnlineForestClassifier
 
 __all__ = [
     "LinearRegression",
@@ -33,6 +34,7 @@
     "HawkesBasisKernels",
     "HawkesSumGaussians",
     "OnlineForestRegressor",
+    "OnlineForestClassifier",
     "kaplan_meier",
     "nelson_aalen"
 ]
diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py
new file mode 100644
index 000000000..16f32b12e
--- /dev/null
+++ b/tick/inference/online_forest_classifier.py
@@ -0,0 +1,174 @@
+# License: BSD 3 clause
+
+from abc import ABC
+
+from tick.base import Base
+from tick.base import actual_kwargs
+
+from .build.inference import OnlineForestClassifier as _OnlineForestClassifier
+from tick.preprocessing.utils import safe_array
+
+from .build.inference import CriterionClassifier_log as log
+
+
+class OnlineForestClassifier(ABC, Base):
+    """Truly online random forest for classification (discrete labels).
+
+    TODO: update docstrings
+
+    Parameters
+    ----------
+    n_trees : `int`, default=10
+        Number of trees to grow in the forest. Cannot be changed after the first
+        call to ``fit``.
+
+    criterion : {'log'}, default='log'
+        The criterion used to select a split. Supported criteria are:
+        * 'log': the log-loss of the regularized class frequencies is used
+          to update the aggregation weights
+        This cannot be changed after the first call to ``fit``
+
+    max_depth : `int`, default=-1
+        The maximum depth of a tree. If <= 0, nodes are split with no limit
+        on the depth of the tree
+
+    min_samples_split : `int`, default=50
+        A node waits to contain `min_samples_split` before splitting.
+
+    n_threads : `int`, default=1
+        The number of threads used to grow trees in parallel during training.
+        If n_threads < 0, then all available cores will be used.
+
+    seed : `int`, default=-1
+        If seed >= 0, this is used to seed the random number generators of the
+        forest.
+
+    verbose : `bool`, default=True
+        If True, verbose output is printed during training
+
+    warm_start : `bool`, default=True
+        If True, then successive calls to ``fit`` will continue to grow existing
+        trees. Otherwise, we start from empty trees
+
+    n_splits : `int`, default=10
+        Number of potential splits to consider for a feature.
+
+    Attributes
+    ----------
+    n_samples : `int`
+        Number of samples seen during training
+
+    n_features : int
+        The number of features from the training dataset (passed to ``fit``)
+    """
+
+    _attrinfos = {
+        '_actual_kwargs': {'writable': False},
+        '_fitted': {'writable': False},
+        '_forest': {'writable': False},
+        '_criterion': {'writable': False, 'cpp_setter': 'set_criterion'},
+        'n_trees': {'writable': True, 'cpp_setter': 'set_n_trees'},
+        'max_depth': {'writable': True, 'cpp_setter': 'set_max_depth'},
+        'min_samples_split': {'writable': True,
+                              'cpp_setter': 'set_min_samples_split'},
+        'n_threads': {'writable': True, 'cpp_setter': 'set_n_threads'},
+        'seed': {'writable': True, 'cpp_setter': 'set_seed'},
+        'verbose': {'writable': True, 'cpp_setter': 'set_verbose'},
+        'warm_start': {'writable': True, 'cpp_setter': 'set_warm_start'},
+        'n_splits': {'writable': True, 'cpp_setter': 'set_n_splits'},
+    }
+
+    _cpp_obj_name = "_forest"
+
+    @actual_kwargs
+    def __init__(self, n_trees: int = 10, step: float = 1.,
+                 criterion: str = 'log',
+                 max_depth: int = -1, min_samples_split: int = 50,
+                 n_threads: int = 1, seed: int = -1, verbose: bool = True,
+                 warm_start: bool = True, n_splits: int = 10):
+        Base.__init__(self)
+        if not hasattr(self, "_actual_kwargs"):
+            self._actual_kwargs = {}
+        self._fitted = False
+        self.n_trees = n_trees
+        self.step = step
+        self.criterion = criterion
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.n_threads = n_threads
+        self.seed = seed
+        self.verbose = verbose
+        self.warm_start = warm_start
+        self.n_splits = n_splits
+        self._forest = _OnlineForestClassifier(n_trees,
+                                               step,
+                                               self._criterion,
+                                               #max_depth,
+                                               # min_samples_split,
+                                               n_threads,
+                                               seed,
+                                               verbose)
+                                               #warm_start, n_splits)
+
+    def set_data(self, X, y):
+        X = safe_array(X)
+        y = safe_array(y)
+        self._forest.set_data(X, y)
+
+    def fit(self, X, y):
+        X = safe_array(X)
+        y = safe_array(y)
+        self._set("_fitted", True)
+        self._forest.fit(X, y)
+        return self
+
+    def apply(self, X):
+        """Make the samples from X follow the trees from the forest, and return
+        the indices of the leaves
+        """
+        raise NotImplementedError()
+
+    def predict(self, X, use_aggregation: bool=True):
+        """Predict class for given samples
+
+        Parameters
+        ----------
+        X : `np.ndarray` or `scipy.sparse.csr_matrix`, shape=(n_samples, n_features)
+            Features matrix to predict for.
+
+        Returns
+        -------
+        output : `np.array`, shape=(n_samples,)
+            Returns predicted values.
+        """
+        import numpy as np
+        y_pred = np.empty(X.shape[0])
+        if not self._fitted:
+            raise ValueError("You must call ``fit`` before")
+        else:
+            X = safe_array(X)
+            self._forest.predict(X, y_pred, True)
+        return y_pred
+
+    def score(self, X, y):
+        # TODO: scoring is not implemented yet
+        raise NotImplementedError()
+
+    def print(self):
+        self._forest._print()
+
+    # TODO: property for splits
+
+    @property
+    def criterion(self):
+        if self._criterion == log:
+            return 'log'
+
+    @criterion.setter
+    def criterion(self, value):
+        if value == 'log':
+            self._set('_criterion', log)
+            # self._forest.set_criterion(unif)
+        else:
+            raise ValueError("``criterion`` must be 'log'.")
diff --git a/tick/inference/online_forest_regressor.py b/tick/inference/online_forest_regressor.py
index b8a387996..1a5b28ba2 100644
--- a/tick/inference/online_forest_regressor.py
+++ b/tick/inference/online_forest_regressor.py
@@ -8,8 +8,8 @@
 from .build.inference import OnlineForestRegressor as _OnlineForestRegressor
 from tick.preprocessing.utils import safe_array
 
-from .build.inference import Criterion_unif as unif
-from .build.inference import Criterion_mse as mse
+from .build.inference import CriterionRegressor_unif as unif
+from .build.inference import CriterionRegressor_mse as mse
 
 
 class OnlineForestRegressor(ABC, Base):
diff --git a/tick/inference/src/CMakeLists.txt b/tick/inference/src/CMakeLists.txt
index 706bc50bd..81d9f38d9 100644
--- a/tick/inference/src/CMakeLists.txt
+++ b/tick/inference/src/CMakeLists.txt
@@ -4,7 +4,8 @@ add_library(tick_inference EXCLUDE_FROM_ALL
         hawkes_adm4.h hawkes_adm4.cpp
         hawkes_basis_kernels.cpp hawkes_basis_kernels.h
         hawkes_sumgaussians.h hawkes_sumgaussians.cpp
-        online_forest_regressor.h online_forest_regressor.cpp)
+        online_forest_regressor.h online_forest_regressor.cpp
+        online_forest_classifier.h online_forest_classifier.cpp)
 
 target_link_libraries(tick_inference
 
diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp
index 0f2ac77a1..3098385d9 100644
--- a/tick/inference/src/online_forest_regressor.cpp
+++ b/tick/inference/src/online_forest_regressor.cpp
@@ -242,7 +242,10 @@ ulong TreeRegressor::split_leaf(ulong index, const ArrayDouble &x_t, double y_t)
   current_node.set_feature(feature).set_threshold(threshold);
   // We pass the sample to the new leaves, and initialize the _label_average with the value
   data_node.set_x_t(x_t).set_y_t(y_t);
+
+  // other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t());
   other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t());
+
   // Update downwards of v'
   other_node.update_downwards(current_node.x_t(), current_node.y_t());
   // Update upwards of v': it's a leaf
@@ -326,31 +329,28 @@ double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) {
   }
   ulong current = leaf;
   // The child of the current node that does not contain the data
-  ulong other_index;
-  ulong parent;
-  ulong data_index;
+//  ulong other_index;
+//  ulong parent;
+//  ulong data_index;
   double pred;
   while (true) {
     // std::cout << "node: " << current << std::endl;
     NodeRegressor &current_node = node(current);
     if (current_node.is_leaf()) {
       pred = current_node.predict();
-      // weight = current_node.weight() * current_node.predict();
-      // weight = std::exp(current_node.weight()) * current_node.labels_average();
     } else {
-      parent = current_node.parent();
-      if (node(parent).left() == current) {
-        // other_index = node(parent).right();
-
data_index = node(parent).left(); - - } else { - // other_index = node(parent).left(); - data_index = node(parent).right(); - } - NodeRegressor& data_node = node(data_index); - - double a = 0.5 * std::exp(current_node.weight() - current_node.weight_tree()) * current_node.predict(); - pred = a + (1 - 0.5 * std::exp(current_node.weight() - current_node.weight_tree())) * pred; +// parent = current_node.parent(); +// if (node(parent).left() == current) { +// // other_index = node(parent).right(); +// data_index = node(parent).left(); +// +// } else { +// // other_index = node(parent).left(); +// data_index = node(parent).right(); +// } +// NodeRegressor& data_node = node(data_index); + double w = std::exp(current_node.weight() - current_node.weight_tree()); + pred = 0.5 * w * current_node.predict() + (1 - 0.5 * w) * pred; // weight = 0.5 * current_node.weight() * current_node.predict() // + 0.5 * node(other).weight_tree() * weight; @@ -382,7 +382,7 @@ inline double TreeRegressor::step() const { return forest.step(); } -inline Criterion TreeRegressor::criterion() const { +inline CriterionRegressor TreeRegressor::criterion() const { return forest.criterion(); } @@ -392,7 +392,7 @@ inline Criterion TreeRegressor::criterion() const { OnlineForestRegressor::OnlineForestRegressor(uint32_t n_trees, double step, - Criterion criterion, + CriterionRegressor criterion, int32_t n_threads, int seed, bool verbose) @@ -457,79 +457,6 @@ inline double OnlineForestRegressor::sample_threshold(double left, double right) return rand.uniform(left, right); } -//inline double OnlineForestRegressor::step() const { -// return _step; -//} -// -//void OnlineForestRegressor::print() { -// for (Tree &tree: trees) { -// tree.print(); -// } -//} -// -//inline ulong OnlineForestRegressor::n_samples() const { -// if (_iteration > 0) { -// return _iteration; -// } else { -// TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.") -// } -//} - -//inline ulong OnlineForestRegressor::n_features() const { -// if (_iteration > 0) { -// return _n_features; -// } else { -// TICK_ERROR("You must call ``fit`` before asking for ``n_features``.") -// } -//} - -//inline OnlineForestRegressor &OnlineForestRegressor::set_n_features(ulong n_features) { -// if (_iteration == 0) { -// _n_features = n_features; -// } else { -// TICK_ERROR("OnlineForest::set_n_features can be called only once !") -// } -// return *this; -//} - -//inline uint32_t OnlineForestRegressor::n_trees() const { -// return _n_trees; -//} - - -//inline OnlineForestRegressor &OnlineForestRegressor::set_n_trees(uint32_t n_trees) { -// _n_trees = n_trees; -// return *this; -//} - -//inline int32_t OnlineForestRegressor::n_threads() const { -// return _n_threads; -//} - -//OnlineForestRegressor &OnlineForestRegressor::set_n_threads(int32_t n_threads) { -// _n_threads = n_threads; -// return *this; -//} - -//inline Criterion OnlineForestRegressor::criterion() const { -// return _criterion; -//} - -//inline OnlineForestRegressor &OnlineForestRegressor::set_criterion(Criterion criterion) { -// _criterion = criterion; -// return *this; -//} -// -//inline int OnlineForestRegressor::seed() const { -// return _seed; -//} - -//inline OnlineForestRegressor &OnlineForestRegressor::set_seed(int seed) { -// _seed = seed; -// rand.reseed(seed); -// return *this; -//} - //inline bool OnlineForestRegressor::verbose() const { // return _verbose; //} diff --git a/tick/inference/src/online_forest_regressor.h b/tick/inference/src/online_forest_regressor.h index a368ca80a..18a9c8326 
100644
--- a/tick/inference/src/online_forest_regressor.h
+++ b/tick/inference/src/online_forest_regressor.h
@@ -12,38 +12,23 @@
 // TODO: be very careful with binary features: if the range is 0 on every coordinate, do nothing
 // TODO: code a classifier
-// TODO: pick the feature proportionally to the ratio of side length / perimeter. This assumes that we record
-// the true dimensions of the cell, and the threshold is then also drawn within it
 // TODO: pick the feature proportionally to the ratio of the feature ranges, but beware of discrete
 // features
 // TODO: an option to create an empty cell, that is forget the data in a cell once it has been split
 // TODO: feature choice based on the labels
-// TODO: a fit_online that takes a mini-batch and updates the forest; in that case we only put one point per
-// cell, so no need to record the sample indices or the points. This assumes min_sample_split == 1
-
-// TODO: for regression, we use the average of the y values
 // TODO: for classification, do not use the raw frequencies but regularized ones, with a Dirichlet prior p_c = (n_c + 0.5) / (\sum n_c + C / 2). Make it an option
 
 // TODO: check that not using reserve in the forest works as well...
 
-enum class Criterion {
+enum class CriterionRegressor {
   unif = 0,
   mse
 };
 
-
 class TreeRegressor;
 
@@ -88,6 +73,15 @@ class NodeRegressor {
   NodeRegressor &operator=(const NodeRegressor &) = delete;
   NodeRegressor &operator=(const NodeRegressor &&) = delete;
 
+  // Computation of log( (e^a + e^b) / 2) in an overflow-proof way
+  inline static double log_sum_2_exp(const double a, const double b) {
+    // TODO if |a - b| > 50 skip
+    if (a > b) {
+      return a + std::log((1 + std::exp(b - a)) / 2);
+    } else {
+      return b + std::log((1 + std::exp(a - b)) / 2);
+    }
+  }
   // Update to apply to a node when going forward in the tree (towards leaves)
   void update_downwards(const ArrayDouble &x_t, const double y_t);
@@ -174,7 +168,7 @@ class TreeRegressor {
     }
   }
 
-  inline Criterion criterion() const;
+  inline CriterionRegressor criterion() const;
 
   NodeRegressor &node(ulong index) {
     return nodes[index];
@@ -191,8 +185,8 @@ class OnlineForestRegressor {
   uint32_t _n_trees;
   // Number of threads to use for parallel growing of trees
   int32_t _n_threads;
-  // Criterion used for splitting (not used for now)
-  Criterion _criterion;
+  // CriterionRegressor used for splitting (not used for now)
+  CriterionRegressor _criterion;
   // Step-size used for aggregation
   double _step;
   // Number of features.
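With log weights in place, a tree prediction walks back from the leaf to the root and mixes each node's own prediction into the running estimate with a convex weight derived from weight and weight_tree, as in the rewritten predict method above. A standalone sketch of that recursion (illustrative Python, not the library's API; path entries are assumed ordered from leaf to root):

    import math

    def aggregated_predict(path):
        # path: (log_weight, log_weight_tree, node_prediction) tuples, leaf first.
        pred = path[0][2]
        for log_w, log_w_tree, node_pred in path[1:]:
            # 0.5 * exp(log_w - log_w_tree) lies in (0, 1] because
            # log_weight_tree >= log_weight - log(2) by construction.
            w = 0.5 * math.exp(log_w - log_w_tree)
            pred = w * node_pred + (1.0 - w) * pred
        return pred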
@@ -211,7 +205,8 @@ class OnlineForestRegressor { void create_trees(); public: - OnlineForestRegressor(uint32_t n_trees, double step, Criterion criterion, int32_t n_threads, int seed, bool verbose); + OnlineForestRegressor(uint32_t n_trees, double step, CriterionRegressor criterion, int32_t n_threads, + int seed, bool verbose); virtual ~OnlineForestRegressor(); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); @@ -268,20 +263,10 @@ class OnlineForestRegressor { return _n_threads; } - inline OnlineForestRegressor &set_n_threads(int32_t n_threads) { - _n_threads = n_threads; - return *this; - } - - inline Criterion criterion() const { + inline CriterionRegressor criterion() const { return _criterion; } - inline OnlineForestRegressor &set_criterion(Criterion criterion) { - _criterion = criterion; - return *this; - } - inline int seed() const { return _seed; } @@ -291,6 +276,18 @@ class OnlineForestRegressor { rand.reseed(seed); return *this; } + + OnlineForestRegressor &set_n_threads(int32_t n_threads) { + _n_threads = n_threads; + return *this; + } + + inline OnlineForestRegressor &set_criterion(CriterionRegressor criterion) { + _criterion = criterion; + return *this; + } + + // inline bool verbose() const; // inline OnlineForestRegressor &set_verbose(bool verbose); }; diff --git a/tick/inference/swig/inference_module.i b/tick/inference/swig/inference_module.i index dca630e91..281c4d842 100644 --- a/tick/inference/swig/inference_module.i +++ b/tick/inference/swig/inference_module.i @@ -25,3 +25,4 @@ %include hawkes_sumgaussians.i %include online_forest_regressor.i +%include online_forest_classifier.i diff --git a/tick/inference/swig/online_forest_regressor.i b/tick/inference/swig/online_forest_regressor.i index 4fbe4c36d..f79b65de8 100644 --- a/tick/inference/swig/online_forest_regressor.i +++ b/tick/inference/swig/online_forest_regressor.i @@ -8,14 +8,15 @@ %} -enum class Criterion { +enum class CriterionRegressor { unif = 0, mse }; class OnlineForestRegressor { public: - OnlineForestRegressor(uint32_t n_trees, double step, Criterion criterion, int32_t n_threads, int seed, bool verbose); + OnlineForestRegressor(uint32_t n_trees, double step, CriterionRegressor criterion, + int32_t n_threads, int seed, bool verbose); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); void predict(const SArrayDouble2dPtr features, SArrayDoublePtr predictions, bool use_aggregation); @@ -32,8 +33,8 @@ class OnlineForestRegressor { int32_t n_threads() const; OnlineForestRegressor &set_n_threads(int32_t n_threads); - Criterion criterion() const; - OnlineForestRegressor &set_criterion(Criterion criterion); + CriterionRegressor criterion() const; + OnlineForestRegressor &set_criterion(CriterionRegressor criterion); int seed() const; OnlineForestRegressor &set_seed(int seed); // bool verbose() const; From 511811ef93b3da6d315b75c95c8a284d25e7abb5 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Fri, 17 Nov 2017 00:03:23 +0100 Subject: [PATCH 07/32] ... 
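This patch renames predict to predict_proba, adds the n_classes parameter, and exercises the classifier on the usual scikit-learn toy datasets. A condensed usage sketch of what the updated online_forest.py below does (assumes the compiled extension is available; the float cast reflects that labels are passed to the C++ side as doubles):

    import numpy as np
    from sklearn.datasets import make_moons
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from tick.inference import OnlineForestClassifier

    X, y = make_moons(n_samples=500, noise=0.3, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4,
                                                        random_state=42)
    clf = OnlineForestClassifier(n_trees=50, n_classes=2, seed=123, step=1.)
    clf.fit(X_train, y_train.astype('float64'))
    print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))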
--- online_forest.py | 205 +++++++++++++----- tick/inference/online_forest_classifier.py | 19 +- .../inference/src/online_forest_regressor.cpp | 6 +- video.py | 56 +++++ 4 files changed, 220 insertions(+), 66 deletions(-) create mode 100644 video.py diff --git a/online_forest.py b/online_forest.py index 085aedc7b..b8338125a 100644 --- a/online_forest.py +++ b/online_forest.py @@ -1,15 +1,18 @@ -from tick.simulation import SimuLinReg, weights_sparse_gauss +from tick.simulation import SimuLinReg, SimuLogReg, weights_sparse_gauss from sklearn.model_selection import train_test_split import numpy as np from tick.inference import OnlineForestRegressor, OnlineForestClassifier from matplotlib.colors import ListedColormap -from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor +from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, \ + RandomForestClassifier, ExtraTreesClassifier +from sklearn.datasets import make_moons, make_classification, make_circles +from sklearn.metrics import roc_auc_score import matplotlib.pyplot as plt from time import time -n_samples = 1000 +n_samples = 500 n_features = 2 seed = 123 @@ -17,12 +20,14 @@ w0 = weights_sparse_gauss(n_features, nnz=2) -X, y = SimuLinReg(w0, -1., n_samples=n_samples, seed=seed).simulate() +X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() +y = (y + 1) / 2 + # X_train, X_test, y_train, y_test = train_test_split(X, y) -def plot_decisions(clfs, datasets, names, use_aggregation=None): +def plot_decisions_regression(clfs, datasets, names, use_aggregation=None): i = 1 h = .02 fig = plt.figure(figsize=(4 * (len(clfs) + 1), 4 * len(datasets))) @@ -64,7 +69,7 @@ def plot_decisions(clfs, datasets, names, use_aggregation=None): clf.fit(X_train, y_train) t2 = time() - mse = np.linalg.norm(y_test - clf.predict(X_test)) + # mse = np.linalg.norm(y_test - clf.predict(X_test)) # score = clf.score(X_test, y_test) Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T) @@ -86,54 +91,115 @@ def plot_decisions(clfs, datasets, names, use_aggregation=None): if ds_cnt == 0: ax.set_title(name) - ax.text(xx.max() - .3, yy.min() + .3, ('%.2f (%.2f)' % (mse, t2-t1)).lstrip('0'), - size=15, horizontalalignment='right') + # ax.text(xx.max() - .3, yy.min() + .3, ('%.2f (%.2f)' % (mse, t2-t1)).lstrip('0'), + # size=15, horizontalalignment='right') i += 1 plt.tight_layout() # plt.show() -# def plot_decision_regions(clfs, X_test, y_test, n_iter=None, use_aggregation=None, -# title=None): -# from matplotlib.colors import ListedColormap -# -# cm = plt.cm.RdBu -# cmap = ListedColormap(['red', 'white', 'blue']) -# fig = plt.figure(figsize=(8, 5)) -# -# ax = plt.subplot(1, 1, 1) -# # plot the decision surface -# x1_min, x1_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1 -# x2_min, x2_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1 -# -# xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, 0.02), -# np.arange(x2_min, x2_max, 0.02)) -# -# plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=10, cmap=cm) -# -# if use_aggregation is None: -# Z = clf.predict(np.array([xx1.ravel(), xx2.ravel()]).T) -# else: -# Z = clf.predict(np.array([xx1.ravel(), xx2.ravel()]).T, use_aggregation) -# Z = Z.reshape(xx1.shape) -# ct = plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cm) -# plt.colorbar(ct) -# plt.xlim(xx1.min(), xx1.max()) -# plt.ylim(xx2.min(), xx2.max()) -# -# plt.xlabel('x1', fontsize=16) -# plt.ylabel('x2', fontsize=16) -# if title is not None: -# plt.title(title) -# plt.legend(loc='upper left') -# plt.tight_layout() +def 
plot_decision_classification(classifiers, datasets, names): + h = .02 + fig = plt.figure(figsize=(2 * (len(classifiers) + 1), 2 * len(datasets))) + i = 1 + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + # preprocess dataset, split into training and test part + X, y = ds + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + # just plot the dataset first + cm = plt.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + # Plot the training points -# clf = OnlineForestRegressor(n_trees=1, seed=123) -# print(clf.predict(X)) + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=10, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=10, + alpha=0.6) + + # ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, + # edgecolors='k') + # # and testing points + # ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, + # alpha=0.6, + # edgecolors='k') + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + + # iterate over classifiers + for name, clf in zip(names, classifiers): + ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + + if hasattr(clf, 'clear'): + clf.clear() + clf.fit(X_train, y_train) + + Z = clf.predict_proba(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] + + score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + # score = clf.score(X_test, y_test) + + # Plot the decision boundary. For that, we will assign a color to + # each + # point in the mesh [x_min, x_max]x[y_min, y_max]. 
+ # if hasattr(clf, "decision_function"): + # Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) + # else: + # Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] + + # Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] + + + # Z = Z[:, 1] + # print(Z) + # print(Z.shape) + # print(xx.shape, xx.shape[0] * xx.shape[1]) + + # Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])[:, 1] + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + + # Plot also the training points + # ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm, s=15) + # # and testing points + # ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, + # s=15, alpha=0.6) + + # ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, + # edgecolors='k') + # # and testing points + # ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, + # edgecolors='k', alpha=0.6) + + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), + size=15, horizontalalignment='right') + i += 1 + + plt.tight_layout() + plt.show() -# plot_decision_regions(clf, X, y, use_aggregation=False) path = '/Users/stephane.gaiffas/Downloads/' @@ -141,25 +207,50 @@ def plot_decisions(clfs, datasets, names, use_aggregation=None): # plt.savefig(os.path.join(path, 'online1.pdf')) -n_trees = 10 - -datasets = [ - (X, y) +n_trees = 50 + +X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) +linearly_separable = (X, y) + +datasets = [make_moons(n_samples=n_samples, noise=0.3, random_state=0), + make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), + linearly_separable + ] + +# datasets = [ +# (X, y) +# ] + +# clfs = [ +# OnlineForestClassifier(n_trees=n_trees, seed=123, step=0.25), +# ExtraTreesRegressor(n_estimators=n_trees), +# RandomForestRegressor(n_estimators=n_trees) +# ] + +classifiers = [ + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) ] - -clfs = [ - OnlineForestClassifier(n_trees=n_trees, seed=123, step=0.25), - ExtraTreesRegressor(n_estimators=n_trees), - RandomForestRegressor(n_estimators=n_trees) -] - names = [ "Online forest", "Extra trees", "Breiman RF" ] -plot_decisions(clfs, datasets, names) + +# forest = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1.) 
+# print(y) + +# forest.fit(X, y) +# forest.predict(X) + + +plot_decision_classification(classifiers, datasets, names) plt.show() # forest = OnlineForestRegressor(n_trees=n_trees, seed=123, step=0.25) diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index 16f32b12e..0421f4bc4 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -82,8 +82,10 @@ class OnlineForestClassifier(ABC, Base): _cpp_obj_name = "_forest" + # TODO: n_classes must be mandatory + @actual_kwargs - def __init__(self, n_trees: int = 10, step: float = 1., + def __init__(self, n_trees: int = 10, n_classes: int=2, step: float = 1., criterion: str = 'log', max_depth: int = -1, min_samples_split: int = 50, n_threads: int = 1, seed: int = -1, verbose: bool = True, @@ -93,6 +95,7 @@ def __init__(self, n_trees: int = 10, step: float = 1., self._actual_kwargs = {} self._fitted = False self.n_trees = n_trees + self.n_classes = n_classes self.step = step self.criterion = criterion self.max_depth = max_depth @@ -103,6 +106,7 @@ def __init__(self, n_trees: int = 10, step: float = 1., self.warm_start = warm_start self.n_splits = n_splits self._forest = _OnlineForestClassifier(n_trees, + n_classes, step, self._criterion, #max_depth, @@ -130,7 +134,7 @@ def apply(self, X): """ raise NotImplementedError() - def predict(self, X, use_aggregation: bool=True): + def predict_proba(self, X, use_aggregation: bool=True): """Predict class for given samples Parameters @@ -140,17 +144,20 @@ def predict(self, X, use_aggregation: bool=True): Returns ------- - output : `np.array`, shape=(n_samples,) + output : `np.ndarray`, shape=(n_samples, n_classes) Returns predicted values. """ import numpy as np - y_pred = np.empty(X.shape[0]) + scores = np.empty((X.shape[0], self.n_classes)) if not self._fitted: raise ValueError("You must call ``fit`` before") else: X = safe_array(X) - self._forest.predict(X, y_pred, True) - return y_pred + self._forest.predict(X, scores, True) + return scores + + def clear(self): + self._forest.clear() def score(self, X, y): from sklearn.metrics import r2_score diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp index 3098385d9..253ae48eb 100644 --- a/tick/inference/src/online_forest_regressor.cpp +++ b/tick/inference/src/online_forest_regressor.cpp @@ -329,9 +329,9 @@ double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { } ulong current = leaf; // The child of the current node that does not contain the data -// ulong other_index; -// ulong parent; -// ulong data_index; + ulong other_index; + ulong parent; + ulong data_index; double pred; while (true) { // std::cout << "node: " << current << std::endl; diff --git a/video.py b/video.py new file mode 100644 index 000000000..baa67fc77 --- /dev/null +++ b/video.py @@ -0,0 +1,56 @@ + +import matplotlib.animation as animation + +from sklearn.model_selection import train_test_split +import numpy as np +from tick.inference import OnlineForestClassifier +from sklearn.datasets import make_moons, make_classification, make_circles +from sklearn.metrics import roc_auc_score +import matplotlib.pyplot as plt + +n_samples = 500 +n_features = 2 +seed = 123 + +X, y = make_moons(n_samples=n_samples, noise=0.3, random_state=0) + +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.5, random_state=42) + +h = .1 +x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 +y_min, y_max = X[:, 1].min() - 
.5, X[:, 1].max() + .5
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                     np.arange(y_min, y_max, h))
+Z = np.zeros(xx.shape)
+
+cm = plt.cm.RdBu
+
+fig = plt.figure(figsize=(5, 5))
+ax = plt.subplot(1, 1, 1)
+
+ax.set_xlim(xx.min(), xx.max())
+ax.set_ylim(yy.min(), yy.max())
+ax.set_xticks(())
+ax.set_yticks(())
+
+ax.scatter(X_train[:2, 0], X_train[:2, 1], c=np.array([0, 1]), s=25, cmap=cm)
+
+n_trees = 50
+clf = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1e-1)
+
+def animate(i):
+    clf.fit(X_train[i, :].reshape(1, 2), np.array([y_train[i]]))
+    Z = clf.predict_proba(np.array([xx.ravel(), yy.ravel()]).T)[:, 1]
+    Z = Z.reshape(xx.shape)
+    ax.contourf(xx, yy, Z, cmap=cm, alpha=.5)
+    ax.scatter(X_train[:i, 0], X_train[:i, 1], c=y_train[:i], s=25, cmap=cm)
+    score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
+    fig.suptitle('test auc: %.2f' % score, fontsize=18)
+    return ax
+
+# Interval between frames, in milliseconds (FuncAnimation expects ms, not seconds)
+interval = 10
+ani = animation.FuncAnimation(fig, animate, 200, interval=interval)
+
+plt.show()
From da90adb936938116684a82238cc18b91b13ff99b Mon Sep 17 00:00:00 2001
From: Stephane Gaiffas
Date: Fri, 17 Nov 2017 00:04:11 +0100
Subject: [PATCH 08/32] ...

---
 .../src/online_forest_classifier.cpp          | 486 ++++++++++++++++++
 tick/inference/src/online_forest_classifier.h | 318 ++++++++++++
 .../inference/swig/online_forest_classifier.i |  47 ++
 3 files changed, 851 insertions(+)
 create mode 100644 tick/inference/src/online_forest_classifier.cpp
 create mode 100644 tick/inference/src/online_forest_classifier.h
 create mode 100644 tick/inference/swig/online_forest_classifier.i

diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp
new file mode 100644
index 000000000..841a4948f
--- /dev/null
+++ b/tick/inference/src/online_forest_classifier.cpp
@@ -0,0 +1,486 @@
+
+// License: BSD 3 clause
+
+#include "online_forest_classifier.h"
+
+/*********************************************************************************
+ * NodeClassifier methods
+ *********************************************************************************/
+
+NodeClassifier::NodeClassifier(TreeClassifier &tree, ulong parent)
+    : _tree(tree) {
+  _parent = parent;
+  _left = 0;
+  _right = 0;
+  _n_samples = 0;
+  _is_leaf = true;
+  _weight = 0;
+  _weight_tree = 0;
+  _counts = ArrayULong(n_classes());
+  _counts.fill(0);
+}
+
+NodeClassifier::NodeClassifier(const NodeClassifier &node)
+    : _tree(node._tree),
+      _parent(node._parent), _left(node._left), _right(node._right),
+      _feature(node._feature), _threshold(node._threshold),
+      _n_samples(node._n_samples),
+      _x_t(node._x_t),
+      _y_t(node._y_t),
+      _weight(node._weight), _weight_tree(node._weight_tree),
+      _is_leaf(node._is_leaf),
+      _counts(node._counts) {}
+
+NodeClassifier::NodeClassifier(const NodeClassifier &&node) : _tree(node._tree) {
+  _parent = node._parent;
+  _left = node._left;
+  _right = node._right;
+  _feature = node._feature;
+  _threshold = node._threshold;
+  _n_samples = node._n_samples;
+  _x_t = node._x_t;
+  _y_t = node._y_t;
+  _weight = node._weight;
+  _weight_tree = node._weight_tree;
+  _is_leaf = node._is_leaf;
+  _counts = node._counts;
+}
+
+void NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) {
+  _n_samples++;
+  _weight -= step() * loss(y_t);
+  update_predict(y_t);
+}
+
+void NodeClassifier::update_upwards() {
+  if (_is_leaf) {
+    _weight_tree = _weight;
+  } else {
+    _weight_tree = log_sum_2_exp(_weight, node(_left).weight_tree() + node(_right).weight_tree());
+  }
+}
+
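The two quantities maintained above drive the whole aggregation scheme: `_weight` stores the logarithm of the exponential weight exp(-step * cumulative log-loss of the node), and `update_upwards` maintains the recursion w_tree(v) = 1/2 w(v) + 1/2 w_tree(left(v)) w_tree(right(v)) in the log domain, which stays stable even when the raw weights would underflow. A minimal standalone sketch of this recursion, independent of tick's array types (not part of the patch; all numeric values are made up for illustration):

#include <cmath>
#include <cstdio>

// log((exp(a) + exp(b)) / 2), factoring out the larger exponent first
static double log_sum_2_exp(double a, double b) {
  if (a > b) {
    return a + std::log((1 + std::exp(b - a)) / 2);
  } else {
    return b + std::log((1 + std::exp(a - b)) / 2);
  }
}

int main() {
  double step = 1.0;
  // A node that incurred log-losses 0.7 and 0.2 on the two samples it saw:
  double weight = -step * (0.7 + 0.2);
  // Subtree weights of its two children (leaves, for which weight_tree == weight):
  double left_tree = -0.5, right_tree = -0.3;
  // What update_upwards computes for an internal node:
  double weight_tree = log_sum_2_exp(weight, left_tree + right_tree);
  std::printf("weight_tree = %.6f\n", weight_tree);  // log((e^-0.9 + e^-0.8) / 2)
  return 0;
}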
+void NodeClassifier::update_predict(const double y_t) {
+  // We update the counts for the class y_t
+  _counts[static_cast<uint8_t>(y_t)]++;
+}
+
+double NodeClassifier::score(uint8_t c) const {
+  // Using a Dirichlet(1/2, ..., 1/2) prior
+  return static_cast<double>(2 * _counts[c] + 1) / (2 * _n_samples + n_classes());
+}
+
+inline void NodeClassifier::predict(ArrayDouble& scores) const {
+  for (uint8_t c=0; c < n_classes(); ++c) {
+    scores[c] = score(c);
+  }
+}
+
+double NodeClassifier::loss(const double y_t) {
+  // Log-loss
+  uint8_t c = static_cast<uint8_t>(y_t);
+  return -std::log(score(c));
+}
+
+inline NodeClassifier &NodeClassifier::node(ulong index) const {
+  return _tree.node(index);
+}
+
+ulong NodeClassifier::n_features() const {
+  return _tree.n_features();
+}
+
+uint8_t NodeClassifier::n_classes() const {
+  return _tree.n_classes();
+}
+
+inline double NodeClassifier::step() const {
+  return _tree.step();
+}
+
+inline ulong NodeClassifier::parent() const {
+  return _parent;
+}
+
+inline ulong NodeClassifier::left() const {
+  return _left;
+}
+
+inline NodeClassifier &NodeClassifier::set_left(ulong left) {
+  _left = left;
+  return *this;
+}
+
+inline ulong NodeClassifier::right() const {
+  return _right;
+}
+
+inline NodeClassifier &NodeClassifier::set_right(ulong right) {
+  _right = right;
+  return *this;
+}
+
+inline bool NodeClassifier::is_leaf() const {
+  return _is_leaf;
+}
+
+inline NodeClassifier &NodeClassifier::set_is_leaf(bool is_leaf) {
+  _is_leaf = is_leaf;
+  return *this;
+}
+
+inline ulong NodeClassifier::feature() const {
+  return _feature;
+}
+
+inline NodeClassifier &NodeClassifier::set_feature(ulong feature) {
+  _feature = feature;
+  return *this;
+}
+
+inline double NodeClassifier::threshold() const {
+  return _threshold;
+}
+
+inline NodeClassifier &NodeClassifier::set_threshold(double threshold) {
+  _threshold = threshold;
+  return *this;
+}
+
+inline ulong NodeClassifier::n_samples() const {
+  return _n_samples;
+}
+
+inline NodeClassifier &NodeClassifier::set_n_samples(ulong n_samples) {
+  _n_samples = n_samples;
+  return *this;
+}
+
+inline double NodeClassifier::weight() const {
+  return _weight;
+}
+
+inline NodeClassifier &NodeClassifier::set_weight(double weight) {
+  _weight = weight;
+  return *this;
+}
+
+inline double NodeClassifier::weight_tree() const {
+  return _weight_tree;
+}
+
+inline NodeClassifier &NodeClassifier::set_weight_tree(double weight_tree) {
+  _weight_tree = weight_tree;
+  return *this;
+}
+
+inline const ArrayDouble &NodeClassifier::x_t() const {
+  return _x_t;
+}
+
+inline NodeClassifier &NodeClassifier::set_x_t(const ArrayDouble &x_t) {
+  _x_t = x_t;
+  return *this;
+}
+
+inline double NodeClassifier::y_t() const {
+  return _y_t;
+}
+
+inline NodeClassifier &NodeClassifier::set_y_t(const double y_t) {
+  _y_t = y_t;
+  return *this;
+}
+
+void NodeClassifier::print() {
+  std::cout << "Node(parent: " << _parent
+            << ", left: " << _left
+            << ", right: " << _right
+            << ", n_samples: " << _n_samples
+            << ", is_leaf: " << _is_leaf
+            << ", feature: " << _feature
+            << ", thresh: " << _threshold
+            << ", scores: [" << std::setprecision(2) << score(0) << ", " << std::setprecision(2) << score(1) << "]"
+            << ", counts: [" << std::setprecision(2) << _counts[0] << ", " << std::setprecision(2) << _counts[1] << "]"
+            << ", weight: " << _weight
+            << ", weight_tree: " << _weight_tree
+            << ")\n";
+}
+
+/*********************************************************************************
+* TreeClassifier methods
+*********************************************************************************/
+
+TreeClassifier::TreeClassifier(const TreeClassifier &tree)
+    : forest(tree.forest), nodes(tree.nodes) {}
+
+TreeClassifier::TreeClassifier(const TreeClassifier &&tree)
+    : forest(tree.forest), nodes(tree.nodes) {}
+
+TreeClassifier::TreeClassifier(OnlineForestClassifier &forest) : forest(forest) {
+  // TODO: pre-allocate the vector to make things faster ?
+  add_node(0);
+}
+
+ulong TreeClassifier::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) {
+  // std::cout << "Splitting node " << index << std::endl;
+  ulong left = add_node(index);
+  ulong right = add_node(index);
+  node(index).set_left(left).set_right(right).set_is_leaf(false);
+
+  // TODO: better feature sampling
+  ulong feature = forest.sample_feature();
+
+  double x1_tj = x_t[feature];
+  double x2_tj = node(index).x_t()[feature];
+  double threshold;
+
+  // The leaf that contains the passed sample (x_t, y_t)
+  ulong data_leaf;
+  ulong other_leaf;
+
+  // std::cout << "x1_tj= " << x1_tj << " x2_tj= " << x2_tj << " threshold= " << threshold << std::endl;
+  // TODO: what if x1_tj == x2_tj. Must be taken care of by sample_feature()
+  if (x1_tj < x2_tj) {
+    threshold = forest.sample_threshold(x1_tj, x2_tj);
+    data_leaf = left;
+    other_leaf = right;
+  } else {
+    threshold = forest.sample_threshold(x2_tj, x1_tj);
+    data_leaf = right;
+    other_leaf = left;
+  }
+  // TODO: code a move_sample
+  NodeClassifier & current_node = node(index);
+  NodeClassifier & data_node = node(data_leaf);
+  NodeClassifier & other_node = node(other_leaf);
+  current_node.set_feature(feature).set_threshold(threshold);
+  // We pass the sample to the new leaves, and initialize the _label_average with the value
+  data_node.set_x_t(x_t).set_y_t(y_t);
+
+  // other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t());
+  other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t());
+
+  // Update downwards of v'
+  other_node.update_downwards(current_node.x_t(), current_node.y_t());
+  // Update upwards of v': it's a leaf
+  other_node.update_upwards();
+  // node(other_leaf).set_weight_tree(node(other_leaf).weight());
+  // Update downwards of v''
+  data_node.update_downwards(x_t, y_t);
+  // Note: the update_up of v'' is done in the go_up method, called in fit()
+  // std::cout << "Done splitting node." << std::endl;
+  return data_leaf;
+}
+
+ulong TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) {
+  // Find the leaf that contains the sample
+  // Start at the root. Index of the root is always 0
+  // If predict == true, this call to find_leaf is for
+  // prediction only, so that no leaf update and splits can be done
+  // std::cout << "Going downwards" << std::endl;
+  ulong index_current_node = 0;
+  bool is_leaf = false;
+  while (!is_leaf) {
+    // Get the current node
+    NodeClassifier &current_node = node(index_current_node);
+    if (!predict) {
+      current_node.update_downwards(x_t, y_t);
+    }
+    // Is the node a leaf ?
+    is_leaf = current_node.is_leaf();
+    if (!is_leaf) {
+      if (x_t[current_node.feature()] <= current_node.threshold()) {
+        index_current_node = current_node.left();
+      } else {
+        index_current_node = current_node.right();
+      }
+    }
+  }
+  // std::cout << "Done going downwards" << std::endl;
+  return index_current_node;
+}
+
+void TreeClassifier::go_upwards(ulong leaf_index) {
+  // std::cout << "Going upwards" << std::endl;
+  ulong current = leaf_index;
+  while (true) {
+    NodeClassifier &current_node = node(current);
+    current_node.update_upwards();
+    // The root node must be updated as well, hence the loop breaks only after the update
+    if (current == 0) {
+      // std::cout << "Done going upwards" << std::endl;
+      break;
+    }
+    current = node(current).parent();
+  }
+}
+
+inline ulong TreeClassifier::n_nodes() const {
+  return _n_nodes;
+}
+
+void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) {
+  // TODO: Test that the size does not change within successive calls to fit
+// std::cout << "iteration: " << iteration << std::endl;
+// print();
+  if (iteration == 0) {
+    nodes[0].set_x_t(x_t).set_y_t(y_t);
+    iteration++;
+    return;
+  }
+  ulong leaf = go_downwards(x_t, y_t, false);
+  ulong new_leaf = split_leaf(leaf, x_t, y_t);
+// for(ulong j=0; j < n_features(); ++j) {
+//   double delta = std::abs(x_t[j] - node(leaf).sample().first[j]);
+//   if (delta > 0.) {
+//     new_leaf = split_node(leaf, x_t, y_t);
+//     break;
+//   }
+// }
+  go_upwards(new_leaf);
+  iteration++;
+}
+
+void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores) {
+  // std::cout << "Going downwards" << std::endl;
+  ulong leaf = go_downwards(x_t, 0., true);
+  // std::cout << "Done." << std::endl;
+  ulong current = leaf;
+  // Buffer for the class scores of the nodes met along the upward path
+  ArrayDouble pred_new(n_classes());
+  while (true) {
+    // std::cout << "node: " << current << std::endl;
+    NodeClassifier &current_node = node(current);
+    if (current_node.is_leaf()) {
+      current_node.predict(scores);
+    } else {
+      double w = std::exp(current_node.weight() - current_node.weight_tree());
+      // Get the predictions of the current node
+      current_node.predict(pred_new);
+      for(uint8_t c = 0; c < n_classes(); ++c) {
+        scores[c] = 0.5 * w * pred_new[c] + (1 - 0.5 * w) * scores[c];
+      }
+    }
+    // The root contributes to the prediction as well
+    if (current == 0) {
+      break;
+    }
+    current = current_node.parent();
+  }
+}
+
+ulong TreeClassifier::add_node(ulong parent) {
+  nodes.emplace_back(*this, parent);
+  return _n_nodes++;
+}
+
+inline ulong TreeClassifier::n_features() const {
+  return forest.n_features();
+}
+
+inline uint8_t TreeClassifier::n_classes() const {
+  return forest.n_classes();
+}
+
+inline double TreeClassifier::step() const {
+  return forest.step();
+}
+
+inline CriterionClassifier TreeClassifier::criterion() const {
+  return forest.criterion();
+}
+
+/*********************************************************************************
+ * OnlineForestClassifier methods
+ *********************************************************************************/
+
+OnlineForestClassifier::OnlineForestClassifier(uint32_t n_trees,
+                                               uint8_t n_classes,
+                                               double step,
+                                               CriterionClassifier criterion,
+                                               int32_t n_threads,
+                                               int seed,
+                                               bool verbose)
+    : _n_trees(n_trees), _n_classes(n_classes), _n_threads(n_threads),
+      _criterion(criterion), _step(step), _verbose(verbose), trees() {
+  // No iteration so far
+  _iteration = 0;
+  create_trees();
+  // Seed the random number generators
+  set_seed(seed);
+}
+
+OnlineForestClassifier::~OnlineForestClassifier() {}
+
+void OnlineForestClassifier::create_trees() {
+  // Just in case...
+  trees.clear();
+  trees.reserve(_n_trees);
+  for (uint32_t i = 0; i < _n_trees; ++i) {
+    trees.emplace_back(*this);
+  }
+}
+
+void OnlineForestClassifier::fit(const SArrayDouble2dPtr features,
+                                 const SArrayDoublePtr labels) {
+  // std::cout << "OnlineForestClassifier::fit" << std::endl;
+  ulong n_samples = features->n_rows();
+  ulong n_features = features->n_cols();
+  set_n_features(n_features);
+  for (ulong i = 0; i < n_samples; ++i) {
+    for (TreeClassifier &tree : trees) {
+      // Fit the tree online using the new data point
+      tree.fit(view_row(*features, i), (*labels)[i]);
+    }
+    _iteration++;
+  }
+  // std::cout << "Done OnlineForestClassifier::fit" << std::endl;
+}
+
+void OnlineForestClassifier::predict(const SArrayDouble2dPtr features,
+                                     SArrayDouble2dPtr predictions,
+                                     bool use_aggregation) {
+  predictions->fill(0.);
+  if (_iteration > 0) {
+    ulong n_samples = features->n_rows();
+    ArrayDouble scores_tree(_n_classes);
+    scores_tree.fill(0.);
+    ArrayDouble scores_forest(_n_classes);
+    scores_forest.fill(0.);
+    for (ulong i = 0; i < n_samples; ++i) {
+      // The forest prediction is simply the average of the tree predictions
+      ArrayDouble scores_i = view_row(*predictions, i);
+      for (TreeClassifier &tree : trees) {
+        tree.predict(view_row(*features, i), scores_tree);
+        // TODO: use a .incr method instead ??
+        scores_i.mult_incr(scores_tree, 1.);
+      }
+      scores_i /= _n_trees;
+    }
+  } else {
+    TICK_ERROR("You must call ``fit`` before ``predict``.")
+  }
+}
+
+void OnlineForestClassifier::clear() {
+  create_trees();
+  _iteration = 0;
+}
+
+inline ulong OnlineForestClassifier::sample_feature() {
+  return rand.uniform_int(0L, n_features() - 1);
+}
+
+inline double OnlineForestClassifier::sample_threshold(double left, double right) {
+  return rand.uniform(left, right);
+}
+
+//inline bool OnlineForestClassifier::verbose() const {
+//  return _verbose;
+//}
+//
+//inline OnlineForestClassifier &OnlineForestClassifier::set_verbose(bool verbose) {
+//  _verbose = verbose;
+//  return *this;
+//}
diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h
new file mode 100644
index 000000000..ab6fbc92f
--- /dev/null
+++ b/tick/inference/src/online_forest_classifier.h
@@ -0,0 +1,318 @@
+
+#ifndef TICK_ONLINE_FOREST_CLASSIFIER_H
+#define TICK_ONLINE_FOREST_CLASSIFIER_H
+
+// License: BSD 3 clause
+
+#include "base.h"
+#include <iomanip>
+#include "../../random/src/rand.h"
+
+
+// TODO: be very careful with binary features: if the range is 0 on all coordinates, do nothing
+// TODO: code a classifier
+
+// TODO: choose the feature proportionally to the ratio of the feature ranges, but beware of
+//  discrete features
+// TODO: an option to create an empty cell, namely forget the data in a cell once it has been split
+
+// TODO: choice of the feature using the labels
+
+// TODO: for classification, do not use the raw frequencies but regularized ones, with a Dirichlet prior: p_c = (n_c + 1/2) / (\sum_c n_c + C / 2). Make it an option
+
+// TODO: check that not using reserve in the forest works as well...
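The last TODO is exactly what `NodeClassifier::score` implements in the class declared below: regularized class frequencies under a Dirichlet(1/2, ..., 1/2) prior, p_c = (n_c + 1/2) / (n + C/2), computed in the code as (2 n_c + 1) / (2 n + C). A standalone numerical check of this smoothing (plain C++, not part of the patch; the per-class counts are made up):

#include <cstdio>

int main() {
  const int C = 2;                 // number of classes
  int counts[C] = {3, 1};          // made-up per-class counts in a node
  int n = counts[0] + counts[1];   // the node has seen 4 samples
  for (int c = 0; c < C; ++c) {
    // Same value as (counts[c] + 0.5) / (n + C / 2.0)
    double p_c = (2.0 * counts[c] + 1) / (2.0 * n + C);
    std::printf("p_%d = %.2f\n", c, p_c);  // prints p_0 = 0.70, then p_1 = 0.30
  }
  return 0;
}

Unlike raw frequencies, the smoothed estimate never returns a zero probability, so the log-loss used for aggregation stays finite even for classes a node has never seen.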
+
+
+enum class CriterionClassifier {
+  log = 0,
+};
+
+
+class TreeClassifier;
+
+/*********************************************************************************
+ * NodeClassifier
+ *********************************************************************************/
+
+class NodeClassifier {
+ protected:
+  // Tree containing the node
+  TreeClassifier &_tree;
+  // Index of the parent
+  ulong _parent;
+  // Index of the left child
+  ulong _left;
+  // Index of the right child
+  ulong _right;
+  // Index of the feature used for the split
+  ulong _feature;
+  // Threshold used for the split
+  double _threshold;
+  // Number of samples in the node
+  ulong _n_samples;
+  // The features of the sample saved in the node
+  // TODO: use a unique_ptr on x_t
+  ArrayDouble _x_t;
+  // The label of the sample saved in the node
+  double _y_t;
+  // Logarithm of the aggregation weight for the node
+  double _weight;
+  // Logarithm of the aggregation weight for the sub-tree starting at this node
+  double _weight_tree;
+  // true if the node is a leaf
+  bool _is_leaf;
+  // Counts the number of samples seen in each class
+  ArrayULong _counts;
+
+ public:
+  NodeClassifier(TreeClassifier &tree, ulong parent);
+  NodeClassifier(const NodeClassifier &node);
+  NodeClassifier(const NodeClassifier &&node);
+  NodeClassifier &operator=(const NodeClassifier &) = delete;
+  NodeClassifier &operator=(const NodeClassifier &&) = delete;
+
+  // Computation of log( (e^a + e^b) / 2) in an overflow-proof way
+  inline static double log_sum_2_exp(const double a, const double b) {
+    // TODO if |a - b| > 50 skip
+    if (a > b) {
+      return a + std::log((1 + std::exp(b - a)) / 2);
+    } else {
+      return b + std::log((1 + std::exp(a - b)) / 2);
+    }
+  }
+
+  // Update to apply to a node when going forward in the tree (towards leaves)
+  void update_downwards(const ArrayDouble &x_t, const double y_t);
+  // Update to apply to a node when going upward in the tree (towards the root)
+  void update_upwards();
+  // Update the prediction of the label
+  void update_predict(const double y_t);
+  // Predict function (average of the labels of samples that passed through the node)
+  void predict(ArrayDouble& scores) const;
+  // Loss function used for aggregation
+
+  double score(uint8_t y) const;
+
+  double loss(const double y_t);
+  // Get node at index in the tree
+  inline NodeClassifier &node(ulong index) const;
+  // Get number of features
+  inline ulong n_features() const;
+  // Number of classes
+  inline uint8_t n_classes() const;
+  // Step to use for aggregation
+  inline double step() const;
+  // Print of the node
+  void print();
+
+  inline ulong parent() const;
+  inline ulong left() const;
+  inline NodeClassifier &set_left(ulong left);
+  inline ulong right() const;
+  inline NodeClassifier &set_right(ulong right);
+  inline bool is_leaf() const;
+  inline NodeClassifier &set_is_leaf(bool is_leaf);
+  inline ulong feature() const;
+  inline NodeClassifier &set_feature(ulong feature);
+  inline double threshold() const;
+  inline NodeClassifier &set_threshold(double threshold);
+  inline ulong n_samples() const;
+  inline NodeClassifier &set_n_samples(ulong n_samples);
+  inline double weight() const;
+  inline NodeClassifier &set_weight(double weight);
+  inline double weight_tree() const;
+  inline NodeClassifier &set_weight_tree(double weight);
+  inline const ArrayDouble &x_t() const;
+  inline NodeClassifier &set_x_t(const ArrayDouble &x_t);
+  inline double y_t() const;
+  inline NodeClassifier &set_y_t(const double y_t);
+};
+
+class OnlineForestClassifier;
+
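At prediction time, `TreeClassifier::predict` (declared below) combines these per-node estimates along the path from the leaf back to the root: each ancestor mixes its own scores into the running ones via scores <- (w/2) * node_scores + (1 - w/2) * scores, where w = exp(weight - weight_tree), so that w/2 always lies in (0, 1]. A self-contained sketch of this mixing with a made-up three-node path (plain C++, not part of the patch; node_pred and half_w stand in for the node predictions and aggregation weights):

#include <cstdio>

int main() {
  const int n_classes = 2;
  // Three nodes on a leaf-to-root path, leaf first; all values are illustrative.
  double node_pred[3][n_classes] = {{0.9, 0.1}, {0.7, 0.3}, {0.6, 0.4}};
  double half_w[3] = {0.0, 0.45, 0.40};  // 0.5 * exp(weight - weight_tree); unused for the leaf
  double scores[n_classes];
  // The leaf contributes its raw (Dirichlet-smoothed) estimate...
  for (int c = 0; c < n_classes; ++c) scores[c] = node_pred[0][c];
  // ...and every ancestor shrinks the running scores towards its own estimate.
  for (int k = 1; k < 3; ++k) {
    for (int c = 0; c < n_classes; ++c) {
      scores[c] = half_w[k] * node_pred[k][c] + (1 - half_w[k]) * scores[c];
    }
  }
  std::printf("scores = [%.3f, %.3f]\n", scores[0], scores[1]);  // [0.726, 0.274]
  return 0;
}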
+/*********************************************************************************
+ * TreeClassifier
+ *********************************************************************************/
+
+class TreeClassifier {
+ protected:
+  // The forest of the tree
+  OnlineForestClassifier &forest;
+  // Number of nodes in the tree
+  ulong _n_nodes = 0;
+  // Iteration counter
+  ulong iteration = 0;
+  // Nodes of the tree
+  std::vector<NodeClassifier> nodes = std::vector<NodeClassifier>();
+  // Split the node at given index
+  ulong split_leaf(ulong index, const ArrayDouble &x_t, double y_t);
+  // Add nodes in the tree
+  ulong add_node(ulong parent);
+
+  ulong go_downwards(const ArrayDouble &x_t, double y_t, bool predict);
+  void go_upwards(ulong leaf_index);
+
+ public:
+  TreeClassifier(OnlineForestClassifier &forest);
+  TreeClassifier(const TreeClassifier &tree);
+  TreeClassifier(const TreeClassifier &&tree);
+  TreeClassifier &operator=(const TreeClassifier &) = delete;
+  TreeClassifier &operator=(const TreeClassifier &&) = delete;
+
+  void fit(const ArrayDouble &x_t, double y_t);
+  void predict(const ArrayDouble &x_t, ArrayDouble &scores);
+
+  inline ulong n_features() const;
+  inline uint8_t n_classes() const;
+  inline ulong n_nodes() const;
+  inline double step() const;
+
+  void print() {
+    std::cout << "Tree(n_nodes: " << _n_nodes << std::endl;
+    std::cout << " ";
+    for (NodeClassifier &node : nodes) {
+      node.print();
+    }
+    std::cout << ")";
+  }
+
+  inline CriterionClassifier criterion() const;
+
+  NodeClassifier &node(ulong index) {
+    return nodes[index];
+  }
+};
+
+/*********************************************************************************
+ * OnlineForestClassifier
+ *********************************************************************************/
+
+class OnlineForestClassifier {
+ private:
+  // Number of Trees in the forest
+  uint32_t _n_trees;
+  // Number of threads to use for parallel growing of trees
+  int32_t _n_threads;
+  // CriterionClassifier used for splitting (not used for now)
+  CriterionClassifier _criterion;
+  // Step-size used for aggregation
+  double _step;
+  // Number of features.
+  ulong _n_features;
+  // Number of classes in the classification problem
+  uint8_t _n_classes;
+  // Seed for random number generation
+  int _seed;
+  // Verbose things or not
+  bool _verbose;
+  // Iteration counter
+  ulong _iteration;
+  // The list of trees in the forest
+  std::vector<TreeClassifier> trees;
+  // Random number generator for feature and threshold sampling
+  Rand rand;
+  // Create trees
+  void create_trees();
+
+ public:
+  OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step, CriterionClassifier criterion,
+                         int32_t n_threads, int seed, bool verbose);
+  virtual ~OnlineForestClassifier();
+
+  void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels);
+  void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions, bool use_aggregation);
+
+  inline ulong sample_feature();
+  inline double sample_threshold(double left, double right);
+
+  void clear();
+
+  inline double step() const {
+    return _step;
+  }
+
+  void print() {
+    for (TreeClassifier &tree: trees) {
+      tree.print();
+    }
+  }
+
+  inline ulong n_samples() const {
+    if (_iteration > 0) {
+      return _iteration;
+    } else {
+      TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.")
+    }
+  }
+
+  inline ulong n_features() const {
+    if (_iteration > 0) {
+      return _n_features;
+    } else {
+      TICK_ERROR("You must call ``fit`` before asking for ``n_features``.")
+    }
+  }
+
+  inline uint8_t n_classes() const {
+    return _n_classes;
+  }
+
+  OnlineForestClassifier & set_n_classes(uint8_t n_classes) {
+    if (_iteration == 0) {
+      _n_classes = n_classes;
+    } else {
+      TICK_ERROR("OnlineForestClassifier::set_n_classes can be called only once !")
+    }
+    return *this;
+  }
+
+  inline OnlineForestClassifier &set_n_features(ulong n_features) {
+    if (_iteration == 0) {
+      _n_features = n_features;
+    }
+    return *this;
+  }
+
+  inline uint32_t n_trees() const {
+    return _n_trees;
+  }
+
+  inline OnlineForestClassifier &set_n_trees(uint32_t n_trees) {
+    _n_trees = n_trees;
+    return *this;
+  }
+
+  inline int32_t n_threads() const {
+    return _n_threads;
+  }
+
+  inline CriterionClassifier criterion() const {
+    return _criterion;
+  }
+
+  inline int seed() const {
+    return _seed;
+  }
+
+  inline OnlineForestClassifier &set_seed(int seed) {
+    _seed = seed;
+    rand.reseed(seed);
+    return *this;
+  }
+
+  OnlineForestClassifier &set_n_threads(int32_t n_threads) {
+    _n_threads = n_threads;
+    return *this;
+  }
+
+  inline OnlineForestClassifier &set_criterion(CriterionClassifier criterion) {
+    _criterion = criterion;
+    return *this;
+  }
+
+
+// inline bool verbose() const;
+// inline OnlineForestClassifier &set_verbose(bool verbose);
+};
+
+#endif //TICK_ONLINE_FOREST_CLASSIFIER_H
diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i
new file mode 100644
index 000000000..d06c61bb7
--- /dev/null
+++ b/tick/inference/swig/online_forest_classifier.i
@@ -0,0 +1,47 @@
+// License: BSD 3 clause
+
+%include std_shared_ptr.i
+%shared_ptr(OnlineForestClassifier);
+
+%{
+#include "online_forest_classifier.h"
+%}
+
+
+enum class CriterionClassifier {
+  log = 0
+};
+
+class OnlineForestClassifier {
+ public:
+  OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step, CriterionClassifier criterion,
+                         int32_t n_threads, int seed, bool verbose);
+
+  void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels);
+  void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions, bool use_aggregation);
+
+  void clear();
+
+  inline double step() const;
+  void print();
+
+  ulong n_samples() const;
+  ulong n_features() const;
+  uint8_t n_classes() const;
+  OnlineForestClassifier & set_n_classes(uint8_t n_classes);
+
+  // OnlineForestClassifier &set_n_features(ulong n_features);
+
+  uint32_t n_trees() const;
+  OnlineForestClassifier &set_n_trees(uint32_t n_trees);
+
+  int32_t n_threads() const;
+  OnlineForestClassifier &set_n_threads(int32_t n_threads);
+  CriterionClassifier criterion() const;
+  OnlineForestClassifier &set_criterion(CriterionClassifier criterion);
+  int seed() const;
+  OnlineForestClassifier &set_seed(int seed);
+  // bool verbose() const;
+  // OnlineForestRegressor &set_verbose(bool verbose);
+};
From 933c91537ff43579cfc8fc2b81913793eb73ea02 Mon Sep 17 00:00:00 2001
From: Stephane Gaiffas
Date: Fri, 17 Nov 2017 00:36:14 +0100
Subject: [PATCH 09/32] ...

---
 video.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/video.py b/video.py
index baa67fc77..b5f05d73a 100644
--- a/video.py
+++ b/video.py
@@ -1,5 +1,6 @@
 
 import matplotlib.animation as animation
+from matplotlib.animation import MovieWriter
 
 from sklearn.model_selection import train_test_split
 import numpy as np
@@ -37,7 +38,7 @@
 ax.scatter(X_train[:2, 0], X_train[:2, 1], c=np.array([0, 1]), s=25, cmap=cm)
 
 n_trees = 50
-clf = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1e-1)
+clf = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1.)
 
 def animate(i):
     clf.fit(X_train[i, :].reshape(1, 2), np.array([y_train[i]]))
From ce3aaa296ede9d6e8d69b5106dfe9aebb51c00cd Mon Sep 17 00:00:00 2001
From: Stephane Gaiffas
Date: Fri, 17 Nov 2017 10:56:25 +0100
Subject: [PATCH 10/32] ...

---
 online_forest.py                              | 152 ++----------------
 online_forest_data.py                         |  47 ++++++
 tick/inference/online_forest_classifier.py    |  12 +-
 .../src/online_forest_classifier.cpp          |  59 +++++--
 tick/inference/src/online_forest_classifier.h |   7 +
 5 files changed, 126 insertions(+), 151 deletions(-)
 create mode 100644 online_forest_data.py

diff --git a/online_forest.py b/online_forest.py
index b8338125a..d912ae9b3 100644
--- a/online_forest.py
+++ b/online_forest.py
@@ -1,40 +1,34 @@
-from tick.simulation import SimuLinReg, SimuLogReg, weights_sparse_gauss
+from tick.simulation import SimuLogReg, weights_sparse_gauss
 from sklearn.model_selection import train_test_split
 import numpy as np
-from tick.inference import OnlineForestRegressor, OnlineForestClassifier
+from tick.inference import OnlineForestClassifier
 from matplotlib.colors import ListedColormap
 
-from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, \
-    RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
 from sklearn.datasets import make_moons, make_classification, make_circles
 from sklearn.metrics import roc_auc_score
 import matplotlib.pyplot as plt
 
 from time import time
 
-n_samples = 500
+n_samples = 1000
 n_features = 2
 seed = 123
 
 np.set_printoptions(precision=2)
 
-
 w0 = weights_sparse_gauss(n_features, nnz=2)
 X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate()
 y = (y + 1) / 2
 
-# X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-
-def plot_decisions_regression(clfs, datasets, names, use_aggregation=None):
+def plot_decisions_regression(clfs, datasets, names):
     i = 1
     h = .02
     fig = plt.figure(figsize=(4 * (len(clfs) + 1), 4 * len(datasets)))
    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):
        X, y = ds
-        # X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, 
y_test = \ train_test_split(X, y, test_size=.4, random_state=42) @@ -42,15 +36,11 @@ def plot_decisions_regression(clfs, datasets, names, use_aggregation=None): y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) - # just plot the dataset first cm = plt.cm.RdBu - cm_bright = ListedColormap(['#FF0000', '#0000FF']) ax = plt.subplot(len(datasets), len(clfs) + 1, i) if ds_cnt == 0: ax.set_title("Input data") - # Plot the training points - # plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=10, cmap=cm) ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=25, cmap=cm) # and testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=25, @@ -60,39 +50,25 @@ def plot_decisions_regression(clfs, datasets, names, use_aggregation=None): ax.set_xticks(()) ax.set_yticks(()) i += 1 - # iterate over classifiers for name, clf in zip(names, clfs): ax = plt.subplot(len(datasets), len(clfs) + 1, i) - - t1 = time() clf.fit(X_train, y_train) - t2 = time() - - # mse = np.linalg.norm(y_test - clf.predict(X_test)) - # score = clf.score(X_test, y_test) - Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T) - # Put the result into a color plot Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) - # Plot also the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm, s=15) # and testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=15, alpha=0.6) - ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) if ds_cnt == 0: ax.set_title(name) - - # ax.text(xx.max() - .3, yy.min() + .3, ('%.2f (%.2f)' % (mse, t2-t1)).lstrip('0'), - # size=15, horizontalalignment='right') i += 1 plt.tight_layout() @@ -109,12 +85,10 @@ def plot_decision_classification(classifiers, datasets, names): X, y = ds X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=.4, random_state=42) - x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) - # just plot the dataset first cm = plt.cm.RdBu cm_bright = ListedColormap(['#FF0000', '#0000FF']) @@ -122,71 +96,27 @@ def plot_decision_classification(classifiers, datasets, names): if ds_cnt == 0: ax.set_title("Input data") # Plot the training points - ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=10, cmap=cm) # and testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=10, alpha=0.6) - - # ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, - # edgecolors='k') - # # and testing points - # ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, - # alpha=0.6, - # edgecolors='k') ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) i += 1 - # iterate over classifiers for name, clf in zip(names, classifiers): ax = plt.subplot(len(datasets), len(classifiers) + 1, i) - if hasattr(clf, 'clear'): clf.clear() clf.fit(X_train, y_train) - Z = clf.predict_proba(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) - # score = clf.score(X_test, y_test) - - # Plot the decision boundary. For that, we will assign a color to - # each - # point in the mesh [x_min, x_max]x[y_min, y_max]. 
- # if hasattr(clf, "decision_function"): - # Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) - # else: - # Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] - - # Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] - - - # Z = Z[:, 1] - # print(Z) - # print(Z.shape) - # print(xx.shape, xx.shape[0] * xx.shape[1]) - - # Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])[:, 1] - # Put the result into a color plot Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) - - # Plot also the training points - # ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm, s=15) - # # and testing points - # ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, - # s=15, alpha=0.6) - - # ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, - # edgecolors='k') - # # and testing points - # ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, - # edgecolors='k', alpha=0.6) - ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) @@ -198,16 +128,12 @@ def plot_decision_classification(classifiers, datasets, names): i += 1 plt.tight_layout() - plt.show() + # plt.show() path = '/Users/stephane.gaiffas/Downloads/' -import os - -# plt.savefig(os.path.join(path, 'online1.pdf')) - -n_trees = 50 +n_trees = 10 X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_informative=2, random_state=1, @@ -216,75 +142,23 @@ def plot_decision_classification(classifiers, datasets, names): X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) -datasets = [make_moons(n_samples=n_samples, noise=0.3, random_state=0), - make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), - linearly_separable - ] - -# datasets = [ -# (X, y) -# ] - -# clfs = [ -# OnlineForestClassifier(n_trees=n_trees, seed=123, step=0.25), -# ExtraTreesRegressor(n_estimators=n_trees), -# RandomForestRegressor(n_estimators=n_trees) -# ] +datasets = [ + make_moons(n_samples=n_samples, noise=0.3, random_state=0), + make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), + linearly_separable +] classifiers = [ OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.), ExtraTreesClassifier(n_estimators=n_trees), RandomForestClassifier(n_estimators=n_trees) ] + names = [ "Online forest", "Extra trees", "Breiman RF" ] - -# forest = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1.) 
-# print(y) - -# forest.fit(X, y) -# forest.predict(X) - - plot_decision_classification(classifiers, datasets, names) plt.show() - -# forest = OnlineForestRegressor(n_trees=n_trees, seed=123, step=0.25) -# -# forest.fit(X, y) -# -# forest.predict(X) - -# plt.savefig(os.path.join(path, 'decisions.pdf')) - - -# plot_decision_regions(clf, X, y, use_aggregation=True) - -# plt.savefig(os.path.join(path, 'online2.pdf')) - -# clf.print() - -# plt.show() - - -# clf.fit(X, y) - -# print(y) -# print(clf.predict(X)) -# clf.print() - - -# plot_decision_regions(clf, X, y, n_iter=None, use_aggregation=True) -# plt.show() - -# exit(0) -# forest = OnlineForestRegressor(n_trees=100, min_samples_split=50) - -# plot_decision_regions(clf, X, y, n_samples) - - -# plt.savefig('/Users/stephane.gaiffas/Downloads/online-forest.pdf') diff --git a/online_forest_data.py b/online_forest_data.py new file mode 100644 index 000000000..f9f56387b --- /dev/null +++ b/online_forest_data.py @@ -0,0 +1,47 @@ + +import os +import pandas as pd +import pickle as pkl + +from tick.inference import OnlineForestRegressor, OnlineForestClassifier +from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, \ + RandomForestClassifier, ExtraTreesClassifier + +path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' + + +# filename = 'dna.p' +# filename = 'letter.p' +# filename = 'satimage.p' +filename = 'usps.p' + +with open(os.path.join(path, filename), 'rb') as f: + data = pkl.load(f) + +X_train = data['x_train'] +X_test = data['x_test'] +y_train = data['y_train'] +y_test = data['y_test'] + +n_classes = y_train.max() + +n_classes = 5 + +print("n_classes:", n_classes) +n_trees = 10 + +classifiers = [ + OnlineForestClassifier(n_trees=n_trees, n_classes=n_classes, + seed=123, step=1.), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) +] +names = [ + "Online forest", + "Extra trees", + "Breiman RF" +] + +for clf, name in zip(classifiers, names): + clf.fit(X_train, y_train) + print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index 0421f4bc4..579ffde73 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -156,11 +156,21 @@ def predict_proba(self, X, use_aggregation: bool=True): self._forest.predict(X, scores, True) return scores + def predict(self, X): + if not self._fitted: + raise ValueError("You must call ``fit`` before") + else: + scores = self.predict_proba(X) + return scores.argmax(axis=1) + def clear(self): self._forest.clear() def score(self, X, y): - from sklearn.metrics import r2_score + from sklearn.metrics import accuracy_score + + y_pred = self.predict(X) + return accuracy_score(y, y_pred) def print(self): self._forest._print() diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 841a4948f..932214665 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -52,6 +52,20 @@ void NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) update_predict(y_t); } +bool NodeClassifier::is_same(const ArrayDouble &x_t) { + if (_is_leaf) { + for (ulong j = 0; j < n_features(); ++j) { + double delta = std::abs(x_t[j] - _x_t[j]); + if (delta > 0.) 
{ + return false; + } + } + return true; + } else { + TICK_ERROR("NodeClassifier::is_same: node is not a leaf !") + } +} + void NodeClassifier::update_upwards() { if (_is_leaf) { _weight_tree = _weight; @@ -228,8 +242,23 @@ ulong TreeClassifier::split_leaf(ulong index, const ArrayDouble &x_t, double y_t ulong right = add_node(index); node(index).set_left(left).set_right(right).set_is_leaf(false); + // std::cout << "n_features(): " << n_features() << std::endl; + ArrayDouble diff(n_features()); + for(ulong j = 0; j < n_features(); ++j) { + // std::cout << "j: " << j; + diff[j] = std::abs(node(index).x_t()[j] - x_t[j]); + } + // std::cout << std::endl; + diff /= diff.sum(); + // diff.print(); + // std::cout << "diff.sum=" << diff.sum() << std::endl; + // TODO: better feature sampling - ulong feature = forest.sample_feature(); + // ulong feature = forest.sample_feature(); + + ulong feature = forest.sample_feature(diff); + + // std::cout << "feature: " << feature << std::endl; double x1_tj = x_t[feature]; double x2_tj = node(index).x_t()[feature]; @@ -322,22 +351,24 @@ inline ulong TreeClassifier::n_nodes() const { void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { // TODO: Test that the size does not change within successive calls to fit -// std::cout << "iteration: " << iteration << std::endl; -// print(); + // std::cout << "iteration: " << iteration << std::endl; + // print(); if (iteration == 0) { nodes[0].set_x_t(x_t).set_y_t(y_t); iteration++; return; } ulong leaf = go_downwards(x_t, y_t, false); - ulong new_leaf = split_leaf(leaf, x_t, y_t); -// for(ulong j=0; j < n_features(); ++j) { -// double delta = std::abs(x_t[j] - node(leaf).sample().first[j]); -// if (delta > 0.) { -// new_leaf = split_node(leaf, x_t, y_t); -// break; -// } -// } + + NodeClassifier& leaf_node = node(leaf); + ulong new_leaf; + bool is_same = leaf_node.is_same(x_t); + // std::cout << "is_same: " << is_same << std::endl; + if (is_same) { + new_leaf = leaf; + } else { + new_leaf = split_leaf(leaf, x_t, y_t); + } go_upwards(new_leaf); iteration++; } @@ -371,7 +402,9 @@ void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores) { } ulong TreeClassifier::add_node(ulong parent) { + // std::cout << "Adding node with parent " << parent << std::endl; nodes.emplace_back(*this, parent); + // std::cout << "Done." 
<< std::endl; return _n_nodes++; } @@ -472,6 +505,10 @@ inline ulong OnlineForestClassifier::sample_feature() { return rand.uniform_int(0L, n_features() - 1); } +inline ulong OnlineForestClassifier::sample_feature(const ArrayDouble & prob) { + return rand.discrete(prob); +} + inline double OnlineForestClassifier::sample_threshold(double left, double right) { return rand.uniform(left, right); } diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index ab6fbc92f..63fc88a05 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -94,6 +94,9 @@ class NodeClassifier { double score(uint8_t y) const; double loss(const double y_t); + + bool is_same(const ArrayDouble &x_t); + // Get node at index in the tree inline NodeClassifier &node(ulong index) const; // Get number of features @@ -170,8 +173,11 @@ class TreeClassifier { void print() { std::cout << "Tree(n_nodes: " << _n_nodes << std::endl; std::cout << " "; + ulong index = 0; for (NodeClassifier &node : nodes) { + std::cout << "index: " << index << " "; node.print(); + index++; } std::cout << ")"; } @@ -223,6 +229,7 @@ class OnlineForestClassifier { void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions, bool use_aggregation); inline ulong sample_feature(); + inline ulong sample_feature(const ArrayDouble & prob); inline double sample_threshold(double left, double right); void clear(); From e3c91f5012eb13ee6cc1b941ab0c129c4e509e2f Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Sun, 19 Nov 2017 15:19:16 +0100 Subject: [PATCH 11/32] Reducing size of a node --- online_forest.py | 5 +- online_forest_data.py | 68 ++++++++------ tick/inference/online_forest_classifier.py | 3 + .../src/online_forest_classifier.cpp | 93 +++++++++++-------- tick/inference/src/online_forest_classifier.h | 85 +++++++++-------- .../inference/swig/online_forest_classifier.i | 2 + tick/random/src/rand.cpp | 5 + tick/random/src/rand.h | 8 ++ 8 files changed, 164 insertions(+), 105 deletions(-) diff --git a/online_forest.py b/online_forest.py index d912ae9b3..bc183906c 100644 --- a/online_forest.py +++ b/online_forest.py @@ -161,4 +161,7 @@ def plot_decision_classification(classifiers, datasets, names): ] plot_decision_classification(classifiers, datasets, names) -plt.show() + +plt.savefig('decisions.pdf') + +# plt.show() diff --git a/online_forest_data.py b/online_forest_data.py index f9f56387b..6ef2d1230 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -7,41 +7,57 @@ from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, \ RandomForestClassifier, ExtraTreesClassifier -path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' - - -# filename = 'dna.p' -# filename = 'letter.p' -# filename = 'satimage.p' -filename = 'usps.p' +import matplotlib.pyplot as plt -with open(os.path.join(path, filename), 'rb') as f: - data = pkl.load(f) +path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' -X_train = data['x_train'] -X_test = data['x_test'] -y_train = data['y_train'] -y_test = data['y_test'] +filenames = [ + 'dna.p', + 'letter.p', + 'satimage.p', + 'usps.p' +] -n_classes = y_train.max() +n_classess = [3, 25, 5, 9] -n_classes = 5 +n_trees = 100 -print("n_classes:", n_classes) -n_trees = 10 -classifiers = [ - OnlineForestClassifier(n_trees=n_trees, n_classes=n_classes, - seed=123, step=1.), - ExtraTreesClassifier(n_estimators=n_trees), - 
RandomForestClassifier(n_estimators=n_trees) -] names = [ "Online forest", "Extra trees", "Breiman RF" ] -for clf, name in zip(classifiers, names): - clf.fit(X_train, y_train) - print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) +for filename, n_classes in zip(filenames, n_classess): + print(filename) + with open(os.path.join(path, filename), 'rb') as f: + data = pkl.load(f) + X_train = data['x_train'] + X_test = data['x_test'] + y_train = data['y_train'] + y_test = data['y_test'] + + triche = RandomForestClassifier(n_estimators=n_trees) + triche.fit(X_train, y_train) + probabilities = triche.feature_importances_ / triche.feature_importances_.sum() + # + # plt.stem(probabilities) + # plt.title('Features importance for ' + filename, fontsize=18) + # plt.xlabel('Features') + # plt.ylabel('Importance') + # # plt.show() + # plt.savefig(filename + '.pdf') + + online_forest = OnlineForestClassifier(n_trees=n_trees, n_classes=n_classes, + seed=123, step=1.) + online_forest.set_probabilities(probabilities) + classifiers = [ + online_forest, + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) + ] + + for clf, name in zip(classifiers, names): + clf.fit(X_train, y_train) + print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index 579ffde73..21c8695cd 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -189,3 +189,6 @@ def criterion(self, value): # self._forest.set_criterion(unif) else: raise ValueError("``criterion`` must be either 'unif' or 'mse'.") + + def set_probabilities(self, probabilities): + self._forest.set_probabilities(probabilities) diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 932214665..01ae06928 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -7,7 +7,7 @@ * NodeClassifier methods *********************************************************************************/ -NodeClassifier::NodeClassifier(TreeClassifier &tree, ulong parent) +NodeClassifier::NodeClassifier(TreeClassifier &tree, uint32_t parent) : _tree(tree) { _parent = parent; _left = 0; @@ -54,7 +54,7 @@ void NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) bool NodeClassifier::is_same(const ArrayDouble &x_t) { if (_is_leaf) { - for (ulong j = 0; j < n_features(); ++j) { + for (uint32_t j = 0; j < n_features(); ++j) { double delta = std::abs(x_t[j] - _x_t[j]); if (delta > 0.) 
{ return false; @@ -96,11 +96,11 @@ double NodeClassifier::loss(const double y_t) { return -std::log(score(c)); } -inline NodeClassifier &NodeClassifier::node(ulong index) const { +inline NodeClassifier &NodeClassifier::node(uint32_t index) const { return _tree.node(index); } -ulong NodeClassifier::n_features() const { +uint32_t NodeClassifier::n_features() const { return _tree.n_features(); } @@ -112,24 +112,24 @@ inline double NodeClassifier::step() const { return _tree.step(); } -inline ulong NodeClassifier::parent() const { +inline uint32_t NodeClassifier::parent() const { return _parent; } -inline ulong NodeClassifier::left() const { +inline uint32_t NodeClassifier::left() const { return _left; } -inline NodeClassifier &NodeClassifier::set_left(ulong left) { +inline NodeClassifier &NodeClassifier::set_left(uint32_t left) { _left = left; return *this; } -inline ulong NodeClassifier::right() const { +inline uint32_t NodeClassifier::right() const { return _right; } -inline NodeClassifier &NodeClassifier::set_right(ulong right) { +inline NodeClassifier &NodeClassifier::set_right(uint32_t right) { _right = right; return *this; } @@ -143,11 +143,11 @@ inline NodeClassifier &NodeClassifier::set_is_leaf(bool is_leaf) { return *this; } -inline ulong NodeClassifier::feature() const { +inline uint32_t NodeClassifier::feature() const { return _feature; } -inline NodeClassifier &NodeClassifier::set_feature(ulong feature) { +inline NodeClassifier &NodeClassifier::set_feature(uint32_t feature) { _feature = feature; return *this; } @@ -161,11 +161,11 @@ inline NodeClassifier &NodeClassifier::set_threshold(double threshold) { return *this; } -inline ulong NodeClassifier::n_samples() const { +inline uint32_t NodeClassifier::n_samples() const { return _n_samples; } -inline NodeClassifier &NodeClassifier::set_n_samples(ulong n_samples) { +inline NodeClassifier &NodeClassifier::set_n_samples(uint32_t n_samples) { _n_samples = n_samples; return *this; } @@ -236,15 +236,15 @@ TreeClassifier::TreeClassifier(OnlineForestClassifier &forest) : forest(forest) add_node(0); } -ulong TreeClassifier::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) { +uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t) { // std::cout << "Splitting node " << index << std::endl; - ulong left = add_node(index); - ulong right = add_node(index); + uint32_t left = add_node(index); + uint32_t right = add_node(index); node(index).set_left(left).set_right(right).set_is_leaf(false); // std::cout << "n_features(): " << n_features() << std::endl; ArrayDouble diff(n_features()); - for(ulong j = 0; j < n_features(); ++j) { + for(uint32_t j = 0; j < n_features(); ++j) { // std::cout << "j: " << j; diff[j] = std::abs(node(index).x_t()[j] - x_t[j]); } @@ -254,9 +254,11 @@ ulong TreeClassifier::split_leaf(ulong index, const ArrayDouble &x_t, double y_t // std::cout << "diff.sum=" << diff.sum() << std::endl; // TODO: better feature sampling + // ulong feature = forest.sample_feature_bis(); + // ulong feature = forest.sample_feature(); - ulong feature = forest.sample_feature(diff); + uint32_t feature = forest.sample_feature(diff); // std::cout << "feature: " << feature << std::endl; @@ -265,8 +267,8 @@ ulong TreeClassifier::split_leaf(ulong index, const ArrayDouble &x_t, double y_t double threshold; // The leaf that contains the passed sample (x_t, y_t) - ulong data_leaf; - ulong other_leaf; + uint32_t data_leaf; + uint32_t other_leaf; // std::cout << "x1_tj= " << x1_tj << " x2_tj= " << x2_tj << " 
threshold= " << threshold << std::endl; // TODO: what if x1_tj == x2_tj. Must be taken care of by sample_feature() @@ -302,13 +304,13 @@ ulong TreeClassifier::split_leaf(ulong index, const ArrayDouble &x_t, double y_t return data_leaf; } -ulong TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { +uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { // Find the leaf that contains the sample // Start at the root. Index of the root is always 0 // If predict == true, this call to find_leaf is for // prediction only, so that no leaf update and splits can be done // std::cout << "Going downwards" << std::endl; - ulong index_current_node = 0; + uint32_t index_current_node = 0; bool is_leaf = false; while (!is_leaf) { // Get the current node @@ -330,9 +332,9 @@ ulong TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool pred return index_current_node; } -void TreeClassifier::go_upwards(ulong leaf_index) { +void TreeClassifier::go_upwards(uint32_t leaf_index) { // std::cout << "Going upwards" << std::endl; - ulong current = leaf_index; + uint32_t current = leaf_index; while (true) { NodeClassifier ¤t_node = node(current); current_node.update_upwards(); @@ -345,7 +347,7 @@ void TreeClassifier::go_upwards(ulong leaf_index) { } } -inline ulong TreeClassifier::n_nodes() const { +inline uint32_t TreeClassifier::n_nodes() const { return _n_nodes; } @@ -358,10 +360,10 @@ void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { iteration++; return; } - ulong leaf = go_downwards(x_t, y_t, false); + uint32_t leaf = go_downwards(x_t, y_t, false); NodeClassifier& leaf_node = node(leaf); - ulong new_leaf; + uint32_t new_leaf; bool is_same = leaf_node.is_same(x_t); // std::cout << "is_same: " << is_same << std::endl; if (is_same) { @@ -375,9 +377,9 @@ void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores) { // std::cout << "Going downwards" << std::endl; - ulong leaf = go_downwards(x_t, 0., true); + uint32_t leaf = go_downwards(x_t, 0., true); // std::cout << "Done." << std::endl; - ulong current = leaf; + uint32_t current = leaf; // The child of the current node that does not contain the data ArrayDouble pred_new(n_classes()); while (true) { @@ -401,14 +403,14 @@ void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores) { } } -ulong TreeClassifier::add_node(ulong parent) { +uint32_t TreeClassifier::add_node(uint32_t parent) { // std::cout << "Adding node with parent " << parent << std::endl; nodes.emplace_back(*this, parent); // std::cout << "Done." 
<< std::endl;
   return _n_nodes++;
 }
 
-inline ulong TreeClassifier::n_features() const {
+inline uint32_t TreeClassifier::n_features() const {
   return forest.n_features();
 }
 
@@ -439,6 +441,15 @@ OnlineForestClassifier::OnlineForestClassifier(uint32_t n_trees,
       _criterion(criterion), _step(step), _verbose(verbose), trees() {
   // No iteration so far
   _iteration = 0;
+
+  std::cout << "sizeof(float): " << sizeof(float) << std::endl;
+  std::cout << "sizeof(double): " << sizeof(double) << std::endl;
+  std::cout << "sizeof(uint8_t): " << sizeof(uint8_t) << std::endl;
+  std::cout << "sizeof(uint16_t): " << sizeof(uint16_t) << std::endl;
+  std::cout << "sizeof(uint32_t): " << sizeof(uint32_t) << std::endl;
+  std::cout << "sizeof(long): " << sizeof(long) << std::endl;
+  std::cout << "sizeof(ulong): " << sizeof(ulong) << std::endl;
+
   create_trees();
   // Seed the random number generators
   set_seed(seed);
@@ -458,10 +469,10 @@ void OnlineForestClassifier::create_trees() {
 void OnlineForestClassifier::fit(const SArrayDouble2dPtr features,
                                  const SArrayDoublePtr labels) {
   // std::cout << "OnlineForestClassifier::fit" << std::endl;
-  ulong n_samples = features->n_rows();
-  ulong n_features = features->n_cols();
+  uint32_t n_samples = static_cast<uint32_t>(features->n_rows());
+  uint32_t n_features = static_cast<uint32_t>(features->n_cols());
   set_n_features(n_features);
-  for (ulong i = 0; i < n_samples; ++i) {
+  for (uint32_t i = 0; i < n_samples; ++i) {
     for (TreeClassifier &tree : trees) {
       // Fit the tree online using the new data point
       tree.fit(view_row(*features, i), (*labels)[i]);
@@ -476,12 +487,12 @@ void OnlineForestClassifier::predict(const SArrayDouble2dPtr features,
                                      bool use_aggregation) {
   predictions->fill(0.);
   if (_iteration > 0) {
-    ulong n_samples = features->n_rows();
+    uint32_t n_samples = static_cast<uint32_t>(features->n_rows());
     ArrayDouble scores_tree(_n_classes);
     scores_tree.fill(0.);
     ArrayDouble scores_forest(_n_classes);
     scores_forest.fill(0.);
-    for (ulong i = 0; i < n_samples; ++i) {
+    for (uint32_t i = 0; i < n_samples; ++i) {
       // The forest prediction is simply the average of the tree predictions
       ArrayDouble scores_i = view_row(*predictions, i);
       for (TreeClassifier &tree : trees) {
@@ -501,11 +512,15 @@ void OnlineForestClassifier::clear() {
   _iteration = 0;
 }
 
-inline ulong OnlineForestClassifier::sample_feature() {
-  return rand.uniform_int(0L, n_features() - 1);
+inline uint32_t OnlineForestClassifier::sample_feature() {
+  return rand.uniform_int(static_cast<uint32_t>(0), n_features() - 1);
+}
+
+inline uint32_t OnlineForestClassifier::sample_feature_bis() {
+  return rand.discrete(_probabilities);
 }
 
-inline ulong OnlineForestClassifier::sample_feature(const ArrayDouble & prob) {
+inline uint32_t OnlineForestClassifier::sample_feature(const ArrayDouble & prob) {
   return rand.discrete(prob);
 }
 
diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h
index 63fc88a05..706cdcb51 100644
--- a/tick/inference/src/online_forest_classifier.h
+++ b/tick/inference/src/online_forest_classifier.h
@@ -27,7 +27,6 @@ enum class CriterionClassifier {
   log = 0,
 };
 
-
 class TreeClassifier;
 
 /*********************************************************************************
@@ -39,17 +38,17 @@ class NodeClassifier {
   // Tree containing the node
   TreeClassifier &_tree;
   // Index of the parent
-  ulong _parent;
+  uint32_t _parent;
   // Index of the left child
-  ulong _left;
+  uint32_t _left;
   // Index of the right child
-  ulong _right;
+  uint32_t _right;
   // Index of the feature used for the split
-  ulong _feature;
+  uint32_t 
_feature; // Threshold used for the split double _threshold; // Number of samples in the node - ulong _n_samples; + uint32_t _n_samples; // The features of the sample saved in the node // TODO: use a unique_ptr on x_t ArrayDouble _x_t; @@ -65,7 +64,7 @@ class NodeClassifier { ArrayULong _counts; public: - NodeClassifier(TreeClassifier &tree, ulong parent); + NodeClassifier(TreeClassifier &tree, uint32_t parent); NodeClassifier(const NodeClassifier &node); NodeClassifier(const NodeClassifier &&node); NodeClassifier &operator=(const NodeClassifier &) = delete; @@ -88,7 +87,7 @@ class NodeClassifier { // Update the prediction of the label void update_predict(const double y_t); // Predict function (average of the labels of samples that passed through the node) - void predict(ArrayDouble& scores) const; + void predict(ArrayDouble &scores) const; // Loss function used for aggregation double score(uint8_t y) const; @@ -98,29 +97,29 @@ class NodeClassifier { bool is_same(const ArrayDouble &x_t); // Get node at index in the tree - inline NodeClassifier &node(ulong index) const; + inline NodeClassifier &node(uint32_t index) const; // Get number of features - inline ulong n_features() const; + inline uint32_t n_features() const; // Number of classes - inline uint8_t n_classes() const; + inline uint8_t n_classes() const; // Step to use for aggregation inline double step() const; // Print of the node void print(); - inline ulong parent() const; - inline ulong left() const; - inline NodeClassifier &set_left(ulong left); - inline ulong right() const; - inline NodeClassifier &set_right(ulong right); + inline uint32_t parent() const; + inline uint32_t left() const; + inline NodeClassifier &set_left(uint32_t left); + inline uint32_t right() const; + inline NodeClassifier &set_right(uint32_t right); inline bool is_leaf() const; inline NodeClassifier &set_is_leaf(bool is_leaf); - inline ulong feature() const; - inline NodeClassifier &set_feature(ulong feature); + inline uint32_t feature() const; + inline NodeClassifier &set_feature(uint32_t feature); inline double threshold() const; inline NodeClassifier &set_threshold(double threshold); - inline ulong n_samples() const; - inline NodeClassifier &set_n_samples(ulong n_samples); + inline uint32_t n_samples() const; + inline NodeClassifier &set_n_samples(uint32_t n_samples); inline double weight() const; inline NodeClassifier &set_weight(double weight); inline double weight_tree() const; @@ -142,18 +141,18 @@ class TreeClassifier { // The forest of the tree OnlineForestClassifier &forest; // Number of nodes in the tree - ulong _n_nodes = 0; + uint32_t _n_nodes = 0; // Iteration counter - ulong iteration = 0; + uint32_t iteration = 0; // Nodes of the tree std::vector<NodeClassifier> nodes = std::vector<NodeClassifier>(); // Split the node at given index - ulong split_leaf(ulong index, const ArrayDouble &x_t, double y_t); + uint32_t split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t); // Add nodes in the tree - ulong add_node(ulong parent); + uint32_t add_node(uint32_t parent); - ulong go_downwards(const ArrayDouble &x_t, double y_t, bool predict); - void go_upwards(ulong leaf_index); + uint32_t go_downwards(const ArrayDouble &x_t, double y_t, bool predict); + void go_upwards(uint32_t leaf_index); public: TreeClassifier(OnlineForestClassifier &forest); @@ -165,15 +164,15 @@ class TreeClassifier { void fit(const ArrayDouble &x_t, double y_t); void predict(const ArrayDouble &x_t, ArrayDouble &scores); - inline ulong n_features() const; - inline uint8_t n_classes() const; - inline ulong 
n_nodes() const; + inline uint32_t n_features() const; + inline uint8_t n_classes() const; + inline uint32_t n_nodes() const; inline double step() const; void print() { std::cout << "Tree(n_nodes: " << _n_nodes << std::endl; std::cout << " "; - ulong index = 0; + uint32_t index = 0; for (NodeClassifier &node : nodes) { std::cout << "index: " << index << " "; node.print(); @@ -184,7 +183,7 @@ inline CriterionClassifier criterion() const; - NodeClassifier &node(ulong index) { + NodeClassifier &node(uint32_t index) { return nodes[index]; } }; @@ -204,7 +203,7 @@ class OnlineForestClassifier { // Step-size used for aggregation double _step; // Number of features. - ulong _n_features; + uint32_t _n_features; // Number of classes in the classification problem uint8_t _n_classes; // Seed for random number generation @@ -212,11 +211,13 @@ // Verbose things or not bool _verbose; // Iteration counter - ulong _iteration; + uint32_t _iteration; // The list of trees in the forest std::vector<TreeClassifier> trees; // Random number generator for feature and threshold sampling Rand rand; + + ArrayDouble _probabilities; // Create trees void create_trees(); @@ -228,8 +229,11 @@ void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions, bool use_aggregation); - inline ulong sample_feature(); - inline ulong sample_feature(const ArrayDouble & prob); + inline uint32_t sample_feature(); + inline uint32_t sample_feature(const ArrayDouble &prob); + + inline uint32_t sample_feature_bis(); + inline double sample_threshold(double left, double right); void clear(); @@ -244,7 +248,7 @@ } } - inline ulong n_samples() const { + inline uint32_t n_samples() const { if (_iteration > 0) { return _iteration; } else { @@ -252,7 +256,7 @@ } - inline ulong n_features() const { + inline uint32_t n_features() const { if (_iteration > 0) { return _n_features; } else { @@ -264,7 +268,7 @@ return _n_classes; } - OnlineForestClassifier & set_n_classes(uint8_t n_classes) { + OnlineForestClassifier &set_n_classes(uint8_t n_classes) { if (_iteration == 0) { _n_classes = n_classes; } else { @@ -273,7 +277,7 @@ return *this; } - inline OnlineForestClassifier &set_n_features(ulong n_features) { + inline OnlineForestClassifier &set_n_features(uint32_t n_features) { if (_iteration == 0) { _n_features = n_features; } @@ -317,6 +321,9 @@ return *this; } + inline void set_probabilities(const ArrayDouble &probabilities) { + _probabilities = probabilities; + } // inline bool verbose() const; // inline OnlineForestClassifier &set_verbose(bool verbose); diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i index d06c61bb7..9871d5c0a 100644 --- a/tick/inference/swig/online_forest_classifier.i +++ b/tick/inference/swig/online_forest_classifier.i @@ -44,4 +44,6 @@ class OnlineForestClassifier { OnlineForestClassifier &set_seed(int seed); // bool verbose() const; // OnlineForestRegressor &set_verbose(bool verbose); + + void set_probabilities(const ArrayDouble & probabilities); }; diff --git a/tick/random/src/rand.cpp b/tick/random/src/rand.cpp index 91b8ebb54..815a33f4b 100644 --- a/tick/random/src/rand.cpp +++ b/tick/random/src/rand.cpp @@ -51,6 +51,11 @@ ulong Rand::uniform_int(ulong 
a, ulong b) { return uniform_ulong_dist(generator, p); } +uint32_t Rand::uniform_int(uint32_t a, uint32_t b) { + std::uniform_int_distribution<ulong>::param_type p(a, b); + return uniform_uint32_dist(generator, p); +} + double Rand::uniform() { return uniform_dist(generator); } diff --git a/tick/random/src/rand.h b/tick/random/src/rand.h index 79efa5fab..6fbf0db02 100644 --- a/tick/random/src/rand.h +++ b/tick/random/src/rand.h @@ -28,6 +28,7 @@ class DLL_PUBLIC Rand { std::uniform_int_distribution<int> uniform_int_dist; std::uniform_int_distribution<ulong> uniform_ulong_dist; + std::uniform_int_distribution<uint32_t> uniform_uint32_dist; std::uniform_real_distribution<double> uniform_dist; std::normal_distribution<double> normal_dist; std::exponential_distribution<double> expon_dist; @@ -78,6 +79,13 @@ */ ulong uniform_int(ulong a, ulong b); + /** + * @brief Returns a random integer between two numbers (both bounds can be reached) + * \param a : lower bound + * \param b : upper bound + */ + uint32_t uniform_int(uint32_t a, uint32_t b); + /** * @brief Returns a random real between 0 and 1 */ From ad88f9122316eed1be75dca825d10e0b5ded4ad9 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Tue, 21 Nov 2017 10:17:20 +0100 Subject: [PATCH 12/32] ... --- .../src/online_forest_classifier.cpp | 55 ++++++++++++++++--- tick/inference/src/online_forest_classifier.h | 15 ++++- tick/random/src/rand.cpp | 2 +- 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 01ae06928..88e3aa3a6 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -7,9 +7,10 @@ * NodeClassifier methods *********************************************************************************/ -NodeClassifier::NodeClassifier(TreeClassifier &tree, uint32_t parent) +NodeClassifier::NodeClassifier(TreeClassifier &tree, uint32_t parent, uint32_t time) : _tree(tree) { _parent = parent; + _time = time; _left = 0; _right = 0; _n_samples = 0; @@ -24,6 +25,7 @@ NodeClassifier::NodeClassifier(const NodeClassifier &node) : _tree(node._tree), _parent(node._parent), _left(node._left), _right(node._right), _feature(node._feature), _threshold(node._threshold), + _time(node._time), _features_min(node._features_min), _features_max(node._features_max), _n_samples(node._n_samples), _x_t(node._x_t), _y_t(node._y_t), @@ -37,6 +39,9 @@ NodeClassifier::NodeClassifier(const NodeClassifier &&node) : _tree(_tree) { _right = node._right; _feature = node._feature; _threshold = node._threshold; + _time = node._time; + _features_min = node._features_min; + _features_max = node._features_max; _n_samples = node._n_samples; _x_t = node._x_t; _y_t = node._y_t; @@ -161,6 +166,30 @@ inline NodeClassifier &NodeClassifier::set_threshold(double threshold) { return *this; } +inline double NodeClassifier::time() const { + return _time; +} + +inline NodeClassifier &NodeClassifier::set_time(double time) { + _time = time; +} + +inline double NodeClassifier::features_min(const uint32_t j) const { + return _features_min[j]; +} + +inline double NodeClassifier::set_features_min(const ArrayDouble &features_min) { + _features_min = features_min; +} + +inline double NodeClassifier::features_max(const uint32_t j) const { + return _features_max[j]; +} + +inline double NodeClassifier::set_features_max(const ArrayDouble &features_max) { + _features_max = features_max; +} + inline uint32_t NodeClassifier::n_samples() const { return _n_samples; } @@ 
-351,6 +380,16 @@ inline uint32_t TreeClassifier::n_nodes() const { return _n_nodes; } +uint32_t TreeClassifier::n_leaves() const { + uint32_t n_leaves = 0; + for(const NodeClassifier &node: nodes) { + if(node.is_leaf()) { + ++n_leaves; + } + } + return n_leaves; +} + void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { // TODO: Test that the size does not change within successive calls to fit // std::cout << "iteration: " << iteration << std::endl; @@ -442,13 +481,13 @@ OnlineForestClassifier::OnlineForestClassifier(uint32_t n_trees, // No iteration so far _iteration = 0; - std::cout << "sizeof(float): " << sizeof(float) << std::endl; - std::cout << "sizeof(double): " << sizeof(double) << std::endl; - std::cout << "sizeof(uint8_t): " << sizeof(uint8_t) << std::endl; - std::cout << "sizeof(uint16_t): " << sizeof(uint16_t) << std::endl; - std::cout << "sizeof(uint32_t): " << sizeof(uint32_t) << std::endl; - std::cout << "sizeof(long): " << sizeof(long) << std::endl; - std::cout << "sizeof(ulong): " << sizeof(ulong) << std::endl; +// std::cout << "sizeof(float): " << sizeof(float) << std::endl; +// std::cout << "sizeof(double): " << sizeof(double) << std::endl; +// std::cout << "sizeof(uint8_t): " << sizeof(uint8_t) << std::endl; +// std::cout << "sizeof(uint16_t): " << sizeof(uint16_t) << std::endl; +// std::cout << "sizeof(uint32_t): " << sizeof(uint32_t) << std::endl; +// std::cout << "sizeof(long): " << sizeof(long) << std::endl; +// std::cout << "sizeof(ulong): " << sizeof(ulong) << std::endl; create_trees(); // Seed the random number generators diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index 706cdcb51..de9558347 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -47,6 +47,12 @@ class NodeClassifier { uint32_t _feature; // Threshold used for the split double _threshold; + // Time of creation of the node + double _time; + // Range of the features + ArrayDouble _features_min; + ArrayDouble _features_max; + // Number of samples in the node uint32_t _n_samples; // The features of the sample saved in the node @@ -64,7 +70,7 @@ class NodeClassifier { ArrayULong _counts; public: - NodeClassifier(TreeClassifier &tree, uint32_t parent); + NodeClassifier(TreeClassifier &tree, uint32_t parent, uint32_t time = 0); NodeClassifier(const NodeClassifier &node); NodeClassifier(const NodeClassifier &&node); NodeClassifier &operator=(const NodeClassifier &) = delete; @@ -118,6 +124,12 @@ class NodeClassifier { inline NodeClassifier &set_feature(uint32_t feature); inline double threshold() const; inline NodeClassifier &set_threshold(double threshold); + inline double time() const; + inline NodeClassifier &set_time(double time); + inline double features_min(const uint32_t j) const; + inline double set_features_min(const ArrayDouble &features_min); + inline double features_max(const uint32_t j) const; + inline double set_features_max(const ArrayDouble &features_max); inline uint32_t n_samples() const; inline NodeClassifier &set_n_samples(uint32_t n_samples); inline double weight() const; @@ -167,6 +179,7 @@ class TreeClassifier { inline uint32_t n_features() const; inline uint8_t n_classes() const; inline uint32_t n_nodes() const; + uint32_t n_leaves() const; inline double step() const; void print() { diff --git a/tick/random/src/rand.cpp b/tick/random/src/rand.cpp index 815a33f4b..9dbde0c6e 100644 --- a/tick/random/src/rand.cpp +++ b/tick/random/src/rand.cpp @@ -52,7 
+52,7 @@ ulong Rand::uniform_int(ulong a, ulong b) { } uint32_t Rand::uniform_int(uint32_t a, uint32_t b) { - std::uniform_int_distribution<ulong>::param_type p(a, b); + std::uniform_int_distribution<uint32_t>::param_type p(a, b); return uniform_uint32_dist(generator, p); } From 53857e5c7b22aeaa9b21661e569b23438bf25e21 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Tue, 21 Nov 2017 14:49:44 +0100 Subject: [PATCH 13/32] major change: let's not only split leaves --- .../src/online_forest_classifier.cpp | 36 +++++++++++++++++++ tick/inference/src/online_forest_classifier.h | 4 +++ 2 files changed, 40 insertions(+) diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 88e3aa3a6..808e4786b 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -84,6 +84,23 @@ void NodeClassifier::update_predict(const double y_t) { _counts[static_cast<uint8_t>(y_t)]++; } +void NodeClassifier::update_range(const ArrayDouble &x_t) { + if (_n_samples == 0) { + _features_min = x_t; + _features_max = x_t; + } else { + for(uint32_t j = 0; j < n_features(); ++j) { + double x_tj = x_t[j]; + if (x_tj < _features_min[j]) { + _features_min[j] = x_tj; + } + if (x_tj > _features_max[j]) { + _features_max[j] = x_tj; + } + } + } +} + double NodeClassifier::score(uint8_t c) const { // Using a Dirichlet(1/2, ..., 1/2) prior return static_cast<double>(2 * _counts[c] + 1) / (2 * _n_samples + n_classes()); @@ -245,6 +262,8 @@ void NodeClassifier::print() { << ", thresh: " << _threshold << ", scores: [" << std::setprecision(2) << score(0) << ", " << std::setprecision(2) << score(1) << "]" << ", counts: [" << std::setprecision(2) << _counts[0] << ", " << std::setprecision(2) << _counts[1] << "]" + << ", min: [" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) << _features_min[1] << "]" + << ", max: [" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) << _features_max[1] << "]" << ", weight: " << _weight << ", weight_tree: " << _weight_tree << ")\n"; @@ -333,6 +352,15 @@ uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, doub return data_leaf; } +void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { + + NodeClassifier &current_node = node(node_index); + if(current_node.n_samples() == 0) { + + } +} + + uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { // Find the leaf that contains the sample // Start at the root. Index of the root is always 0 @@ -345,6 +373,10 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p // Get the current node NodeClassifier &current_node = node(index_current_node); if (!predict) { + + // Does the point extend the node ? + extend_range(index_current_node, x_t, y_t); + current_node.update_downwards(x_t, y_t); } // Is the node a leaf ? 
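The per-node statistics maintained by update_range and score above are easier to read outside the diff. The following is a minimal NumPy sketch of the same bookkeeping; the class and its names are illustrative stand-ins, not tick's API, and the range is seeded here with +/- infinity rather than with the first sample:

import numpy as np

class NodeSketch:
    """Illustrative model of the per-node statistics of NodeClassifier."""

    def __init__(self, n_features, n_classes=2):
        self.features_min = np.full(n_features, np.inf)
        self.features_max = np.full(n_features, -np.inf)
        self.counts = np.zeros(n_classes, dtype=np.int64)
        self.n_samples = 0

    def update_range(self, x):
        # Grow the node's bounding box so that it contains x
        self.features_min = np.minimum(self.features_min, x)
        self.features_max = np.maximum(self.features_max, x)

    def update_predict(self, y):
        self.counts[int(y)] += 1
        self.n_samples += 1

    def score(self, c):
        # Posterior predictive under a Dirichlet(1/2, ..., 1/2) prior:
        # (counts[c] + 1/2) / (n_samples + n_classes / 2), which is exactly
        # (2 * counts[c] + 1) / (2 * n_samples + n_classes) as in the C++ code
        n_classes = self.counts.size
        return (2 * self.counts[c] + 1) / (2 * self.n_samples + n_classes)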
@@ -361,6 +393,9 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p return index_current_node; } + + + void TreeClassifier::go_upwards(uint32_t leaf_index) { // std::cout << "Going upwards" << std::endl; uint32_t current = leaf_index; @@ -399,6 +434,7 @@ void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { iteration++; return; } + uint32_t leaf = go_downwards(x_t, y_t, false); NodeClassifier& leaf_node = node(leaf); diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index de9558347..a0b240074 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -92,6 +92,8 @@ class NodeClassifier { void update_upwards(); // Update the prediction of the label void update_predict(const double y_t); + // Update range of the seen features + void update_range(const ArrayDouble &x_t); // Predict function (average of the labels of samples that passed through the node) void predict(ArrayDouble &scores) const; // Loss function used for aggregation @@ -163,6 +165,8 @@ class TreeClassifier { // Add nodes in the tree uint32_t add_node(uint32_t parent); + void extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t); + uint32_t go_downwards(const ArrayDouble &x_t, double y_t, bool predict); void go_upwards(uint32_t leaf_index); From 962b7240271a15097ac1d0d14c554ba261cf2faa Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 22 Nov 2017 09:40:35 +0100 Subject: [PATCH 14/32] Splitting node first attempt --- online_forest.py | 6 +- .../src/online_forest_classifier.cpp | 188 ++++++++++++++---- tick/inference/src/online_forest_classifier.h | 14 +- 3 files changed, 165 insertions(+), 43 deletions(-) diff --git a/online_forest.py b/online_forest.py index bc183906c..712024a46 100644 --- a/online_forest.py +++ b/online_forest.py @@ -133,7 +133,7 @@ def plot_decision_classification(classifiers, datasets, names): path = '/Users/stephane.gaiffas/Downloads/' -n_trees = 10 +n_trees = 1 X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_informative=2, random_state=1, @@ -162,6 +162,6 @@ def plot_decision_classification(classifiers, datasets, names): plot_decision_classification(classifiers, datasets, names) -plt.savefig('decisions.pdf') +# plt.savefig('decisions.pdf') -# plt.show() +plt.show() diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 808e4786b..73cd6d089 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -7,7 +7,7 @@ * NodeClassifier methods *********************************************************************************/ -NodeClassifier::NodeClassifier(TreeClassifier &tree, uint32_t parent, uint32_t time) +NodeClassifier::NodeClassifier(TreeClassifier &tree, uint32_t parent, double time) : _tree(tree) { _parent = parent; _time = time; @@ -51,6 +51,25 @@ NodeClassifier::NodeClassifier(const NodeClassifier &&node) : _tree(_tree) { _counts = node._counts; } +NodeClassifier &NodeClassifier::operator=(const NodeClassifier &node) { + _parent = node._parent; + _left = node._left; + _right = node._right; + _feature = node._feature; + _threshold = node._threshold; + _time = node._time; + _features_min = node._features_min; + _features_max = node._features_max; + _n_samples = node._n_samples; + _x_t = node._x_t; + _y_t = node._y_t; + _weight = node._weight; + _weight_tree = 
node._weight_tree; + _is_leaf = node._is_leaf; + _counts = node._counts; + return *this; +} + void NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) { _n_samples++; _weight -= step() * loss(y_t); @@ -138,6 +157,11 @@ inline uint32_t NodeClassifier::parent() const { return _parent; } +inline NodeClassifier& NodeClassifier::set_parent(uint32_t parent) { + _parent = parent; + return *this; +} + inline uint32_t NodeClassifier::left() const { return _left; } @@ -189,22 +213,25 @@ inline double NodeClassifier::time() const { inline NodeClassifier &NodeClassifier::set_time(double time) { _time = time; + return *this; } inline double NodeClassifier::features_min(const uint32_t j) const { return _features_min[j]; } -inline double NodeClassifier::set_features_min(const ArrayDouble &features_min) { +inline NodeClassifier & NodeClassifier::set_features_min(const ArrayDouble &features_min) { _features_min = features_min; + return *this; } inline double NodeClassifier::features_max(const uint32_t j) const { return _features_max[j]; } -inline double NodeClassifier::set_features_max(const ArrayDouble &features_max) { +inline NodeClassifier & NodeClassifier::set_features_max(const ArrayDouble &features_max) { _features_max = features_max; + return *this; } inline uint32_t NodeClassifier::n_samples() const { @@ -303,7 +330,6 @@ uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, doub // TODO: better feature sampling // ulong feature = forest.sample_feature_bis(); - // ulong feature = forest.sample_feature(); uint32_t feature = forest.sample_feature(diff); @@ -353,11 +379,97 @@ uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, doub } void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { - + std::cout << "Extending the range" << index << std::endl; NodeClassifier &current_node = node(node_index); if(current_node.n_samples() == 0) { - + // The node is a leaf with no sample point, so it does not have a range + // In this case we just initialize the range with the given feature + // This node will then be updated by the call to update_downwards in go_downwards + current_node.set_features_min(x_t); + current_node.set_features_max(x_t); + } else { + ArrayDouble extension(n_features()); + double extensions_sum = 0; + for(uint32_t j =0; j < n_features(); ++j) { + double x_tj = x_t[j]; + double feature_min_j = current_node.features_min(j); + double feature_max_j = current_node.features_max(j); + if(x_tj < feature_min_j) { + extension[j] = feature_min_j - x_tj; + extensions_sum += feature_min_j - x_tj; + } else { + if (x_tj > feature_max_j) { + extension[j] = x_tj - feature_max_j; + extensions_sum += x_tj - feature_max_j; + } else { + extension[j] = 0; + } + } + } + // If the sample x_t extends the current range of the node + if(extensions_sum > 0) { + bool do_split; + double time = current_node.time(); + double T = forest.sample_exponential(extensions_sum); + // Let us determine if we need to split the node or not + if (current_node.is_leaf()) { + do_split = true; + } else { + // Same as node(current_node.right()).time(); + double child_time = node(current_node.left()).time(); + // Sample a exponential random variable with intensity + if (time + T < child_time) { + do_split = true; + } else { + do_split = false; + } + } + if (do_split) { + // Sample the splitting feature with a probability proportional to the range extensions + ArrayDouble probabilities = extension; + probabilities /= extensions_sum; + 
uint32_t feature = forest.sample_feature(probabilities); + double threshold; + // Is the extension on the right side ? + bool is_right_extension = x_t[feature] > current_node.features_max(feature); + + // Create new nodes + uint32_t left_new = add_node(node_index, time + T); + uint32_t right_new = add_node(node_index, time + T); + if(is_right_extension) { + threshold = forest.sample_threshold(current_node.features_max(feature), x_t[feature]); + // left_new is the same as node_index, except for the parent, time and the fact that it's not a leaf + node(left_new) = node(node_index); + // so we must set back the correct parent and time + // TODO: set_is_leaf useless for left_new since it's a copy of node_index + node(left_new).set_parent(node_index).set_time(time + T); + // right_new must have node_index as its parent + node(right_new).set_parent(node_index).set_time(time + T); + // We must tell the old children that they have a new parent, if the current node is not a leaf + if(!node(node_index).is_leaf()) { + node(node(node_index).left()).set_parent(left_new); + node(node(node_index).right()).set_parent(left_new); + } + // TODO: should we return right_new in this case ? + } else { + threshold = forest.sample_threshold(x_t[feature], current_node.features_min(feature)); + node(right_new) = node(node_index); + node(right_new).set_parent(node_index).set_time(time + T); + node(left_new).set_parent(node_index).set_time(time + T); + if(!node(node_index).is_leaf()) { + node(node(node_index).left()).set_parent(right_new); + node(node(node_index).right()).set_parent(right_new); + } + } + // We update the splitting feature, threshold, and children of the current index + node(node_index).set_feature(feature).set_threshold(threshold).set_left(left_new) + .set_right(right_new).set_is_leaf(false); + } + // Update the range of the node here + node(node_index).update_range(x_t); + } } + std::cout << "Done extending the range." << index << std::endl; } uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p // Start at the root. Index of the root is always 0 // If predict == true, this call to find_leaf is for // prediction only, so that no leaf update and splits can be done - // std::cout << "Going downwards" << std::endl; + std::cout << "Going downwards" << std::endl; uint32_t index_current_node = 0; bool is_leaf = false; while (!is_leaf) { // Get the current node - NodeClassifier &current_node = node(index_current_node); + // NodeClassifier &current_node = node(index_current_node); if (!predict) { - - // Does the point extend the node ? + // Extend the range and possibly split the current node extend_range(index_current_node, x_t, y_t); - - current_node.update_downwards(x_t, y_t); + // Update the current node + node(index_current_node).update_downwards(x_t, y_t); } // Is the node a leaf ? 
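For reference, go_downwards keeps the classic decision-tree descent after this change; while fitting, each visited node is simply given a chance to extend its range and split before being updated. A rough Python rendering with dict-based nodes (an illustrative layout, not tick's):

def go_downwards(nodes, x, on_visit=None):
    # nodes: list of dicts with keys 'is_leaf', 'feature', 'threshold',
    # 'left', 'right'; the root always sits at index 0
    index = 0
    while True:
        if on_visit is not None:
            # during fit: extend_range(index, x, y) then update_downwards
            on_visit(index)
        node = nodes[index]
        if node['is_leaf']:
            return index
        if x[node['feature']] <= node['threshold']:
            index = node['left']
        else:
            index = node['right']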
+ NodeClassifier &current_node = node(index_current_node); is_leaf = current_node.is_leaf(); if (!is_leaf) { if (x_t[current_node.feature()] <= current_node.threshold()) { @@ -389,7 +501,7 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p } } - // std::cout << "Done going downwards" << std::endl; + std::cout << "Done going downwards" << std::endl; return index_current_node; } @@ -397,13 +509,13 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p void TreeClassifier::go_upwards(uint32_t leaf_index) { - // std::cout << "Going upwards" << std::endl; + std::cout << "Going upwards" << std::endl; uint32_t current = leaf_index; while (true) { NodeClassifier &current_node = node(current); current_node.update_upwards(); if (current == 0) { - // std::cout << "Done going upwards" << std::endl; + std::cout << "Done going upwards" << std::endl; break; } // We must update the root node @@ -427,26 +539,30 @@ uint32_t TreeClassifier::n_leaves() const { void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { // TODO: Test that the size does not change within successive calls to fit - // std::cout << "iteration: " << iteration << std::endl; - // print(); - if (iteration == 0) { - nodes[0].set_x_t(x_t).set_y_t(y_t); - iteration++; - return; - } + std::cout << "iteration: " << iteration << std::endl; + std::cout << "x_t: [" << std::setprecision(2) << x_t[0] << ", " << std::setprecision(2) << x_t[1] << "]" << std::endl; + print(); + + // TODO: what about these lines ??? +// if (iteration == 0) { +// nodes[0].set_x_t(x_t).set_y_t(y_t); +// iteration++; +// return; +// } uint32_t leaf = go_downwards(x_t, y_t, false); - NodeClassifier& leaf_node = node(leaf); - uint32_t new_leaf; + // NodeClassifier& leaf_node = node(leaf); + // uint32_t new_leaf; -bool is_same = leaf_node.is_same(x_t); -// std::cout << "is_same: " << is_same << std::endl; -if (is_same) { -new_leaf = leaf; -} else { -new_leaf = split_leaf(leaf, x_t, y_t); -} +// bool is_same = leaf_node.is_same(x_t); +// // std::cout << "is_same: " << is_same << std::endl; +// if (is_same) { +// new_leaf = leaf; +// } else { +// new_leaf = split_leaf(leaf, x_t, y_t); +// } go_upwards(leaf); iteration++; } @@ -478,9 +594,9 @@ void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores) { } } -uint32_t TreeClassifier::add_node(uint32_t parent) { +uint32_t TreeClassifier::add_node(uint32_t parent, double time) { // std::cout << "Adding node with parent " << parent << std::endl; - nodes.emplace_back(*this, parent); + nodes.emplace_back(*this, parent, time); // std::cout << "Done." 
<< std::endl; return _n_nodes++; } @@ -595,6 +711,10 @@ inline uint32_t OnlineForestClassifier::sample_feature_bis() { return rand.discrete(_probabilities); } +inline double OnlineForestClassifier::sample_exponential(double intensity) { + return rand.exponential(intensity); +} + inline uint32_t OnlineForestClassifier::sample_feature(const ArrayDouble & prob) { return rand.discrete(prob); } diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index a0b240074..062090c11 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -52,7 +52,6 @@ class NodeClassifier { // Range of the features ArrayDouble _features_min; ArrayDouble _features_max; - // Number of samples in the node uint32_t _n_samples; // The features of the sample saved in the node @@ -70,10 +69,10 @@ class NodeClassifier { ArrayULong _counts; public: - NodeClassifier(TreeClassifier &tree, uint32_t parent, uint32_t time = 0); + NodeClassifier(TreeClassifier &tree, uint32_t parent, double time = 0); NodeClassifier(const NodeClassifier &node); NodeClassifier(const NodeClassifier &&node); - NodeClassifier &operator=(const NodeClassifier &) = delete; + NodeClassifier &operator=(const NodeClassifier &); NodeClassifier &operator=(const NodeClassifier &&) = delete; // Computation of log( (e^a + e^b) / 2) in an overproof way @@ -116,6 +115,7 @@ class NodeClassifier { void print(); inline uint32_t parent() const; + inline NodeClassifier &set_parent(uint32_t parent); inline uint32_t left() const; inline NodeClassifier &set_left(uint32_t left); inline uint32_t right() const; @@ -129,9 +129,9 @@ class NodeClassifier { inline double time() const; inline NodeClassifier &set_time(double time); inline double features_min(const uint32_t j) const; - inline double set_features_min(const ArrayDouble &features_min); + inline NodeClassifier & set_features_min(const ArrayDouble &features_min); inline double features_max(const uint32_t j) const; - inline double set_features_max(const ArrayDouble &features_max); + inline NodeClassifier & set_features_max(const ArrayDouble &features_max); inline uint32_t n_samples() const; inline NodeClassifier &set_n_samples(uint32_t n_samples); inline double weight() const; @@ -163,7 +163,7 @@ class TreeClassifier { // Split the node at given index uint32_t split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t); // Add nodes in the tree - uint32_t add_node(uint32_t parent); + uint32_t add_node(uint32_t parent, double time = 0); void extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t); @@ -251,6 +251,8 @@ class OnlineForestClassifier { inline uint32_t sample_feature_bis(); + inline double sample_exponential(double intensity); + inline double sample_threshold(double left, double right); void clear(); From 75f17ffec8b55fdcb3039e8f56b70fca0097abed Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 22 Nov 2017 10:51:15 +0100 Subject: [PATCH 15/32] Working, but bad performance --- online_forest.py | 11 ++- online_forest_data.py | 11 ++- .../src/online_forest_classifier.cpp | 78 ++++++++++--------- tick/inference/src/online_forest_classifier.h | 2 +- 4 files changed, 55 insertions(+), 47 deletions(-) diff --git a/online_forest.py b/online_forest.py index 712024a46..395237368 100644 --- a/online_forest.py +++ b/online_forest.py @@ -11,7 +11,8 @@ from time import time -n_samples = 1000 + +n_samples = 500 n_features = 2 seed = 123 @@ -133,13 +134,19 @@ def 
plot_decision_classification(classifiers, datasets, names): path = '/Users/stephane.gaiffas/Downloads/' -n_trees = 1 +n_trees = 20 X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) + + +# clf = OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.) +# clf.fit(X, y) +# clf.print() + linearly_separable = (X, y) datasets = [ diff --git a/online_forest_data.py b/online_forest_data.py index 6ef2d1230..6b43a85de 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -20,8 +20,7 @@ n_classess = [3, 25, 5, 9] -n_trees = 100 - +n_trees = 10 names = [ "Online forest", @@ -38,9 +37,9 @@ y_train = data['y_train'] y_test = data['y_test'] - triche = RandomForestClassifier(n_estimators=n_trees) - triche.fit(X_train, y_train) - probabilities = triche.feature_importances_ / triche.feature_importances_.sum() + # triche = RandomForestClassifier(n_estimators=n_trees) + # triche.fit(X_train, y_train) + # probabilities = triche.feature_importances_ / triche.feature_importances_.sum() # # plt.stem(probabilities) # plt.title('Features importance for ' + filename, fontsize=18) @@ -51,7 +50,7 @@ online_forest = OnlineForestClassifier(n_trees=n_trees, n_classes=n_classes, seed=123, step=1.) - online_forest.set_probabilities(probabilities) + # online_forest.set_probabilities(probabilities) classifiers = [ online_forest, ExtraTreesClassifier(n_estimators=n_trees), diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 73cd6d089..93bc3222b 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -283,15 +283,19 @@ void NodeClassifier::print() { std::cout << "Node(parent: " << _parent << ", left: " << _left << ", right: " << _right + << ", time: " << std::setprecision(2) << _time << ", n_samples: " << _n_samples << ", is_leaf: " << _is_leaf << ", feature: " << _feature << ", thresh: " << _threshold << ", scores: [" << std::setprecision(2) << score(0) << ", " << std::setprecision(2) << score(1) << "]" - << ", counts: [" << std::setprecision(2) << _counts[0] << ", " << std::setprecision(2) << _counts[1] << "]" - << ", min: [" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) << _features_min[1] << "]" - << ", max: [" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) << _features_max[1] << "]" - << ", weight: " << _weight + << ", counts: [" << std::setprecision(2) << _counts[0] << ", " << std::setprecision(2) << _counts[1] << "]"; if (_n_samples > 0) { std::cout << ", min: [" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) << _features_min[1] << "]" << ", max: [" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) << _features_max[1] << "]"; + } + std::cout << ", weight: " << _weight << ", weight_tree: " << _weight_tree << ")\n"; } @@ -379,7 +383,7 @@ uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, doub } void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { - std::cout << "Extending the range" << index << std::endl; + // std::cout << "Extending the range of: " << index << std::endl; NodeClassifier &current_node = node(node_index); if(current_node.n_samples() == 0) { // The node is a leaf with no sample point, so it does not have a range @@ -388,6 +392,7 @@ void 
TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c current_node.set_features_min(x_t); current_node.set_features_max(x_t); } else { + // std::cout << "Computing extension" << std::endl; ArrayDouble extension(n_features()); double extensions_sum = 0; for(uint32_t j =0; j < n_features(); ++j) { @@ -406,53 +411,71 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c } } } +// std::cout << "extension: [" << extension[0] << ", " << std::setprecision(2) << extension[1] << "]" << std::endl; +// std::cout << "extension_sum: " << std::setprecision(2) << extensions_sum << std::endl; +// std::cout << "... Done computing extension." << std::endl; + // If the sample x_t extends the current range of the node if(extensions_sum > 0) { + // std::cout << "Extension non-zero, considering the possibility of a split" << std::endl; bool do_split; double time = current_node.time(); double T = forest.sample_exponential(extensions_sum); + // std::cout << "time: " << std::setprecision(2) << time << ", T: " << std::setprecision(2) << T << std::endl; // Let us determine if we need to split the node or not if (current_node.is_leaf()) { + // std::cout << "I'll split the node since it's a leaf" << std::endl; do_split = true; } else { // Same as node(current_node.right()).time(); double child_time = node(current_node.left()).time(); // Sample a exponential random variable with intensity if (time + T < child_time) { + // std::cout << " I'll split since time + T < child_time with child_time: " << child_time << std::endl; do_split = true; } else { + // std::cout << "I won't split since time + T >= child_time with child_time: " << child_time << std::endl; do_split = false; } } if (do_split) { + // std::cout << "Starting the splitting of node: " << node_index << std::endl; // Sample the splitting feature with a probability proportional to the range extensions ArrayDouble probabilities = extension; probabilities /= extensions_sum; + // std::cout << "using the probabilities: [" << std::setprecision(2) << probabilities[0] << ", " << std::setprecision(2) << probabilities[1] << "]" << std::endl; uint32_t feature = forest.sample_feature(probabilities); + // std::cout << "sampled feature: " << feature << std::endl; double threshold; // Is the extension on the right side ? 
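Stripped of the commented-out tracing, the rule above is the Mondrian-style split decision: the amount by which x_t extends the node's bounding box acts as the intensity of an exponential clock, and the node splits only if that clock rings before the creation time of its children. A NumPy sketch under illustrative names, where child_time stands for node(current_node.left()).time() and is taken infinite at a leaf:

import numpy as np

rng = np.random.default_rng(123)

def try_split(x, features_min, features_max, node_time, child_time=np.inf):
    # Per-feature extension of the node's range by the new point x
    extension = np.maximum(features_min - x, 0.) + np.maximum(x - features_max, 0.)
    extensions_sum = extension.sum()
    if extensions_sum == 0.:
        return None  # x falls inside the current range: no split
    # Exponential waiting time with intensity extensions_sum
    T = rng.exponential(1. / extensions_sum)
    if node_time + T >= child_time:
        return None  # an existing, earlier split wins: just extend the range
    # Feature chosen proportionally to its extension, threshold uniform
    # in the extended part of the range
    feature = rng.choice(extension.size, p=extension / extensions_sum)
    if x[feature] > features_max[feature]:
        threshold = rng.uniform(features_max[feature], x[feature])
    else:
        threshold = rng.uniform(x[feature], features_min[feature])
    return feature, threshold, node_time + T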
bool is_right_extension = x_t[feature] > current_node.features_max(feature); - // Create new nodes uint32_t left_new = add_node(node_index, time + T); uint32_t right_new = add_node(node_index, time + T); if(is_right_extension) { - threshold = forest.sample_threshold(current_node.features_max(feature), x_t[feature]); + // std::cout << "extension is on the right" << std::endl; + threshold = forest.sample_threshold(node(node_index).features_max(feature), x_t[feature]); + // std::cout << "sampled the threshold inside the extension: " << threshold << std::endl; // left_new is the same as node_index, except for the parent, time and the fact that it's not a leaf + // std::cout << "Let's copy the current node in the left child" << threshold << std::endl; node(left_new) = node(node_index); // so we must set back the correct parent and time // TODO: set_is_leaf useless for left_new since it's a copy of node_index + // std::cout << "Let's update the left child" << std::endl; node(left_new).set_parent(node_index).set_time(time + T); // right_new must have node_index as its parent + // std::cout << "Let's update the right child" << std::endl; node(right_new).set_parent(node_index).set_time(time + T); // We must tell the old children that they have a new parent, if the current node is not a leaf if(!node(node_index).is_leaf()) { + // std::cout << "The current node is not a leaf, so let's not forget to update the old children" << std::endl; node(node(node_index).left()).set_parent(left_new); node(node(node_index).right()).set_parent(left_new); } // TODO: should we return right_new in this case ? } else { - threshold = forest.sample_threshold(x_t[feature], current_node.features_min(feature)); + // std::cout << "extension is on the left" << std::endl; + threshold = forest.sample_threshold(x_t[feature], node(node_index).features_min(feature)); node(right_new) = node(node_index); node(right_new).set_parent(node_index).set_time(time + T); node(left_new).set_parent(node_index).set_time(time + T); @@ -469,7 +492,7 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c node(node_index).update_range(x_t); } } - std::cout << "Done extending the range." << index << std::endl; + // std::cout << "...Done extending the range." << std::endl; } @@ -478,7 +501,7 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p // Start at the root. Index of the root is always 0 // If predict == true, this call to find_leaf is for // prediction only, so that no leaf update and splits can be done - std::cout << "Going downwards" << std::endl; + // std::cout << "Going downwards" << std::endl; uint32_t index_current_node = 0; bool is_leaf = false; while (!is_leaf) { @@ -501,21 +524,18 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p } } - std::cout << "Done going downwards" << std::endl; + // std::cout << "...Done going downwards." << std::endl; return index_current_node; } void TreeClassifier::go_upwards(uint32_t leaf_index) { - std::cout << "Going upwards" << std::endl; + // std::cout << "Going upwards" << std::endl; uint32_t current = leaf_index; while (true) { NodeClassifier &current_node = node(current); current_node.update_upwards(); if (current == 0) { - std::cout << "Done going upwards" << std::endl; + // std::cout << "...Done going upwards." 
<< std::endl; break; } // We must update the root node @@ -539,29 +559,11 @@ uint32_t TreeClassifier::n_leaves() const { void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { // TODO: Test that the size does not change within successive calls to fit - std::cout << "iteration: " << iteration << std::endl; - std::cout << "x_t: [" << std::setprecision(2) << x_t[0] << ", " << std::setprecision(2) << x_t[1] << "]" << std::endl; - print(); - - // TODO: what about these lines ??? -// if (iteration == 0) { -// nodes[0].set_x_t(x_t).set_y_t(y_t); -// iteration++; -// return; -// } - + // std::cout << "------------------------------------------" << std::endl; + // std::cout << "iteration: " << iteration << std::endl; + // std::cout << "x_t: [" << std::setprecision(2) << x_t[0] << ", " << std::setprecision(2) << x_t[1] << "]" << std::endl; + // print(); uint32_t leaf = go_downwards(x_t, y_t, false); - - // NodeClassifier& leaf_node = node(leaf); - // uint32_t new_leaf; - -// bool is_same = leaf_node.is_same(x_t); -// // std::cout << "is_same: " << is_same << std::endl; -// if (is_same) { -// new_leaf = leaf; -// } else { -// new_leaf = split_leaf(leaf, x_t, y_t); -// } go_upwards(leaf); iteration++; } diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index 062090c11..bba25c75c 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -195,7 +195,7 @@ class TreeClassifier { node.print(); index++; } - std::cout << ")"; + std::cout << ")" << std::endl; } inline CriterionClassifier criterion() const; From d61db2f44604f378a2c681509c56f98513ebd8d1 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 22 Nov 2017 14:20:36 +0100 Subject: [PATCH 16/32] weird segfault --- online_forest.py | 29 +++- online_forest_data.py | 22 ++- tick/inference/online_forest_classifier.py | 8 +- .../src/online_forest_classifier.cpp | 148 +++++++++--------- tick/inference/src/online_forest_classifier.h | 12 +- .../inference/swig/online_forest_classifier.i | 9 +- video.py | 7 +- 7 files changed, 136 insertions(+), 99 deletions(-) diff --git a/online_forest.py b/online_forest.py index 395237368..89ea69f5f 100644 --- a/online_forest.py +++ b/online_forest.py @@ -77,8 +77,10 @@ def plot_decisions_regression(clfs, datasets, names): def plot_decision_classification(classifiers, datasets, names): + n_classifiers = len(classifiers) + n_datasets = len(datasets) h = .02 - fig = plt.figure(figsize=(2 * (len(classifiers) + 1), 2 * len(datasets))) + fig = plt.figure(figsize=(2 * (n_classifiers + 1), 2 * n_datasets)) i = 1 # iterate over datasets for ds_cnt, ds in enumerate(datasets): @@ -93,7 +95,7 @@ def plot_decision_classification(classifiers, datasets, names): # just plot the dataset first cm = plt.cm.RdBu cm_bright = ListedColormap(['#FF0000', '#0000FF']) - ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + ax = plt.subplot(n_datasets, n_classifiers + 1, i) if ds_cnt == 0: ax.set_title("Input data") # Plot the training points @@ -108,7 +110,7 @@ def plot_decision_classification(classifiers, datasets, names): i += 1 # iterate over classifiers for name, clf in zip(names, classifiers): - ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + ax = plt.subplot(n_datasets, n_classifiers + 1, i) if hasattr(clf, 'clear'): clf.clear() clf.fit(X_train, y_train) @@ -134,7 +136,7 @@ def plot_decision_classification(classifiers, datasets, names): path = '/Users/stephane.gaiffas/Downloads/' -n_trees = 20 +n_trees 
= 10 X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_informative=2, random_state=1, @@ -149,22 +151,33 @@ def plot_decision_classification(classifiers, datasets, names): linearly_separable = (X, y) + datasets = [ make_moons(n_samples=n_samples, noise=0.3, random_state=0), make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), linearly_separable ] + +from sklearn.neighbors import KNeighborsClassifier + + classifiers = [ - OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=True), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., use_aggregation=True), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=False), + KNeighborsClassifier(n_neighbors=5), ExtraTreesClassifier(n_estimators=n_trees), RandomForestClassifier(n_estimators=n_trees) ] names = [ - "Online forest", - "Extra trees", - "Breiman RF" + "OF (agg, step=1.)", + "OF(agg, step=100.)", + "OF(no agg.)", + "KNN (k=5)", + "ET", + "BRF" ] plot_decision_classification(classifiers, datasets, names) diff --git a/online_forest_data.py b/online_forest_data.py index 6b43a85de..93258e385 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -6,6 +6,7 @@ from tick.inference import OnlineForestRegressor, OnlineForestClassifier from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, \ RandomForestClassifier, ExtraTreesClassifier +from sklearn.neighbors import KNeighborsClassifier import matplotlib.pyplot as plt @@ -23,9 +24,12 @@ n_trees = 10 names = [ - "Online forest", - "Extra trees", - "Breiman RF" + # "OF (agg, step=1.)", + # "OF(agg, step=100.)", + "OF(no agg.)", + "KNN (k=5)", + "ET", + "BRF" ] for filename, n_classes in zip(filenames, n_classess): @@ -48,15 +52,19 @@ # # plt.show() # plt.savefig(filename + '.pdf') - online_forest = OnlineForestClassifier(n_trees=n_trees, n_classes=n_classes, - seed=123, step=1.) 
# online_forest.set_probabilities(probabilities) classifiers = [ - online_forest, + # OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., + # use_aggregation=True), + # OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., + # use_aggregation=True), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., + use_aggregation=False), + KNeighborsClassifier(n_neighbors=5), ExtraTreesClassifier(n_estimators=n_trees), RandomForestClassifier(n_estimators=n_trees) ] for clf, name in zip(classifiers, names): clf.fit(X_train, y_train) - print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) + # print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index 21c8695cd..523ad86aa 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -86,7 +86,7 @@ class OnlineForestClassifier(ABC, Base): @actual_kwargs def __init__(self, n_trees: int = 10, n_classes: int=2, step: float = 1., - criterion: str = 'log', + criterion: str = 'log', use_aggregation: bool = True, max_depth: int = -1, min_samples_split: int = 50, n_threads: int = 1, seed: int = -1, verbose: bool = True, warm_start: bool = True, n_splits: int = 10): @@ -105,10 +105,12 @@ def __init__(self, n_trees: int = 10, n_classes: int=2, step: float = 1., self.verbose = verbose self.warm_start = warm_start self.n_splits = n_splits + self.use_aggregation = use_aggregation self._forest = _OnlineForestClassifier(n_trees, n_classes, step, self._criterion, + self.use_aggregation, #max_depth, # min_samples_split, n_threads, @@ -134,7 +136,7 @@ def apply(self, X): """ raise NotImplementedError() - def predict_proba(self, X, use_aggregation: bool=True): + def predict_proba(self, X): """Predict class for given samples Parameters @@ -153,7 +155,7 @@ def predict_proba(self, X, use_aggregation: bool=True): raise ValueError("You must call ``fit`` before") else: X = safe_array(X) - self._forest.predict(X, scores, True) + self._forest.predict(X, scores) return scores def predict(self, X): diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 93bc3222b..562035b2b 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -315,72 +315,72 @@ TreeClassifier::TreeClassifier(OnlineForestClassifier &forest) : forest(forest) add_node(0); } -uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t) { - // std::cout << "Splitting node " << index << std::endl; - uint32_t left = add_node(index); - uint32_t right = add_node(index); - node(index).set_left(left).set_right(right).set_is_leaf(false); - - // std::cout << "n_features(): " << n_features() << std::endl; - ArrayDouble diff(n_features()); - for(uint32_t j = 0; j < n_features(); ++j) { - // std::cout << "j: " << j; - diff[j] = std::abs(node(index).x_t()[j] - x_t[j]); - } - // std::cout << std::endl; - diff /= diff.sum(); - // diff.print(); - // std::cout << "diff.sum=" << diff.sum() << std::endl; - - // TODO: better feature sampling - // ulong feature = forest.sample_feature_bis(); - // ulong feature = forest.sample_feature(); - - uint32_t feature = forest.sample_feature(diff); - - // std::cout << "feature: " << feature << std::endl; - - double x1_tj = x_t[feature]; - double x2_tj = node(index).x_t()[feature]; - double threshold; - - // The leaf that contains the passed sample (x_t, 
y_t) - uint32_t data_leaf; - uint32_t other_leaf; - - // std::cout << "x1_tj= " << x1_tj << " x2_tj= " << x2_tj << " threshold= " << threshold << std::endl; - // TODO: what if x1_tj == x2_tj. Must be taken care of by sample_feature() - if (x1_tj < x2_tj) { - threshold = forest.sample_threshold(x1_tj, x2_tj); - data_leaf = left; - other_leaf = right; - } else { - threshold = forest.sample_threshold(x2_tj, x1_tj); - data_leaf = right; - other_leaf = left; - } - // TODO: code a move_sample - NodeClassifier & current_node = node(index); - NodeClassifier & data_node = node(data_leaf); - NodeClassifier & other_node = node(other_leaf); - current_node.set_feature(feature).set_threshold(threshold); - // We pass the sample to the new leaves, and initialize the _label_average with the value - data_node.set_x_t(x_t).set_y_t(y_t); - - // other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); - other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); - - // Update downwards of v' - other_node.update_downwards(current_node.x_t(), current_node.y_t()); - // Update upwards of v': it's a leaf - other_node.update_upwards(); - // node(other_leaf).set_weight_tree(node(other_leaf).weight()); - // Update downwards of v'' - data_node.update_downwards(x_t, y_t); - // Note: the update_up of v'' is done in the go_up method, called in fit() - // std::cout << "Done splitting node." << std::endl; - return data_leaf; -} +//uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t) { +// // std::cout << "Splitting node " << index << std::endl; +// uint32_t left = add_node(index); +// uint32_t right = add_node(index); +// node(index).set_left(left).set_right(right).set_is_leaf(false); +// +// // std::cout << "n_features(): " << n_features() << std::endl; +// ArrayDouble diff(n_features()); +// for(uint32_t j = 0; j < n_features(); ++j) { +// // std::cout << "j: " << j; +// diff[j] = std::abs(node(index).x_t()[j] - x_t[j]); +// } +// // std::cout << std::endl; +// diff /= diff.sum(); +// // diff.print(); +// // std::cout << "diff.sum=" << diff.sum() << std::endl; +// +// // TODO: better feature sampling +// // ulong feature = forest.sample_feature_bis(); +// // ulong feature = forest.sample_feature(); +// +// uint32_t feature = forest.sample_feature(diff); +// +// // std::cout << "feature: " << feature << std::endl; +// +// double x1_tj = x_t[feature]; +// double x2_tj = node(index).x_t()[feature]; +// double threshold; +// +// // The leaf that contains the passed sample (x_t, y_t) +// uint32_t data_leaf; +// uint32_t other_leaf; +// +// // std::cout << "x1_tj= " << x1_tj << " x2_tj= " << x2_tj << " threshold= " << threshold << std::endl; +// // TODO: what if x1_tj == x2_tj. 
Must be taken care of by sample_feature() +// if (x1_tj < x2_tj) { +// threshold = forest.sample_threshold(x1_tj, x2_tj); +// data_leaf = left; +// other_leaf = right; +// } else { +// threshold = forest.sample_threshold(x2_tj, x1_tj); +// data_leaf = right; +// other_leaf = left; +// } +// // TODO: code a move_sample +// NodeClassifier & current_node = node(index); +// NodeClassifier & data_node = node(data_leaf); +// NodeClassifier & other_node = node(other_leaf); +// current_node.set_feature(feature).set_threshold(threshold); +// // We pass the sample to the new leaves, and initialize the _label_average with the value +// data_node.set_x_t(x_t).set_y_t(y_t); +// +// // other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); +// other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); +// +// // Update downwards of v' +// other_node.update_downwards(current_node.x_t(), current_node.y_t()); +// // Update upwards of v': it's a leaf +// other_node.update_upwards(); +// // node(other_leaf).set_weight_tree(node(other_leaf).weight()); +// // Update downwards of v'' +// data_node.update_downwards(x_t, y_t); +// // Note: the update_up of v'' is done in the go_up method, called in fit() +// // std::cout << "Done splitting node." << std::endl; +// return data_leaf; +//} void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { // std::cout << "Extending the range of: " << index << std::endl; @@ -568,9 +568,15 @@ void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { iteration++; } -void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores) { +void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool use_aggregation) { // std::cout << "Going downwards" << std::endl; uint32_t leaf = go_downwards(x_t, 0., true); + + if(!use_aggregation) { + node(leaf).predict(scores); + return; + } + // std::cout << "Done." << std::endl; uint32_t current = leaf; // The child of the current node that does not contain the data @@ -627,11 +633,12 @@ OnlineForestClassifier::OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step, CriterionClassifier criterion, + bool use_aggregation, int32_t n_threads, int seed, bool verbose) : _n_trees(n_trees), _n_classes(n_classes), _n_threads(n_threads), - _criterion(criterion), _step(step), _verbose(verbose), trees() { + _criterion(criterion), _use_aggregation(use_aggregation), _step(step), _verbose(verbose), trees() { // No iteration so far _iteration = 0; @@ -676,8 +683,7 @@ void OnlineForestClassifier::fit(const SArrayDouble2dPtr features, } void OnlineForestClassifier::predict(const SArrayDouble2dPtr features, - SArrayDouble2dPtr predictions, - bool use_aggregation) { + SArrayDouble2dPtr predictions) { predictions->fill(0.); if (_iteration > 0) { uint32_t n_samples = static_cast<uint32_t>(features->n_rows()); @@ -689,7 +695,7 @@ void OnlineForestClassifier::predict(const SArrayDouble2dPtr features, // The prediction is simply the average of the predictions ArrayDouble scores_i = view_row(*predictions, i); for (TreeClassifier &tree : trees) { - tree.predict(view_row(*features, i), scores_tree); + tree.predict(view_row(*features, i), scores_tree, _use_aggregation); // TODO: use a .incr method instead ?? 
scores_i.mult_incr(scores_tree, 1.); } diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index bba25c75c..68c02027d 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -161,7 +161,7 @@ class TreeClassifier { // Nodes of the tree std::vector nodes = std::vector(); // Split the node at given index - uint32_t split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t); + // uint32_t split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t); // Add nodes in the tree uint32_t add_node(uint32_t parent, double time = 0); @@ -178,7 +178,7 @@ class TreeClassifier { TreeClassifier &operator=(const TreeClassifier &&) = delete; void fit(const ArrayDouble &x_t, double y_t); - void predict(const ArrayDouble &x_t, ArrayDouble &scores); + void predict(const ArrayDouble &x_t, ArrayDouble &scores, bool use_aggregation); inline uint32_t n_features() const; inline uint8_t n_classes() const; @@ -217,6 +217,8 @@ class OnlineForestClassifier { int32_t _n_threads; // CriterionClassifier used for splitting (not used for now) CriterionClassifier _criterion; + + bool _use_aggregation; // Step-size used for aggregation double _step; // Number of features. @@ -239,12 +241,12 @@ class OnlineForestClassifier { void create_trees(); public: - OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step, CriterionClassifier criterion, - int32_t n_threads, int seed, bool verbose); + OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step=1.0, CriterionClassifier criterion=CriterionClassifier::log, + bool use_aggregation = true, int32_t n_threads=1, int seed=0, bool verbose=false); virtual ~OnlineForestClassifier(); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); - void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions, bool use_aggregation); + void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions); inline uint32_t sample_feature(); inline uint32_t sample_feature(const ArrayDouble &prob); diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i index 9871d5c0a..4bbded12f 100644 --- a/tick/inference/swig/online_forest_classifier.i +++ b/tick/inference/swig/online_forest_classifier.i @@ -14,11 +14,14 @@ enum class CriterionClassifier { class OnlineForestClassifier { public: - OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step, CriterionClassifier criterion, - int32_t n_threads, int seed, bool verbose); + OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step=1.0, + CriterionClassifier criterion=CriterionClassifier::log, + bool use_aggregation = true, + int32_t n_threads=1, + int seed=0, bool verbose=false); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); - void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions, bool use_aggregation); + void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions); void clear(); diff --git a/video.py b/video.py index b5f05d73a..7f3c872c3 100644 --- a/video.py +++ b/video.py @@ -37,8 +37,11 @@ ax.scatter(X_train[:2, 0], X_train[:2, 1], c=np.array([0, 1]), s=25, cmap=cm) -n_trees = 50 -clf = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1.) 
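A consequence of this interface change, visible in the video.py hunk here: whether predictions are aggregated is now decided when the forest is built, not at predict time. A hypothetical round-trip, assuming tick is built from this branch:

import numpy as np
from tick.inference import OnlineForestClassifier  # assumes this branch is built

X = np.random.randn(500, 2)
y = (X[:, 0] + X[:, 1] > 0).astype('float64')

# Aggregation is fixed once, at construction time
clf = OnlineForestClassifier(n_trees=10, n_classes=2, seed=123, step=1.,
                             use_aggregation=False)
clf.fit(X, y)
print(clf.predict(X[:5]))  # predict() no longer takes a use_aggregation flag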
+n_trees = 10 + +clf = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1., + use_aggregation=False) + def animate(i): clf.fit(X_train[i, :].reshape(1, 2), np.array([y_train[i]])) From 5ddde8fb627bbdee1e46122e13047bd91fb14511 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 22 Nov 2017 14:23:46 +0100 Subject: [PATCH 17/32] weird segfault --- online_forest_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/online_forest_data.py b/online_forest_data.py index 93258e385..7e2f35545 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -13,7 +13,7 @@ path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' filenames = [ - 'dna.p', + # 'dna.p', 'letter.p', 'satimage.p', 'usps.p' From 12383fd5e605c4b6e8e8f193fc22ca0199fbc67e Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 22 Nov 2017 14:45:47 +0100 Subject: [PATCH 18/32] ... --- online_forest.py | 4 +- online_forest_data.py | 38 +++++++++++-------- .../src/online_forest_classifier.cpp | 4 ++ tick/inference/src/online_forest_classifier.h | 4 ++ 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/online_forest.py b/online_forest.py index 89ea69f5f..d32dac7c0 100644 --- a/online_forest.py +++ b/online_forest.py @@ -164,7 +164,7 @@ def plot_decision_classification(classifiers, datasets, names): classifiers = [ OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=True), - OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., use_aggregation=True), + # OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., use_aggregation=True), OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=False), KNeighborsClassifier(n_neighbors=5), ExtraTreesClassifier(n_estimators=n_trees), @@ -173,7 +173,7 @@ def plot_decision_classification(classifiers, datasets, names): names = [ "OF (agg, step=1.)", - "OF(agg, step=100.)", + # "OF(agg, step=100.)", "OF(no agg.)", "KNN (k=5)", "ET", diff --git a/online_forest_data.py b/online_forest_data.py index 7e2f35545..ee6fcf3f4 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -13,7 +13,7 @@ path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' filenames = [ - # 'dna.p', + 'dna.p', 'letter.p', 'satimage.p', 'usps.p' @@ -24,14 +24,15 @@ n_trees = 10 names = [ - # "OF (agg, step=1.)", - # "OF(agg, step=100.)", + "OF (agg, step=1.)", + "OF(agg, step=100.)", "OF(no agg.)", "KNN (k=5)", "ET", "BRF" ] + for filename, n_classes in zip(filenames, n_classess): print(filename) with open(os.path.join(path, filename), 'rb') as f: @@ -41,6 +42,18 @@ y_train = data['y_train'] y_test = data['y_test'] + classifiers = [ + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., + use_aggregation=True, n_classes=n_classes), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., + n_classes=n_classes, use_aggregation=True), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., + use_aggregation=False, n_classes=n_classes), + KNeighborsClassifier(n_neighbors=5), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) + ] + # triche = RandomForestClassifier(n_estimators=n_trees) # triche.fit(X_train, y_train) # probabilities = triche.feature_importances_ / triche.feature_importances_.sum() @@ -53,18 +66,13 @@ # plt.savefig(filename + '.pdf') # online_forest.set_probabilities(probabilities) - classifiers = [ - # OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., - # use_aggregation=True), - # 
OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., - # use_aggregation=True), - OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., - use_aggregation=False), - KNeighborsClassifier(n_neighbors=5), - ExtraTreesClassifier(n_estimators=n_trees), - RandomForestClassifier(n_estimators=n_trees) - ] + + # forest1 = for clf, name in zip(classifiers, names): + if hasattr(clf, 'clear'): + clf.clear() + # print('Fitting', name) clf.fit(X_train, y_train) - # print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) + # print('Done.') + print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 562035b2b..96489b338 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -669,6 +669,10 @@ void OnlineForestClassifier::create_trees() { void OnlineForestClassifier::fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels) { // std::cout << "OnlineForestClassifier::fit" << std::endl; + + _features = features; + _labels = labels; + uint32_t n_samples = static_cast(features->n_rows()); uint32_t n_features = static_cast(features->n_cols()); set_n_features(n_features); diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index 68c02027d..78a7e2423 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -240,6 +240,10 @@ class OnlineForestClassifier { // Create trees void create_trees(); + + SArrayDouble2dPtr _features; + SArrayDoublePtr _labels; + public: OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step=1.0, CriterionClassifier criterion=CriterionClassifier::log, bool use_aggregation = true, int32_t n_threads=1, int seed=0, bool verbose=false); From 30d5daf0b41d8122c046c4898490e61b057b9c27 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 22 Nov 2017 14:46:12 +0100 Subject: [PATCH 19/32] gotcha --- video.py | 1 + 1 file changed, 1 insertion(+) diff --git a/video.py b/video.py index 7f3c872c3..dd8a79057 100644 --- a/video.py +++ b/video.py @@ -30,6 +30,7 @@ fig = plt.figure(figsize=(5, 5)) ax = plt.subplot(1, 1, 1) + ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) From d9f11aead3207aaa70878c83bc1aa1a79c961b23 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 22 Nov 2017 16:39:12 +0100 Subject: [PATCH 20/32] ... 
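Editor's summary of this commit: the forest's `_probabilities` member becomes `_feature_importances`, exposed through `set_feature_importances`, and feature sampling draws an index from the normalized importances. A NumPy stand-in for the underlying `rand.discrete` call:

import numpy as np

def sample_feature(feature_importances, rng):
    # Draw a feature index with probability proportional to its importance,
    # mirroring rand.discrete(_feature_importances) in the C++ code.
    p = np.asarray(feature_importances, dtype='float64')
    return rng.choice(p.size, p=p / p.sum())

rng = np.random.RandomState(123)
print(sample_feature([0.1, 0.7, 0.2], rng))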
--- online_forest_data.py | 18 ++++++++++++++--- tick/inference/online_forest_classifier.py | 4 ++-- .../src/online_forest_classifier.cpp | 10 ++++++++-- tick/inference/src/online_forest_classifier.h | 20 +++---------------- .../inference/swig/online_forest_classifier.i | 2 +- 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/online_forest_data.py b/online_forest_data.py index ee6fcf3f4..7400bab98 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -10,6 +10,17 @@ import matplotlib.pyplot as plt + +# TODO: options for types of sampling of the features +# TODO: online construction of the feature_importances +# TODO: python script that tries all combinations +# TODO: n_classes is mandatory +# TODO: test that n_features is consistent across runs +# TODO: what if we feed several times the same dataset +# TODO: show that the classifier is insensitive to the time of arrival of the points +# TODO: try on datasets for which KNN and a linear method performs poorly +# TODO: V-fold instead of train and test ? + path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' filenames = [ @@ -54,9 +65,9 @@ RandomForestClassifier(n_estimators=n_trees) ] - # triche = RandomForestClassifier(n_estimators=n_trees) - # triche.fit(X_train, y_train) - # probabilities = triche.feature_importances_ / triche.feature_importances_.sum() + triche = RandomForestClassifier(n_estimators=n_trees) + triche.fit(X_train, y_train) + feature_importances = triche.feature_importances_ / triche.feature_importances_.sum() # # plt.stem(probabilities) # plt.title('Features importance for ' + filename, fontsize=18) @@ -72,6 +83,7 @@ for clf, name in zip(classifiers, names): if hasattr(clf, 'clear'): clf.clear() + clf.set_feature_importances(feature_importances) # print('Fitting', name) clf.fit(X_train, y_train) # print('Done.') diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index 523ad86aa..3de808b52 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -192,5 +192,5 @@ def criterion(self, value): else: raise ValueError("``criterion`` must be either 'unif' or 'mse'.") - def set_probabilities(self, probabilities): - self._forest.set_probabilities(probabilities) + def set_feature_importances(self, feature_importances): + self._forest.set_feature_importances(feature_importances) diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 96489b338..1f4d0f867 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -720,7 +720,7 @@ inline uint32_t OnlineForestClassifier::sample_feature() { } inline uint32_t OnlineForestClassifier::sample_feature_bis() { - return rand.discrete(_probabilities); + return rand.discrete(_feature_importances); } inline double OnlineForestClassifier::sample_exponential(double intensity) { @@ -728,7 +728,13 @@ inline double OnlineForestClassifier::sample_exponential(double intensity) { } inline uint32_t OnlineForestClassifier::sample_feature(const ArrayDouble & prob) { - return rand.discrete(prob); + ArrayDouble my_prob = prob; + for(uint32_t j = 0; j < n_features(); ++j) { + // my_prob[j] *= _feature_importances[j]; + my_prob[j] = _feature_importances[j]; + } + my_prob /= my_prob.sum(); + return rand.discrete(my_prob); } inline double OnlineForestClassifier::sample_threshold(double left, double right) { diff --git 
a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h
index 78a7e2423..de14b6fac 100644
--- a/tick/inference/src/online_forest_classifier.h
+++ b/tick/inference/src/online_forest_classifier.h
@@ -9,20 +9,6 @@
 #include "../../random/src/rand.h"
-// TODO: be very careful with binary features: if the range is 0 on every coordinate, do nothing
-// TODO: code a classifier
-
-// TODO: choose the feature proportionally to the ratio of the feature ranges, but beware of the case of discrete
-// features
-// TODO: an option to create an empty cell, i.e. to forget the data in a cell once it has been split
-
-// TODO: choice of the feature using the labels
-
-// TODO: for classification, do not use the raw frequencies but regularized ones, with a Dirichlet prior: p_c = (n_c + 0.5) / (\sum_c n_c + C / 2). Make it an option
-
-// TODO: check that not using reserve in the forest works as well...
-
-
 enum class CriterionClassifier {
 log = 0,
 };
@@ -236,7 +222,7 @@ class OnlineForestClassifier {
 // Random number generator for feature and threshold sampling
 Rand rand;
-  ArrayDouble _probabilities;
+  ArrayDouble _feature_importances;
 // Create trees
 void create_trees();
@@ -346,8 +332,8 @@ class OnlineForestClassifier {
 return *this;
 }
-  inline void set_probabilities(const ArrayDouble &probabilities) {
-    _probabilities = probabilities;
+  inline void set_feature_importances(const ArrayDouble &feature_importances) {
+    _feature_importances = feature_importances;
 }
 // inline bool verbose() const;
diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i
index 4bbded12f..cff7f1aff 100644
--- a/tick/inference/swig/online_forest_classifier.i
+++ b/tick/inference/swig/online_forest_classifier.i
@@ -48,5 +48,5 @@ class OnlineForestClassifier {
 // bool verbose() const;
 // OnlineForestRegressor &set_verbose(bool verbose);
-  void set_probabilities(const ArrayDouble & probabilities);
+  void set_feature_importances(const ArrayDouble &feature_importances);
 };

From e9a8f8031d7db31c2de3d7dea14162a1f917bd38 Mon Sep 17 00:00:00 2001
From: Stephane Gaiffas
Date: Fri, 24 Nov 2017 09:45:56 +0100
Subject: [PATCH 21/32] ...

---
 online_forest.py | 31 ++-
 online_forest_agathe.py | 226 ++++++++++++++++++
 tick/inference/online_forest_classifier.py | 60 ++---
 .../src/online_forest_classifier.cpp | 34 +--
 tick/inference/src/online_forest_classifier.h | 5 +-
 video.py | 4 +-
 6 files changed, 289 insertions(+), 71 deletions(-)
 create mode 100644 online_forest_agathe.py

diff --git a/online_forest.py b/online_forest.py
index d32dac7c0..2bf15b4e0 100644
--- a/online_forest.py
+++ b/online_forest.py
@@ -5,6 +5,7 @@ from matplotlib.colors import ListedColormap

 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neighbors import KNeighborsClassifier
 from sklearn.datasets import make_moons, make_classification, make_circles
 from sklearn.metrics import roc_auc_score
 import matplotlib.pyplot as plt
@@ -136,7 +137,7 @@ def plot_decision_classification(classifiers, datasets, names):

 path = '/Users/stephane.gaiffas/Downloads/'

-n_trees = 10
+n_trees = 20

 X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
                            n_informative=2, random_state=1,
                            n_clusters_per_class=1)
 rng = np.random.RandomState(2)
 X += 2 * rng.uniform(size=X.shape)

-# clf = OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.)
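On the `set_feature_importances` API renamed above: the intended workflow, judging from the "triche" (cheat) snippet earlier in this series, is to fit a batch forest first and reuse its normalized importances to bias the online forest's feature sampling. A hedged end-to-end sketch, assuming tick is built from this branch:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from tick.inference import OnlineForestClassifier  # assumes this branch is built

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
y = y.astype('float64')

# Fit a batch forest, then transfer its normalized feature importances
batch = RandomForestClassifier(n_estimators=10).fit(X, y)
importances = batch.feature_importances_ / batch.feature_importances_.sum()

clf = OnlineForestClassifier(n_trees=10, seed=123, step=1.)
clf.set_feature_importances(importances)
clf.fit(X, y)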
-# clf.fit(X, y) +clf = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1.) + +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + +clf.fit(X_train, y_train) + +clf.predict(X_test) + +exit(0) + # clf.print() linearly_separable = (X, y) @@ -158,14 +168,13 @@ def plot_decision_classification(classifiers, datasets, names): linearly_separable ] - -from sklearn.neighbors import KNeighborsClassifier - - classifiers = [ - OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=True), - # OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., use_aggregation=True), - OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=False), + OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., + use_aggregation=True), + OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=100., + use_aggregation=True), + OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., + use_aggregation=False), KNeighborsClassifier(n_neighbors=5), ExtraTreesClassifier(n_estimators=n_trees), RandomForestClassifier(n_estimators=n_trees) @@ -173,7 +182,7 @@ def plot_decision_classification(classifiers, datasets, names): names = [ "OF (agg, step=1.)", - # "OF(agg, step=100.)", + "OF(agg, step=100.)", "OF(no agg.)", "KNN (k=5)", "ET", diff --git a/online_forest_agathe.py b/online_forest_agathe.py new file mode 100644 index 000000000..61759a106 --- /dev/null +++ b/online_forest_agathe.py @@ -0,0 +1,226 @@ +from tick.simulation import SimuLogReg, weights_sparse_gauss +from sklearn.model_selection import train_test_split +import numpy as np +from tick.inference import OnlineForestClassifier +from matplotlib.colors import ListedColormap + +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.datasets import make_moons, make_classification, make_circles +from sklearn.metrics import roc_auc_score +import matplotlib.pyplot as plt + +from time import time + + +n_samples = 1000 +n_features = 2 +seed = 123 + +np.set_printoptions(precision=2) + +w0 = weights_sparse_gauss(n_features, nnz=2) +X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() +y = (y + 1) / 2 + + +def plot_decisions_regression(clfs, datasets, names): + i = 1 + h = .02 + fig = plt.figure(figsize=(4 * (len(clfs) + 1), 4 * len(datasets))) + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + X, y = ds + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + # just plot the dataset first + cm = plt.cm.RdBu + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=25, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=25, + alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + # iterate over classifiers + for name, clf in zip(names, clfs): + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + clf.fit(X_train, y_train) + Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T) + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + # Plot also the training points + 
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm, s=15) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, + s=15, alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + i += 1 + + plt.tight_layout() + # plt.show() + + +def plot_decision_classification(classifiers, datasets, names): + n_classifiers = len(classifiers) + n_datasets = len(datasets) + h = .02 + fig = plt.figure(figsize=(2 * (n_classifiers + 1), 2 * n_datasets)) + i = 1 + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + # preprocess dataset, split into training and test part + X, y = ds + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + # just plot the dataset first + cm = plt.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = plt.subplot(n_datasets, n_classifiers + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=10, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=10, + alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + # iterate over classifiers + for name, clf in zip(names, classifiers): + ax = plt.subplot(n_datasets, n_classifiers + 1, i) + if hasattr(clf, 'clear'): + clf.clear() + clf.fit(X_train, y_train) + Z = clf.predict_proba(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] + + score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), + size=15, horizontalalignment='right') + i += 1 + + plt.tight_layout() + # plt.show() + + +path = '/Users/stephane.gaiffas/Downloads/' + +n_trees = 30 + +X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) + + +# clf = OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.) 
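The shuffled copies built just below (perm1 through perm4) probe one of the TODOs listed earlier: the classifier should be insensitive to the arrival order of the points. A compact way to run that check, sketched in Python; `clf_factory` is a hypothetical callable returning a fresh, unfitted classifier:

import numpy as np

def order_sensitivity(clf_factory, X, y, n_perms=4, seed=0):
    # Fit a fresh online classifier on several shufflings of the same
    # training set and report the spread of test accuracies.
    rng = np.random.RandomState(seed)
    split = int(0.6 * X.shape[0])
    scores = []
    for _ in range(n_perms):
        perm = rng.permutation(split)
        clf = clf_factory()
        clf.fit(X[:split][perm], y[:split][perm])
        scores.append(clf.score(X[split:], y[split:]))
    return np.mean(scores), np.std(scores)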
+# clf.fit(X, y) +# clf.print() + +linearly_separable = (X, y) + +X, y = make_moons(n_samples=n_samples, noise=0.3, random_state=0) + + +perm1 = np.arange(n_samples) +np.random.shuffle(perm1) +perm2 = np.arange(n_samples) +np.random.shuffle(perm2) +perm3 = np.arange(n_samples) +np.random.shuffle(perm3) + +perm4 = np.argsort(y) + +# datasets = [ +# make_moons(n_samples=n_samples, noise=0.3, random_state=0), +# make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), +# linearly_separable +# ] + + +X2 = np.empty((50, 2)) + +X2[:, 0] = -1 + 0.5 * np.random.random(50) +X2[:, 1] = 0.7 + 0.5 * np.random.random(50) + + +y2 = np.ones(50) +X = np.vstack((X2, X)) +y = np.concatenate((y2, y)) + +# X, y + +datasets = [ + (X, y), + # (X[perm2, :], y[perm2]), + # (X[perm3, :], y[perm3]), + # (X[perm4, :], y[perm4]) +] + +# datasets = [ +# (X[perm1, :], y[perm1]), +# (X[perm2, :], y[perm2]), +# (X[perm3, :], y[perm3]), +# (X[perm4, :], y[perm4]) +# ] + +print(X[perm1]) + +from sklearn.neighbors import KNeighborsClassifier + + +classifiers = [ + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=True, n_classes=2), + # OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., use_aggregation=True), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=False, n_classes=2), + KNeighborsClassifier(n_neighbors=5), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) +] + +names = [ + "OF (agg, step=1.)", + # "OF(agg, step=100.)", + "OF(no agg.)", + "KNN (k=5)", + "ET", + "BRF" +] + +plot_decision_classification(classifiers, datasets, names) + +# plt.savefig('decisions.pdf') + +plt.show() diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index 3de808b52..a65e61ce7 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -4,10 +4,9 @@ from tick.base import Base from tick.base import actual_kwargs - -from .build.inference import OnlineForestClassifier as _OnlineForestClassifier from tick.preprocessing.utils import safe_array +from .build.inference import OnlineForestClassifier as _OnlineForestClassifier from .build.inference import CriterionClassifier_log as log @@ -18,10 +17,17 @@ class OnlineForestClassifier(ABC, Base): Parameters ---------- + n_classes : `int` + Number of classes, we need this information since in a online setting, + we don't know the number of classes in advance. + n_trees : `int`, default=10 Number of trees to grow in the forest. Cannot be changed after the first call to ``fit``. + step : `float`, default=1. + Step-size for the aggregation weights. Default is 1 for classification. + criterion : {'log'}, default='log' The criterion used to selected a split. Supported criteria are: * 'unif': splits are sampled uniformly in the range of the features, and @@ -30,12 +36,8 @@ class OnlineForestClassifier(ABC, Base): is selected This cannot be changed after the first call to ``fit`` - max_depth : `int`, default=-1 - The maximum depth of a tree. If <= 0, nodes are splitted with no limit - on the depth of the tree - - min_samples_split : `int`, default=50 - A node waits to contain `min_samples_split` before splitting. + use_aggregation : `bool`, default=True + If True n_threads : `int`, default=1 The number of threads used to grow trees in parallel during training. 
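On the new `step` docstring above: aggregation weights are exponential in the negative cumulative loss, so the step size controls how fast the mass concentrates on the best-performing subtree. A toy illustration with made-up loss values:

import numpy as np

# Cumulative losses of three competing subtrees (made-up numbers)
losses = np.array([0.9, 0.5, 0.7])
for step in (1.0, 100.0):
    w = np.exp(-step * losses)
    print(step, w / w.sum())
# step=1.   -> the weights stay spread out
# step=100. -> almost all the mass sits on the lowest-loss subtree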
@@ -48,13 +50,6 @@ class OnlineForestClassifier(ABC, Base): verbose : `bool`, default=True If True, then verboses things during training - warm_start : `bool`, default=True - If True, then successive calls to ``fit`` will continue to grow existing - trees. Otherwise, we start from empty trees - - n_splits : `int`, default=10 - Number of potential splits to consider for a feature. BLABLA ??? - Attributes ---------- n_samples : `int` @@ -70,9 +65,6 @@ class OnlineForestClassifier(ABC, Base): '_forest': {'writable': False}, '_criterion': {'writable': False, 'cpp_setter': 'set_criterion'}, 'n_trees': {'writable': True, 'cpp_setter': 'set_n_trees'}, - 'max_depth': {'writable': True, 'cpp_setter': 'set_max_depth'}, - 'min_samples_split': {'writable': True, - 'cpp_setter': 'set_min_samples_split'}, 'n_threads': {'writable': True, 'cpp_setter': 'set_n_threads'}, 'seed': {'writable': True, 'cpp_setter': 'set_seed'}, 'verbose': {'writable': True, 'cpp_setter': 'set_verbose'}, @@ -85,11 +77,9 @@ class OnlineForestClassifier(ABC, Base): # TODO: n_classes must be mandatory @actual_kwargs - def __init__(self, n_trees: int = 10, n_classes: int=2, step: float = 1., + def __init__(self, n_classes: int, n_trees: int = 10, step: float = 1., criterion: str = 'log', use_aggregation: bool = True, - max_depth: int = -1, min_samples_split: int = 50, - n_threads: int = 1, seed: int = -1, verbose: bool = True, - warm_start: bool = True, n_splits: int = 10): + n_threads: int = 1, seed: int = -1, verbose: bool = True): Base.__init__(self) if not hasattr(self, "_actual_kwargs"): self._actual_kwargs = {} @@ -98,25 +88,14 @@ def __init__(self, n_trees: int = 10, n_classes: int=2, step: float = 1., self.n_classes = n_classes self.step = step self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split self.n_threads = n_threads self.seed = seed self.verbose = verbose - self.warm_start = warm_start - self.n_splits = n_splits self.use_aggregation = use_aggregation - self._forest = _OnlineForestClassifier(n_trees, - n_classes, - step, - self._criterion, - self.use_aggregation, - #max_depth, - # min_samples_split, - n_threads, - seed, - verbose) - #warm_start, n_splits) + self._forest = _OnlineForestClassifier(n_classes, n_trees, step, + self._criterion, + self.use_aggregation, n_threads, + seed, verbose) def set_data(self, X, y): X = safe_array(X) @@ -152,7 +131,7 @@ def predict_proba(self, X): import numpy as np scores = np.empty((X.shape[0], self.n_classes)) if not self._fitted: - raise ValueError("You must call ``fit`` before") + raise RuntimeError("You must call ``fit`` before") else: X = safe_array(X) self._forest.predict(X, scores) @@ -160,7 +139,7 @@ def predict_proba(self, X): def predict(self, X): if not self._fitted: - raise ValueError("You must call ``fit`` before") + raise RuntimeError("You must call ``fit`` before") else: scores = self.predict_proba(X) return scores.argmax(axis=1) @@ -170,13 +149,11 @@ def clear(self): def score(self, X, y): from sklearn.metrics import accuracy_score - y_pred = self.predict(X) return accuracy_score(y, y_pred) def print(self): self._forest._print() - # TODO: property for splits @property @@ -188,7 +165,6 @@ def criterion(self): def criterion(self, value): if value == 'log': self._set('_criterion', log) - # self._forest.set_criterion(unif) else: raise ValueError("``criterion`` must be either 'unif' or 'mse'.") diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 
1f4d0f867..d4747b481 100644
--- a/tick/inference/src/online_forest_classifier.cpp
+++ b/tick/inference/src/online_forest_classifier.cpp
@@ -569,7 +569,7 @@ void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) {
 }

 void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool use_aggregation) {
-  // std::cout << "Going downwards" << std::endl;
+  std::cout << "Going downwards" << std::endl;
   uint32_t leaf = go_downwards(x_t, 0., true);

   if(!use_aggregation) {
@@ -577,12 +577,12 @@ void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool u
     return;
   }

-  // std::cout << "Done." << std::endl;
+  std::cout << "Done." << std::endl;
   uint32_t current = leaf;
   // The child of the current node that does not contain the data
   ArrayDouble pred_new(n_classes());
   while (true) {
-    // std::cout << "node: " << current << std::endl;
+    std::cout << "node: " << current << std::endl;
     NodeClassifier &current_node = node(current);
     if (current_node.is_leaf()) {
       current_node.predict(scores);
     } else {
@@ -603,9 +603,7 @@ void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool u
 }

 uint32_t TreeClassifier::add_node(uint32_t parent, double time) {
-  // std::cout << "Adding node with parent " << parent << std::endl;
   nodes.emplace_back(*this, parent, time);
-  // std::cout << "Done." << std::endl;
   return _n_nodes++;
 }
@@ -687,8 +685,11 @@ void OnlineForestClassifier::fit(const SArrayDouble2dPtr features,
 }

 void OnlineForestClassifier::predict(const SArrayDouble2dPtr features,
-                                     SArrayDouble2dPtr predictions) {
-  predictions->fill(0.);
+                                     SArrayDouble2dPtr scores) {
+  scores->fill(0.);
+  std::cout << "features->n_rows(): " << features->n_rows() << ", features->n_cols(): " << features->n_cols() << std::endl;
+  std::cout << "scores->n_rows(): " << scores->n_rows() << ", scores->n_cols(): " << scores->n_cols() << std::endl;
+  std::cout << "n_classes: " << _n_classes << std::endl;
   if (_iteration > 0) {
     uint32_t n_samples = static_cast<uint32_t>(features->n_rows());
     ArrayDouble scores_tree(_n_classes);
@@ -697,8 +698,9 @@ void OnlineForestClassifier::predict(const SArrayDouble2dPtr features,
     scores_forest.fill(0.);
     for (uint32_t i = 0; i < n_samples; ++i) {
       // The prediction is simply the average of the predictions
-      ArrayDouble scores_i = view_row(*predictions, i);
+      ArrayDouble scores_i = view_row(*scores, i);
       for (TreeClassifier &tree : trees) {
+        std::cout << "predict for tree " << std::endl;
         tree.predict(view_row(*features, i), scores_tree, _use_aggregation);
         // TODO: use a .incr method instead ??
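Aside: once the temporary std::cout tracing is stripped, the forest-level predict above is just an average of per-tree scores. In Python terms, where `tree.predict_scores` is a hypothetical stand-in for the per-tree TreeClassifier::predict call:

import numpy as np

def forest_predict_proba(trees, X, n_classes):
    # Average the per-tree class scores, one input row at a time
    scores = np.zeros((len(X), n_classes))
    for i, x in enumerate(X):
        for tree in trees:
            scores[i] += tree.predict_scores(x)
        scores[i] /= len(trees)
    return scores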
scores_i.mult_incr(scores_tree, 1.); @@ -728,13 +730,15 @@ inline double OnlineForestClassifier::sample_exponential(double intensity) { } inline uint32_t OnlineForestClassifier::sample_feature(const ArrayDouble & prob) { - ArrayDouble my_prob = prob; - for(uint32_t j = 0; j < n_features(); ++j) { - // my_prob[j] *= _feature_importances[j]; - my_prob[j] = _feature_importances[j]; - } - my_prob /= my_prob.sum(); - return rand.discrete(my_prob); +// ArrayDouble my_prob = prob; +// for(uint32_t j = 0; j < n_features(); ++j) { +// // my_prob[j] *= _feature_importances[j]; +// my_prob[j] = _feature_importances[j]; +// } +// my_prob /= my_prob.sum(); + // return rand.discrete(my_prob); + return rand.discrete(prob); + } inline double OnlineForestClassifier::sample_threshold(double left, double right) { diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index de14b6fac..e27ae4d8c 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -9,6 +9,9 @@ #include "../../random/src/rand.h" +// TODO: in the forest, tests for the input labels and the size of the features (for fit and predict_proba and predict) + + enum class CriterionClassifier { log = 0, }; @@ -236,7 +239,7 @@ class OnlineForestClassifier { virtual ~OnlineForestClassifier(); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); - void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions); + void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr scores); inline uint32_t sample_feature(); inline uint32_t sample_feature(const ArrayDouble &prob); diff --git a/video.py b/video.py index dd8a79057..5de85343b 100644 --- a/video.py +++ b/video.py @@ -38,9 +38,9 @@ ax.scatter(X_train[:2, 0], X_train[:2, 1], c=np.array([0, 1]), s=25, cmap=cm) -n_trees = 10 +n_trees = 20 -clf = OnlineForestClassifier(n_trees=n_trees, n_classes=2, seed=123, step=1., +clf = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., use_aggregation=False) From 57fea1be07017f728772f8eb0efc95c0a6ceaa5e Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Sat, 25 Nov 2017 14:15:03 +0100 Subject: [PATCH 22/32] ... --- online_forest.py | 20 +++++----- .../src/online_forest_classifier.cpp | 40 +++++++++++-------- tick/inference/src/online_forest_classifier.h | 29 +++++++------- .../inference/swig/online_forest_classifier.i | 10 ++--- 4 files changed, 54 insertions(+), 45 deletions(-) diff --git a/online_forest.py b/online_forest.py index 2bf15b4e0..51c858432 100644 --- a/online_forest.py +++ b/online_forest.py @@ -137,6 +137,7 @@ def plot_decision_classification(classifiers, datasets, names): path = '/Users/stephane.gaiffas/Downloads/' +n_classes = 2 n_trees = 20 X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, @@ -146,16 +147,17 @@ def plot_decision_classification(classifiers, datasets, names): X += 2 * rng.uniform(size=X.shape) -clf = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1.) 
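The new header TODO above asks for input checks in fit and predict (a first pass arrives later in this series, in patch 26, "Safety checks for fit"). A plausible shape for such checks, written as a hypothetical Python helper rather than actual tick API:

import numpy as np

def check_fit_inputs(X, y, n_classes, n_features=None):
    # Sketch of the checks requested by the TODO: labels must lie in
    # {0, ..., n_classes - 1} and the number of features must stay
    # constant across successive calls to fit.
    X = np.ascontiguousarray(X, dtype='float64')
    y = np.ascontiguousarray(y, dtype='float64')
    if X.shape[0] != y.shape[0]:
        raise ValueError("X and y have a different number of rows")
    if n_features is not None and X.shape[1] != n_features:
        raise ValueError("n_features has changed between calls to fit")
    if y.min() < 0 or y.max() >= n_classes or not np.array_equal(y, y.astype(int)):
        raise ValueError("labels must be integers in {0, ..., n_classes - 1}")
    return X, y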
+# clf = OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123,
+#                              step=1., use_aggregation=True)

-X_train, X_test, y_train, y_test = \
-    train_test_split(X, y, test_size=.4, random_state=42)
-
-clf.fit(X_train, y_train)
-
-clf.predict(X_test)
-
-exit(0)
+# X_train, X_test, y_train, y_test = \
+#     train_test_split(X, y, test_size=.4, random_state=42)
+#
+# clf.fit(X_train, y_train)
+#
+# clf.predict(X_test)
+#
+# exit(0)

 # clf.print()

diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp
index d4747b481..ed949bcd0 100644
--- a/tick/inference/src/online_forest_classifier.cpp
+++ b/tick/inference/src/online_forest_classifier.cpp
@@ -126,9 +126,15 @@ double NodeClassifier::score(uint8_t c) const {
 }

 inline void NodeClassifier::predict(ArrayDouble& scores) const {
+// std::cout << "NodeClassifier::predict" << std::endl;
+// std::cout << "n_classes: " << n_classes() << std::endl;
+// std::cout << "scores.size(): " << scores.size() << std::endl;
+// std::cout << "c=";
   for (uint8_t c=0; c < n_classes(); ++c) {
+// std::cout << " " << c;
     scores[c] = score(c);
   }
+// std::cout << std::endl << "... Done with NodeClassifier::predict" << std::endl;
 }

 double NodeClassifier::loss(const double y_t) {
@@ -501,7 +507,7 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p
   // Start at the root. Index of the root is always 0
   // If predict == true, this call to find_leaf is for
   // prediction only, so that no leaf update and splits can be done
-  // std::cout << "Going downwards" << std::endl;
+// std::cout << "Going downwards" << std::endl;
   uint32_t index_current_node = 0;
   bool is_leaf = false;
   while (!is_leaf) {
@@ -524,7 +530,7 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p
       }
     }
   }
-  // std::cout << "...Done going downwards." << std::endl;
+// std::cout << "...Done going downwards." << std::endl;
   return index_current_node;
 }

@@ -569,24 +575,26 @@ void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) {
 }

 void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool use_aggregation) {
-  std::cout << "Going downwards" << std::endl;
+// std::cout << "TreeClassifier::predict" << std::endl;
   uint32_t leaf = go_downwards(x_t, 0., true);

   if(!use_aggregation) {
+// std::cout << "Not using aggregation so using only the leaf's prediction" << std::endl;
     node(leaf).predict(scores);
     return;
   }

-  std::cout << "Done." << std::endl;
+// std::cout << "Done." << std::endl;
   uint32_t current = leaf;
   // The child of the current node that does not contain the data
   ArrayDouble pred_new(n_classes());
   while (true) {
-    std::cout << "node: " << current << std::endl;
+// std::cout << "node: " << current << std::endl;
     NodeClassifier &current_node = node(current);
     if (current_node.is_leaf()) {
+// std::cout << "predict leaf" << std::endl;
       current_node.predict(scores);
     } else {
+// std::cout << "predict node" << std::endl;
       double w = std::exp(current_node.weight() - current_node.weight_tree());
       // Get the predictions of the current node
       current_node.predict(pred_new);
@@ -596,6 +604,7 @@ void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool u
     }
     // Root must be updated as well
     if (current == 0) {
+// std::cout << "Done with predict."
<< std::endl; break; } current = current_node.parent(); @@ -627,19 +636,18 @@ inline CriterionClassifier TreeClassifier::criterion() const { * OnlineForestClassifier methods *********************************************************************************/ -OnlineForestClassifier::OnlineForestClassifier(uint32_t n_trees, - uint8_t n_classes, +OnlineForestClassifier::OnlineForestClassifier(uint8_t n_classes, + uint32_t n_trees, double step, CriterionClassifier criterion, bool use_aggregation, int32_t n_threads, int seed, bool verbose) - : _n_trees(n_trees), _n_classes(n_classes), _n_threads(n_threads), - _criterion(criterion), _use_aggregation(use_aggregation), _step(step), _verbose(verbose), trees() { + : _n_classes(n_classes), _n_trees(n_trees), _step(step), _criterion(criterion), + _use_aggregation(use_aggregation), _n_threads(n_threads), _verbose(verbose), rand(seed) { // No iteration so far _iteration = 0; - // std::cout << "sizeof(float): " << sizeof(float) << std::endl; // std::cout << "sizeof(double): " << sizeof(double) << std::endl; // std::cout << "sizeof(uint8_t): " << sizeof(uint8_t) << std::endl; @@ -649,8 +657,6 @@ OnlineForestClassifier::OnlineForestClassifier(uint32_t n_trees, // std::cout << "sizeof(ulong): " << sizeof(ulong) << std::endl; create_trees(); - // Seed the random number generators - set_seed(seed); } OnlineForestClassifier::~OnlineForestClassifier() {} @@ -687,9 +693,9 @@ void OnlineForestClassifier::fit(const SArrayDouble2dPtr features, void OnlineForestClassifier::predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr scores) { scores->fill(0.); - std::cout << "features->n_rows(): " << features->n_rows() << ", features->n_cols(): " << features->n_cols() << std::endl; - std::cout << "scores->n_rows(): " << scores->n_rows() << ", scores->n_cols(): " << scores->n_cols() << std::endl; - std::cout << "n_classes: " << _n_classes << std::endl; +// std::cout << "features->n_rows(): " << features->n_rows() << ", features->n_cols(): " << features->n_cols() << std::endl; +// std::cout << "scores->n_rows(): " << scores->n_rows() << ", scores->n_cols(): " << scores->n_cols() << std::endl; +// std::cout << "n_classes: " << _n_classes << std::endl; if (_iteration > 0) { uint32_t n_samples = static_cast(features->n_rows()); ArrayDouble scores_tree(_n_classes); @@ -700,7 +706,7 @@ void OnlineForestClassifier::predict(const SArrayDouble2dPtr features, // The prediction is simply the average of the predictions ArrayDouble scores_i = view_row(*scores, i); for (TreeClassifier &tree : trees) { - std::cout << "predict for tree " << std::endl; +// std::cout << "predict for tree " << std::endl; tree.predict(view_row(*features, i), scores_tree, _use_aggregation); // TODO: use a .incr method instead ?? 
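Aside: for reference, the descent that produces `leaf` in TreeClassifier::predict above reduces, in prediction mode, to a plain decision-tree walk. A schematic Python version, with node fields as simplified stand-ins for NodeClassifier:

def go_downwards_predict(nodes, x):
    # Walk from the root (index 0) down to the leaf whose cell contains x;
    # internal nodes store 'feature', 'threshold', 'left' and 'right'.
    index = 0
    while not nodes[index]['is_leaf']:
        node = nodes[index]
        if x[node['feature']] <= node['threshold']:
            index = node['left']
        else:
            index = node['right']
    return index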
scores_i.mult_incr(scores_tree, 1.); diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index e27ae4d8c..ab804968f 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -118,9 +118,9 @@ class NodeClassifier { inline double time() const; inline NodeClassifier &set_time(double time); inline double features_min(const uint32_t j) const; - inline NodeClassifier & set_features_min(const ArrayDouble &features_min); + inline NodeClassifier &set_features_min(const ArrayDouble &features_min); inline double features_max(const uint32_t j) const; - inline NodeClassifier & set_features_max(const ArrayDouble &features_max); + inline NodeClassifier &set_features_max(const ArrayDouble &features_max); inline uint32_t n_samples() const; inline NodeClassifier &set_n_samples(uint32_t n_samples); inline double weight() const; @@ -200,24 +200,24 @@ class TreeClassifier { class OnlineForestClassifier { private: + // Number of classes in the classification problem + uint8_t _n_classes; // Number of Trees in the forest uint32_t _n_trees; - // Number of threads to use for parallel growing of trees - int32_t _n_threads; + // Step-size used for aggregation + double _step; // CriterionClassifier used for splitting (not used for now) CriterionClassifier _criterion; - + // bool _use_aggregation; - // Step-size used for aggregation - double _step; - // Number of features. - uint32_t _n_features; - // Number of classes in the classification problem - uint8_t _n_classes; + // Number of threads to use for parallel growing of trees + int32_t _n_threads; // Seed for random number generation int _seed; // Verbose things or not bool _verbose; + // Number of features. + uint32_t _n_features; // Iteration counter uint32_t _iteration; // The list of trees in the forest @@ -229,13 +229,14 @@ class OnlineForestClassifier { // Create trees void create_trees(); - SArrayDouble2dPtr _features; SArrayDoublePtr _labels; public: - OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step=1.0, CriterionClassifier criterion=CriterionClassifier::log, - bool use_aggregation = true, int32_t n_threads=1, int seed=0, bool verbose=false); + OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, double step = 1.0, + CriterionClassifier criterion = CriterionClassifier::log, + bool use_aggregation = true, int32_t n_threads = 1, + int seed = 0, bool verbose = false); virtual ~OnlineForestClassifier(); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i index cff7f1aff..d252f90ce 100644 --- a/tick/inference/swig/online_forest_classifier.i +++ b/tick/inference/swig/online_forest_classifier.i @@ -14,11 +14,11 @@ enum class CriterionClassifier { class OnlineForestClassifier { public: - OnlineForestClassifier(uint32_t n_trees, uint8_t n_classes, double step=1.0, - CriterionClassifier criterion=CriterionClassifier::log, - bool use_aggregation = true, - int32_t n_threads=1, - int seed=0, bool verbose=false); + + OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, double step = 1.0, + CriterionClassifier criterion = CriterionClassifier::log, + bool use_aggregation = true, int32_t n_threads = 1, + int seed = 0, bool verbose = false); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions); From 
099c855a828f02f216c0e976c1685946e7918e9f Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Sat, 25 Nov 2017 21:54:02 +0100 Subject: [PATCH 23/32] ... --- online_forest_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/online_forest_data.py b/online_forest_data.py index 7400bab98..72ee8f38b 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -20,6 +20,7 @@ # TODO: show that the classifier is insensitive to the time of arrival of the points # TODO: try on datasets for which KNN and a linear method performs poorly # TODO: V-fold instead of train and test ? +# TODO: Set features importance with default to none path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' From cb88997f219520e575b8a3189e47ec557f60d890 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Mon, 27 Nov 2017 18:09:43 +0100 Subject: [PATCH 24/32] Added online_forest_datasets.py for many datasets tests --- online_forest_data.py | 6 +- online_forest_datasets.py | 250 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 online_forest_datasets.py diff --git a/online_forest_data.py b/online_forest_data.py index 72ee8f38b..a219110c9 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -10,7 +10,6 @@ import matplotlib.pyplot as plt - # TODO: options for types of sampling of the features # TODO: online construction of the feature_importances # TODO: python script that tries all combinations @@ -21,6 +20,11 @@ # TODO: try on datasets for which KNN and a linear method performs poorly # TODO: V-fold instead of train and test ? # TODO: Set features importance with default to none +# TODO: implement a subsample strategy : only one tree is updated with the given sample +# TODO: tree aggregation +# TODO: different "types" of trees: no aggregation, aggregation and different temperatures +# TODO: parse as int the n_classes + path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' diff --git a/online_forest_datasets.py b/online_forest_datasets.py new file mode 100644 index 000000000..3ebcf0968 --- /dev/null +++ b/online_forest_datasets.py @@ -0,0 +1,250 @@ + +import os +import pandas as pd +import numpy as np +from time import time +import zipfile + +from sklearn.preprocessing import MinMaxScaler +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import roc_auc_score +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split + +from tick.inference import OnlineForestClassifier + + +import matplotlib.pyplot as plt + + +path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' + + +def read_abalone(path): + archive = zipfile.ZipFile(os.path.join(path, 'abalone.csv.zip'), 'r') + with archive.open('abalone.csv') as f: + data = pd.read_csv(f, header=None) + continuous = list(range(1, 8)) + discrete = [0] + y = data.pop(8) + y -= 1 + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = y.as_matrix() + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'abalone' + + +def read_adult(path): + archive = zipfile.ZipFile(os.path.join(path, 'adult.csv.zip'), 'r') + with archive.open('adult.csv') as f: + data = pd.read_csv(f, header=None) + y = data.pop(13) + discrete = [1, 3, 4, 5, 6, 7, 8, 12] + continuous = list(set(range(13)) - set(discrete)) 
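All of the read_* helpers in this file follow the same recipe: min-max scale the continuous columns, one-hot encode the discrete ones, and stack the results. Factored out, the shared pattern looks like the sketch below; it uses to_numpy(), the modern replacement for the as_matrix() calls seen here, which newer pandas versions have removed:

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def encode_mixed(data, continuous, discrete):
    # Scale continuous columns to [0, 1], one-hot encode discrete ones,
    # then stack into a single dense feature matrix.
    X_cont = MinMaxScaler().fit_transform(data[continuous])
    X_disc = pd.get_dummies(data[discrete], prefix_sep='#').to_numpy()
    return np.hstack((X_cont, X_disc))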
+ X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'adult' + + +def read_bank(path): + archive = zipfile.ZipFile(os.path.join(path, 'bank.csv.zip'), 'r') + with archive.open('bank.csv') as f: + data = pd.read_csv(f) + y = data.pop('y') + discrete = ['job', 'marital', 'education', 'default', 'housing', + 'loan', 'contact', 'day', 'month', 'campaign', 'poutcome'] + continuous = ['age', 'balance', 'duration', 'pdays', 'previous'] + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'bank' + + +def read_car(path): + archive = zipfile.ZipFile(os.path.join(path, 'car.csv.zip'), 'r') + with archive.open('car.csv') as f: + data = pd.read_csv(f, header=None) + y = data.pop(6) + y = np.argmax(pd.get_dummies(y).as_matrix(), axis=1) + X = pd.get_dummies(data, prefix_sep='#').as_matrix() + return X, y, 'car' + + +def read_cardio(path): + archive = zipfile.ZipFile(os.path.join(path, 'cardiotocography.csv.zip'), + 'r') + with archive.open('cardiotocography.csv', ) as f: + data = pd.read_csv(f, sep=';', decimal=',') + + data.drop(['FileName', 'Date', 'SegFile', + 'A', 'B', 'C', 'D', 'E', 'AD', 'DE', + 'LD', 'FS', 'SUSP'], axis=1, inplace=True) + # A 10-class label + y_class = data.pop('CLASS').as_matrix() + y_class -= 1 + # A 3-class label + y_nsp = data.pop('NSP').as_matrix() + y_nsp -= 1 + continuous = [ + 'b', 'e', 'LBE', 'LB', 'AC', 'FM', 'UC', + 'ASTV', 'MSTV', 'ALTV', 'MLTV', + 'DL', 'DS', 'DP', + 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', + 'Mean', 'Median', 'Variance' + ] + + discrete = [ + 'Tendency' + ] + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + X = np.hstack((X_continuous, X_discrete)) + return X, y_nsp, 'cardio' + + +def read_churn(path): + archive = zipfile.ZipFile(os.path.join(path, 'churn.csv.zip'), 'r') + with archive.open('churn.csv') as f: + data = pd.read_csv(f) + y = data.pop('Churn?') + discrete = [ + 'State', 'Area Code', "Int'l Plan", 'VMail Plan', ] + + continuous = [ + 'Account Length', 'Day Mins', 'Day Calls', 'Eve Calls', 'Day Charge', + 'Eve Mins', 'Eve Charge', 'Night Mins', 'Night Calls', + 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', + 'CustServ Calls', 'VMail Message' + ] + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'churn' + + +def read_default_cb(path): + archive = zipfile.ZipFile(os.path.join(path, 'default_cb.csv.zip'), 'r') + with archive.open('default_cb.csv') as f: + data = pd.read_csv(f) + continuous = [ + 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'LIMIT_BAL', + 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', + 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6' + ] + discrete = [ + 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', + 'SEX', 'EDUCATION', 'MARRIAGE' + ] + _ = data.pop('ID') + y = data.pop('default payment next month') + X_continuous = 
MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'default_cb' + +readers = [ + # read_abalone, + # read_adult + # read_bank + # read_car, + read_cardio, + read_churn, + read_default_cb +] + +n_trees = 10 + +names = [ + "OF (agg, step=1.)", + "OF(agg, step=100.)", + "OF(no agg.)", + "KNN (k=5)", + "LR", + "ET", + "BRF" +] + + +data_description = pd.DataFrame( + columns=['name', '#samples', '#features', '#classes'] +) + +performances = pd.DataFrame( + columns=['dataset'] + names +) + +timings = pd.DataFrame( + columns=['dataset'] + names +) + + +for reader in readers: + # Read the data + X, y, dataset_name = reader(path) + X_train, X_test, y_train, y_test \ + = train_test_split(X, y, test_size=.3, random_state=42) + + n_samples, n_features = X.shape + n_classes = int(y.max() + 1) + + data_description = data_description.append( + pd.DataFrame([[dataset_name, n_samples, n_features, n_classes]], + columns=data_description.columns) + ) + + classifiers = [ + OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, step=1., + use_aggregation=True), + OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, + step=100., + use_aggregation=True), + OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, step=1., + use_aggregation=False), + KNeighborsClassifier(n_neighbors=5), + LogisticRegression(class_weight='balanced'), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) + ] + + performance = [dataset_name] + timing = [dataset_name] + + for clf, name in zip(classifiers, names): + if hasattr(clf, 'clear'): + clf.clear() + t1 = time() + clf.fit(X_train, y_train) + t2 = time() + score = clf.score(X_test, y_test) + t = t2 - t1 + timing.append(t) + performance.append(score) + print('Accuracy of', name, ': ', + '%.2f' % score, + "in %.2f (s)" % t) + + performances = performances.append( + pd.DataFrame([performance], columns=performances.columns) + ) + timings = timings.append( + pd.DataFrame([timing], columns=timings.columns) + ) + +print(data_description) + +print(performances) + +print(timings) From 233b4568dc05fae0ee13bed04536f0a24bbbbd12 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Mon, 27 Nov 2017 18:31:02 +0100 Subject: [PATCH 25/32] ... 
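Editor's note on the readers added in this commit: each one repeats the open-zip-then-read-csv boilerplate. A small helper could absorb that shared step (a sketch; keyword arguments are simply forwarded to pandas):

import os
import zipfile
import pandas as pd

def read_zipped_csv(path, zip_name, csv_name, **kwargs):
    # Open a CSV stored inside a zip archive and return it as a DataFrame,
    # the common skeleton of every read_* helper in this file.
    with zipfile.ZipFile(os.path.join(path, zip_name), 'r') as archive:
        with archive.open(csv_name) as f:
            return pd.read_csv(f, **kwargs)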
--- online_forest_datasets.py | 153 +++++++++++++++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 3 deletions(-) diff --git a/online_forest_datasets.py b/online_forest_datasets.py index 3ebcf0968..a7dcec577 100644 --- a/online_forest_datasets.py +++ b/online_forest_datasets.py @@ -20,6 +20,9 @@ path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' +# TODO: do for dna.p + + def read_abalone(path): archive = zipfile.ZipFile(os.path.join(path, 'abalone.csv.zip'), 'r') @@ -155,14 +158,158 @@ def read_default_cb(path): X = np.hstack((X_continuous, X_discrete)) return X, y, 'default_cb' + +def read_ijcnn1(path): + archive = zipfile.ZipFile(os.path.join(path, 'ijcnn1.csv.zip'), 'r') + with archive.open('ijcnn1.csv', ) as f: + data = pd.read_csv(f, header=None) + y = data.pop(12).as_matrix() + y = (y + 1) / 2 + X = data.as_matrix() + return X, y, 'ijcnn1' + + +def read_isolet(path): + archive = zipfile.ZipFile(os.path.join(path, 'isolet.zip'), 'r') + with archive.open('isolet/isolet1234.csv') as f: + data1 = pd.read_csv(f, header=None) + with archive.open('isolet/isolet5.csv') as f: + data2 = pd.read_csv(f, header=None) + data = pd.concat((data1, data2)) + y = data.pop(617).as_matrix() + y -= 1 + X = data.as_matrix() + return X, y, 'isolet' + + +def read_letter(path): + archive = zipfile.ZipFile(os.path.join(path, 'letter.csv.zip'), 'r') + with archive.open('letter.csv') as f: + data = pd.read_csv(f) + data.drop(['Unnamed: 0'], axis=1, inplace=True) + y = data.pop('y').as_matrix() + X = data.as_matrix() + return X, y, 'letter' + + +def read_nursery(path): + archive = zipfile.ZipFile(os.path.join(path, 'nursery.csv.zip'), 'r') + with archive.open('nursery.csv') as f: + data = pd.read_csv(f, header=None) + y1 = data.pop(7) + y1 = pd.get_dummies(y1) + y1 = y1.as_matrix().argmax(axis=1) + y2 = data.pop(8) + y2 = pd.get_dummies(y2) + y2 = y2.as_matrix().argmax(axis=1) + X = pd.get_dummies(data, prefix_sep='#').as_matrix() + return X, y2, 'nursery' + + +def read_ozone(path): + archive = zipfile.ZipFile(os.path.join(path, 'ozone.zip'), 'r') + with archive.open('ozone/ozone.eighthr.csv') as f: + data = pd.read_csv(f, header=None, na_values='?') + data.dropna(inplace=True) + data.drop([0], axis=1, inplace=True) + y = data.pop(73).as_matrix() + X = data.as_matrix() + return X, y, 'ozone' + + +def read_satimage(path): + archive = zipfile.ZipFile(os.path.join(path, 'satimage.csv.zip'), 'r') + with archive.open('satimage.csv') as f: + data = pd.read_csv(f) + data.drop(['Unnamed: 0'], axis=1, inplace=True) + y = data.pop('y').as_matrix() + X = data.as_matrix() + return X, y, 'satimage' + + +def read_sensorless(path): + archive = zipfile.ZipFile(os.path.join(path, 'sensorless.csv.zip'), 'r') + with archive.open('sensorless.csv') as f: + data = pd.read_csv(f, sep=' ', header=None) + y = data.pop(48).as_matrix() + y -= 1 + X = MinMaxScaler().fit_transform(data) + return X, y, 'sensorless' + + +def read_shuttle(path): + archive = zipfile.ZipFile(os.path.join(path, 'shuttle.csv.zip'), 'r') + with archive.open('shuttle.csv') as f: + data = pd.read_csv(f, header=None) + + y = data.pop(10).as_matrix() + y -= 1 + X = MinMaxScaler().fit_transform(data) + return X, y, 'shuttle' + + +def read_spambase(path): + archive = zipfile.ZipFile(os.path.join(path, 'spambase.csv.zip'), 'r') + with archive.open('spambase.csv') as f: + data = pd.read_csv(f, header=None) + y = data.pop(57).as_matrix() + X = MinMaxScaler().fit_transform(data) + return X, y, 'spambase' + + +def read_usps(path): + archive = 
zipfile.ZipFile(os.path.join(path, 'usps.csv.zip'), 'r') + with archive.open('usps.csv') as f: + data = pd.read_csv(f) + data.drop(['Unnamed: 0'], axis=1, inplace=True) + y = data.pop('y').as_matrix() + X = data.as_matrix() + return X, y, 'usps' + + +def read_wilt(path): + archive = zipfile.ZipFile(os.path.join(path, 'wilt.csv.zip'), 'r') + with archive.open('wilt.csv') as f: + data = pd.read_csv(f) + y = data.pop('class') + y = pd.get_dummies(y).as_matrix().argmax(axis=1) + X = MinMaxScaler().fit_transform(data) + return X, y, 'wilt' + + +def read_wine(path): + archive = zipfile.ZipFile(os.path.join(path, 'wine.zip'), 'r') + with archive.open('wine/red.csv') as f: + data_red = pd.read_csv(f, sep=';') + with archive.open('wine/white.csv') as f: + data_white = pd.read_csv(f, sep=";") + data = data_red + y = data.pop('quality').as_matrix() + y -= 3 + X = MinMaxScaler().fit_transform(data) + return X, y, 'wine' + + readers = [ # read_abalone, # read_adult # read_bank # read_car, - read_cardio, - read_churn, - read_default_cb + # read_cardio, + # read_churn, + # read_default_cb, + # read_ijcnn1 + # read_isolet, + # read_letter, + # read_nursery, + # read_ozone, + # read_satimage, + # read_sensorless, + # read_shuttle, + # read_spambase, + # read_usps, + # read_wilt, + read_wine ] n_trees = 10 From 6929534fd6a315c98de3a9af1419e2cdcd672f0b Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 29 Nov 2017 10:44:28 +0100 Subject: [PATCH 26/32] Safety checks for fit --- online_forest.py | 17 +++++++- online_forest_data.py | 2 +- online_forest_datasets.py | 39 +++++++++---------- .../src/online_forest_classifier.cpp | 15 +++++-- tick/inference/src/online_forest_classifier.h | 35 ++++++++++++++++- 5 files changed, 80 insertions(+), 28 deletions(-) diff --git a/online_forest.py b/online_forest.py index 51c858432..879233917 100644 --- a/online_forest.py +++ b/online_forest.py @@ -140,12 +140,27 @@ def plot_decision_classification(classifiers, datasets, names): n_classes = 2 n_trees = 20 -X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, +X, y = make_classification(n_samples=n_samples, n_features=10, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) +of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., + use_aggregation=True) + + +of.fit(X, y) + +X, y = make_classification(n_samples=n_samples, n_features=10, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + +of.fit(X, y + 2) + + +exit(0) + # clf = OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, # step=1., use_aggregation=True) diff --git a/online_forest_data.py b/online_forest_data.py index a219110c9..adcc8b29d 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -10,10 +10,10 @@ import matplotlib.pyplot as plt + # TODO: options for types of sampling of the features # TODO: online construction of the feature_importances # TODO: python script that tries all combinations -# TODO: n_classes is mandatory # TODO: test that n_features is consistent across runs # TODO: what if we feed several times the same dataset # TODO: show that the classifier is insensitive to the time of arrival of the points diff --git a/online_forest_datasets.py b/online_forest_datasets.py index a7dcec577..af21ee27a 100644 --- a/online_forest_datasets.py +++ b/online_forest_datasets.py @@ -15,9 +15,6 @@ from tick.inference import OnlineForestClassifier -import matplotlib.pyplot as 
plt
-
-
 path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/'
 
 # TODO: do for dna.p
 
@@ -291,24 +288,24 @@ def read_wine(path):
 
 readers = [
-    # read_abalone,
-    # read_adult
-    # read_bank
-    # read_car,
-    # read_cardio,
-    # read_churn,
-    # read_default_cb,
-    # read_ijcnn1
-    # read_isolet,
-    # read_letter,
-    # read_nursery,
-    # read_ozone,
-    # read_satimage,
-    # read_sensorless,
-    # read_shuttle,
-    # read_spambase,
-    # read_usps,
-    # read_wilt,
+    read_abalone,
+    read_adult,
+    read_bank,
+    read_car,
+    read_cardio,
+    read_churn,
+    read_default_cb,
+    read_ijcnn1,
+    read_isolet,
+    read_letter,
+    read_nursery,
+    read_ozone,
+    read_satimage,
+    read_sensorless,
+    read_shuttle,
+    read_spambase,
+    read_usps,
+    read_wilt,
     read_wine
 ]
 
diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp
index ed949bcd0..92d6547e6 100644
--- a/tick/inference/src/online_forest_classifier.cpp
+++ b/tick/inference/src/online_forest_classifier.cpp
@@ -665,6 +665,7 @@ void OnlineForestClassifier::create_trees() {
   // Just in case...
   trees.clear();
   trees.reserve(_n_trees);
+  // Better tree allocation
   for (uint32_t i = 0; i < _n_trees; ++i) {
     trees.emplace_back(*this);
   }
@@ -672,17 +673,23 @@ void OnlineForestClassifier::create_trees() {
 
 void OnlineForestClassifier::fit(const SArrayDouble2dPtr features,
                                  const SArrayDoublePtr labels) {
-  // std::cout << "OnlineForestClassifier::fit" << std::endl;
+  uint32_t n_samples = static_cast<uint32_t>(features->n_rows());
+  uint32_t n_features = static_cast<uint32_t>(features->n_cols());
+  if (_iteration == 0) {
+    _n_features = n_features;
+  } else {
+    check_n_features(n_features, false);
+  }
   _features = features;
   _labels = labels;
-  uint32_t n_samples = static_cast<uint32_t>(features->n_rows());
-  uint32_t n_features = static_cast<uint32_t>(features->n_cols());
-  set_n_features(n_features);
+  // set_n_features(n_features);
   for (uint32_t i = 0; i < n_samples; ++i) {
     for (TreeClassifier &tree : trees) {
       // Fit the tree online using the new data point
+      double label = (*labels)[i];
+      check_label(label);
       tree.fit(view_row(*features, i), (*labels)[i]);
     }
     _iteration++;
diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h
index ab804968f..5da2bc3a3 100644
--- a/tick/inference/src/online_forest_classifier.h
+++ b/tick/inference/src/online_forest_classifier.h
@@ -5,6 +5,7 @@
 // License: BSD 3 clause
 
 #include "base.h"
+#include <cmath>
 #include <iomanip>
 #include "../../random/src/rand.h"
 
@@ -292,12 +293,44 @@ class OnlineForestClassifier {
     return *this;
   }
 
+  inline void check_n_features(uint32_t n_features, bool predict) const {
+    if (n_features != _n_features) {
+      if (predict) {
+        TICK_ERROR("Wrong number of features: trained with " + std::to_string(_n_features)
+                       + " features, but received " + std::to_string(n_features) + " features for prediction");
+      } else {
+        TICK_ERROR("Wrong number of features: started to train with " + std::to_string(_n_features)
+                       + " features, but received " + std::to_string(n_features) + " afterwards");
+      }
+    }
+  }
+
+  inline void check_label(double label) const {
+    double iptr;
+    double fptr = std::modf(label, &iptr);
+    if (fptr != 0) {
+      TICK_ERROR("Wrong label type: received " + std::to_string(label) + " for a classification problem");
+    }
+    if ((label < 0) || (label >= _n_classes)) {
+      TICK_ERROR("Wrong label value: received " + std::to_string(label) + " while training for classification with "
+                     + std::to_string(_n_classes) + " classes.");
+    }
+  }
+
+/*
+  inline OnlineForestClassifier
&set_n_features(uint32_t n_features) { if (_iteration == 0) { - _n_features = n_features; + + } else { + if (n_features != _n_features) { + TICK_ERROR("Wrong number of features: started to train with " + std::to_string(_n_features) + + " features, but received " + std::to_string(n_features) + " afterwards"); + } } return *this; } +*/ inline uint32_t n_trees() const { return _n_trees; From 81b3d8b9b068cf16187b92301a77a01ed7a09021 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Wed, 29 Nov 2017 11:07:31 +0100 Subject: [PATCH 27/32] ... --- online_forest.py | 8 +++++++- online_forest_data.py | 8 +++++--- tick/inference/src/online_forest_classifier.cpp | 2 ++ tick/inference/src/online_forest_classifier.h | 5 ++++- tick/inference/src/online_forest_regressor.h | 14 -------------- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/online_forest.py b/online_forest.py index 879233917..255b84554 100644 --- a/online_forest.py +++ b/online_forest.py @@ -156,8 +156,14 @@ def plot_decision_classification(classifiers, datasets, names): n_informative=2, random_state=1, n_clusters_per_class=1) -of.fit(X, y + 2) +of.fit(X, y) + +X, y = make_classification(n_samples=n_samples, n_features=3, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + +of.predict_proba(X) exit(0) diff --git a/online_forest_data.py b/online_forest_data.py index adcc8b29d..94628f5df 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -14,17 +14,19 @@ # TODO: options for types of sampling of the features # TODO: online construction of the feature_importances # TODO: python script that tries all combinations -# TODO: test that n_features is consistent across runs + # TODO: what if we feed several times the same dataset # TODO: show that the classifier is insensitive to the time of arrival of the points -# TODO: try on datasets for which KNN and a linear method performs poorly # TODO: V-fold instead of train and test ? 
# TODO: Set features importance with default to none # TODO: implement a subsample strategy : only one tree is updated with the given sample # TODO: tree aggregation # TODO: different "types" of trees: no aggregation, aggregation and different temperatures -# TODO: parse as int the n_classes +# TODO: unittest for attributes +# TODO: unittest for wrong n_features in fit and predict and wrong labels in training + +# TODO: tryout multiple passes path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 92d6547e6..6453d695f 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -703,6 +703,8 @@ void OnlineForestClassifier::predict(const SArrayDouble2dPtr features, // std::cout << "features->n_rows(): " << features->n_rows() << ", features->n_cols(): " << features->n_cols() << std::endl; // std::cout << "scores->n_rows(): " << scores->n_rows() << ", scores->n_cols(): " << scores->n_cols() << std::endl; // std::cout << "n_classes: " << _n_classes << std::endl; + uint32_t n_features = static_cast(features->n_cols()); + check_n_features(n_features, true); if (_iteration > 0) { uint32_t n_samples = static_cast(features->n_rows()); ArrayDouble scores_tree(_n_classes); diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index 5da2bc3a3..5e80b5fbe 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -9,8 +9,11 @@ #include #include "../../random/src/rand.h" +// TODO: change the Dirichlet parameter +// TODO: reserve nodes in advance +// TODO: set_feature_importances with a nullptr by default +// TODO: subsample parameter, default 0.5 -// TODO: in the forest, tests for the input labels and the size of the features (for fit and predict_proba and predict) enum class CriterionClassifier { diff --git a/tick/inference/src/online_forest_regressor.h b/tick/inference/src/online_forest_regressor.h index 18a9c8326..224f02867 100644 --- a/tick/inference/src/online_forest_regressor.h +++ b/tick/inference/src/online_forest_regressor.h @@ -9,20 +9,6 @@ #include "../../random/src/rand.h" -// TODO: faire tres attention au features binaires si le range est 0 sur toutes les coordonnées, ne rien faire -// TODO: code a classifier - -// TODO: choisir la feature proportionnellement au ratio des range de features, mais attention au cas de features -// discretes -// TODO: une option pour créer une cellule vide, enfin oublier les donnes dans la cellule quand elle a ete splitee - -// TODO: choix de la feature les labels - -// TODO: pour la classification, on utilise pas les frequences, on utilise des frequences regularisees, prior Dirichlet p_c = (n_c + 0.5) + (\sum n_c + C / 2). En fait une option - -// TODO: check that not using reserve in the forest works as well... - - enum class CriterionRegressor { unif = 0, mse From 5707c6b9a3d7f3d219d75d9ad8fac0653b5df1e8 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Thu, 30 Nov 2017 23:49:49 +0100 Subject: [PATCH 28/32] ... 
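Among other changes, this patch makes the Dirichlet prior used in NodeClassifier::score a parameter instead of the hard-coded Jeffreys prior: a node now scores class c as (count_c + dirichlet) / (n_samples + dirichlet * n_classes). A quick numpy check of that formula (a sketch, not the C++ implementation):

    import numpy as np


    def node_scores(counts, dirichlet):
        # Posterior predictive under a symmetric Dirichlet(dirichlet) prior
        counts = np.asarray(counts, dtype=float)
        return (counts + dirichlet) / (counts.sum() + dirichlet * counts.size)

    # dirichlet=0.5 recovers the previously hard-coded Jeffreys-prior scores
    # (2 * n_c + 1) / (2 * n + n_classes):
    print(node_scores([3, 1], dirichlet=0.5))  # [0.7 0.3]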
---
 LICENSE.txt                                   |  12 --
 online_forest.py                              | 146 ++++++++++------
 online_forest_data.py                         |   2 +
 requirements.txt                              |   6 -
 tick/inference/online_forest_classifier.py    |  30 +++-
 .../src/online_forest_classifier.cpp          | 162 ++++++++----------
 tick/inference/src/online_forest_classifier.h |  47 +++--
 .../inference/swig/online_forest_classifier.i |  10 +-
 .../tests/online_forest_classifier_test.py    |  49 ++++++
 9 files changed, 282 insertions(+), 182 deletions(-)
 delete mode 100644 LICENSE.txt
 delete mode 100644 requirements.txt
 create mode 100644 tick/inference/tests/online_forest_classifier_test.py

diff --git a/LICENSE.txt b/LICENSE.txt
deleted file mode 100644
index e88bd6720..000000000
--- a/LICENSE.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Copyright (c) 2015-2017, the tick developers
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
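The n_passes and subsampling parameters exercised by the scripts below are implemented in OnlineForestClassifier::fit further down in this patch. Roughly, in Python terms (a sketch only; it assumes each tree exposes a per-sample online fit, which is what TreeClassifier::fit provides in the C++ code):

    import numpy as np


    def fit_forest(trees, X, y, n_passes=1, subsampling=1., seed=123):
        rng = np.random.RandomState(seed)
        # Each pass streams the samples once, in order
        for _ in range(n_passes):
            for x_t, y_t in zip(X, y):
                for tree in trees:
                    # Each tree sees a given sample with probability
                    # `subsampling`, which decorrelates the trees
                    if rng.uniform() <= subsampling:
                        tree.fit(x_t, y_t)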
diff --git a/online_forest.py b/online_forest.py index 255b84554..5b9cf25dc 100644 --- a/online_forest.py +++ b/online_forest.py @@ -13,15 +13,11 @@ from time import time -n_samples = 500 -n_features = 2 -seed = 123 - np.set_printoptions(precision=2) -w0 = weights_sparse_gauss(n_features, nnz=2) -X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() -y = (y + 1) / 2 +# w0 = weights_sparse_gauss(n_features, nnz=2) +# X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() +# y = (y + 1) / 2 def plot_decisions_regression(clfs, datasets, names): @@ -137,80 +133,130 @@ def plot_decision_classification(classifiers, datasets, names): path = '/Users/stephane.gaiffas/Downloads/' +n_samples = 20000 +n_features = 100 n_classes = 2 -n_trees = 20 -X, y = make_classification(n_samples=n_samples, n_features=10, n_redundant=0, - n_informative=2, random_state=1, - n_clusters_per_class=1) -rng = np.random.RandomState(2) -X += 2 * rng.uniform(size=X.shape) +# +# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, +# seed=123, use_aggregation=True) +# +# of.fit(X, y) -of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., - use_aggregation=True) +# print("n_nodes:", of.n_nodes()) +# print("n_leaves:", of.n_leaves()) +# print(of.predict_proba(X)) +# print("step: ", of._forest.step()) +# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=1., +# seed=123, use_aggregation=True, n_passes=1) +# +# of.fit(X, y) -of.fit(X, y) +# print("n_nodes:", of.n_nodes()) +# print("n_leaves:", of.n_leaves()) +# print(of.predict_proba(X)) +# print("step: ", of._forest.step()) -X, y = make_classification(n_samples=n_samples, n_features=10, n_redundant=0, - n_informative=2, random_state=1, - n_clusters_per_class=1) -of.fit(X, y) +# exit(0) -X, y = make_classification(n_samples=n_samples, n_features=3, n_redundant=0, - n_informative=2, random_state=1, +X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) -of.predict_proba(X) - -exit(0) +clf = OnlineForestClassifier(n_classes=n_classes, n_trees=50, seed=123, + step=1., use_aggregation=True) +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) -# clf = OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, -# step=1., use_aggregation=True) +clf.fit(X_train, y_train) -# X_train, X_test, y_train, y_test = \ -# train_test_split(X, y, test_size=.4, random_state=42) -# -# clf.fit(X_train, y_train) -# # clf.predict(X_test) -# -# exit(0) + +exit(0) # clf.print() +X, y = make_classification(n_samples=n_samples, n_features=n_features, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) - datasets = [ make_moons(n_samples=n_samples, noise=0.3, random_state=0), make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), linearly_separable ] -classifiers = [ - OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., - use_aggregation=True), - OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=100., - use_aggregation=True), - OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., - use_aggregation=False), +n_trees = 10 + +of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, + seed=123, 
use_aggregation=True) +seed = 123 + + +params = [ + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.1}, + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.5}, + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 2}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.2, 'n_passes': 5}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 10}, + # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 0.1, 'n_passes': 10}, + # + # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, +] + + +def toto(kkk): + return "OF(T: " \ + + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ + + ', P: ' + str(kkk['n_passes']) + ', di: ' + str(kkk['dirichlet']) \ + + ")" + # return "OF(A: " + str(kkk['use_aggregation']) + ", T: " \ + # + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ + # + ', P: ' + str(kkk['n_passes']) + ")" + + +names = list(toto(kw) for kw in params) + ["KNN", "ET", "BRF"] + +classifiers = list( + OnlineForestClassifier(n_classes=n_classes, seed=123, step=1., **kw) + for kw in params +) + +classifiers += [ KNeighborsClassifier(n_neighbors=5), ExtraTreesClassifier(n_estimators=n_trees), RandomForestClassifier(n_estimators=n_trees) ] -names = [ - "OF (agg, step=1.)", - "OF(agg, step=100.)", - "OF(no agg.)", - "KNN (k=5)", - "ET", - "BRF" -] +# names = [ +# "OF(agg, n_passes=1)", +# "OF(agg, n_passes=5)", +# "OF(agg, n_passes=10)", +# "OF(no agg., n_passes=1)", +# "KNN (k=5)", +# "ET", +# "BRF" +# ] plot_decision_classification(classifiers, datasets, names) diff --git a/online_forest_data.py b/online_forest_data.py index 94628f5df..c51a499ba 100644 --- a/online_forest_data.py +++ b/online_forest_data.py @@ -27,6 +27,8 @@ # TODO: unittest for wrong n_features in fit and predict and wrong labels in training # TODO: tryout multiple passes +# TODO: really make seed work with inline forest + path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c94cc223f..000000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -numpy -numpydoc -scipy -matplotlib -scikit-learn -pandas diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index a65e61ce7..22c7d2c7e 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -2,6 +2,8 @@ from abc import ABC +import numpy as np + from tick.base import Base from tick.base import actual_kwargs from 
tick.preprocessing.utils import safe_array @@ -77,14 +79,18 @@ class OnlineForestClassifier(ABC, Base): # TODO: n_classes must be mandatory @actual_kwargs - def __init__(self, n_classes: int, n_trees: int = 10, step: float = 1., + def __init__(self, n_classes: int, n_trees: int = 10, n_passes: int = 1, + step: float = 1., criterion: str = 'log', use_aggregation: bool = True, - n_threads: int = 1, seed: int = -1, verbose: bool = True): + subsampling: float=1., dirichlet: float=None, + n_threads: int = 1, + seed: int = -1, verbose: bool = True): Base.__init__(self) if not hasattr(self, "_actual_kwargs"): self._actual_kwargs = {} self._fitted = False self.n_trees = n_trees + self.n_passes = n_passes self.n_classes = n_classes self.step = step self.criterion = criterion @@ -92,9 +98,15 @@ def __init__(self, n_classes: int, n_trees: int = 10, step: float = 1., self.seed = seed self.verbose = verbose self.use_aggregation = use_aggregation - self._forest = _OnlineForestClassifier(n_classes, n_trees, step, + self.subsampling = subsampling + if dirichlet is None: + dirichlet = 1 / n_classes + self.dirichlet = dirichlet + self._forest = _OnlineForestClassifier(n_classes, n_trees, n_passes, + step, self._criterion, - self.use_aggregation, n_threads, + self.use_aggregation, + subsampling, dirichlet, n_threads, seed, verbose) def set_data(self, X, y): @@ -156,6 +168,16 @@ def print(self): self._forest._print() # TODO: property for splits + def n_leaves(self): + n_leaves_per_tree = np.empty(self.n_trees, dtype=np.uint32) + self._forest.n_leaves(n_leaves_per_tree) + return n_leaves_per_tree + + def n_nodes(self): + n_nodes_per_tree = np.empty(self.n_trees, dtype=np.uint32) + self._forest.n_nodes(n_nodes_per_tree) + return n_nodes_per_tree + @property def criterion(self): if self._criterion == log: diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index 6453d695f..f57a62dcd 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -72,7 +72,9 @@ NodeClassifier &NodeClassifier::operator=(const NodeClassifier &node) { void NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) { _n_samples++; - _weight -= step() * loss(y_t); + if(use_aggregation()) { + _weight -= step() * loss(y_t); + } update_predict(y_t); } @@ -121,20 +123,14 @@ void NodeClassifier::update_range(const ArrayDouble &x_t) { } double NodeClassifier::score(uint8_t c) const { - // Using Dirichet(1/2, ... 1/2) prior - return static_cast(2 * _counts[c] + 1) / (2 * _n_samples + n_classes()); + // Using the Dirichet prior + return (_counts[c] + dirichlet()) / (_n_samples + dirichlet() * n_classes()); } inline void NodeClassifier::predict(ArrayDouble& scores) const { -// std::cout << "NodeClassifier::predict" << std::endl; -// std::cout << "n_classes: " << n_classes() << std::endl; -// std::cout << "scores.size(): " << scores.size() << std::endl; -// std::cout << "c="; for (uint8_t c=0; c < n_classes(); ++c) { -// std::cout << " " << c; scores[c] = score(c); } -// std::cout << std::endl << "... 
Done with NodeClassifier::predict" << std::endl; } double NodeClassifier::loss(const double y_t) { @@ -159,6 +155,10 @@ inline double NodeClassifier::step() const { return _tree.step(); } +inline double NodeClassifier::dirichlet() const { + return _tree.dirichlet(); +} + inline uint32_t NodeClassifier::parent() const { return _parent; } @@ -249,6 +249,10 @@ inline NodeClassifier &NodeClassifier::set_n_samples(uint32_t n_samples) { return *this; } +inline bool NodeClassifier::use_aggregation() const { + return _tree.use_aggregation(); +} + inline double NodeClassifier::weight() const { return _weight; } @@ -319,74 +323,8 @@ TreeClassifier::TreeClassifier(const TreeClassifier &&tree) TreeClassifier::TreeClassifier(OnlineForestClassifier &forest) : forest(forest) { // TODO: pre-allocate the vector to make things faster ? add_node(0); -} -//uint32_t TreeClassifier::split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t) { -// // std::cout << "Splitting node " << index << std::endl; -// uint32_t left = add_node(index); -// uint32_t right = add_node(index); -// node(index).set_left(left).set_right(right).set_is_leaf(false); -// -// // std::cout << "n_features(): " << n_features() << std::endl; -// ArrayDouble diff(n_features()); -// for(uint32_t j = 0; j < n_features(); ++j) { -// // std::cout << "j: " << j; -// diff[j] = std::abs(node(index).x_t()[j] - x_t[j]); -// } -// // std::cout << std::endl; -// diff /= diff.sum(); -// // diff.print(); -// // std::cout << "diff.sum=" << diff.sum() << std::endl; -// -// // TODO: better feature sampling -// // ulong feature = forest.sample_feature_bis(); -// // ulong feature = forest.sample_feature(); -// -// uint32_t feature = forest.sample_feature(diff); -// -// // std::cout << "feature: " << feature << std::endl; -// -// double x1_tj = x_t[feature]; -// double x2_tj = node(index).x_t()[feature]; -// double threshold; -// -// // The leaf that contains the passed sample (x_t, y_t) -// uint32_t data_leaf; -// uint32_t other_leaf; -// -// // std::cout << "x1_tj= " << x1_tj << " x2_tj= " << x2_tj << " threshold= " << threshold << std::endl; -// // TODO: what if x1_tj == x2_tj. Must be taken care of by sample_feature() -// if (x1_tj < x2_tj) { -// threshold = forest.sample_threshold(x1_tj, x2_tj); -// data_leaf = left; -// other_leaf = right; -// } else { -// threshold = forest.sample_threshold(x2_tj, x1_tj); -// data_leaf = right; -// other_leaf = left; -// } -// // TODO: code a move_sample -// NodeClassifier & current_node = node(index); -// NodeClassifier & data_node = node(data_leaf); -// NodeClassifier & other_node = node(other_leaf); -// current_node.set_feature(feature).set_threshold(threshold); -// // We pass the sample to the new leaves, and initialize the _label_average with the value -// data_node.set_x_t(x_t).set_y_t(y_t); -// -// // other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); -// other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); -// -// // Update downwards of v' -// other_node.update_downwards(current_node.x_t(), current_node.y_t()); -// // Update upwards of v': it's a leaf -// other_node.update_upwards(); -// // node(other_leaf).set_weight_tree(node(other_leaf).weight()); -// // Update downwards of v'' -// data_node.update_downwards(x_t, y_t); -// // Note: the update_up of v'' is done in the go_up method, called in fit() -// // std::cout << "Done splitting node." 
<< std::endl; -// return data_leaf; -//} +} void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { // std::cout << "Extending the range of: " << index << std::endl; @@ -570,7 +508,9 @@ void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { // std::cout << "x_t: [" << std::setprecision(2) << x_t[0] << ", " << std::setprecision(2) << x_t[1] << "]" << std::endl; // print(); uint32_t leaf = go_downwards(x_t, y_t, false); - go_upwards(leaf); + if(use_aggregation()) { + go_upwards(leaf); + } iteration++; } @@ -628,34 +568,38 @@ inline double TreeClassifier::step() const { return forest.step(); } +inline double TreeClassifier::dirichlet() const { + return forest.dirichlet(); +} + inline CriterionClassifier TreeClassifier::criterion() const { return forest.criterion(); } +inline bool TreeClassifier::use_aggregation() const { + return forest.use_aggregation(); +} + /********************************************************************************* * OnlineForestClassifier methods *********************************************************************************/ OnlineForestClassifier::OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, + uint8_t n_passes, double step, CriterionClassifier criterion, bool use_aggregation, + double subsampling, + double dirichlet, int32_t n_threads, int seed, bool verbose) - : _n_classes(n_classes), _n_trees(n_trees), _step(step), _criterion(criterion), - _use_aggregation(use_aggregation), _n_threads(n_threads), _verbose(verbose), rand(seed) { + : _n_classes(n_classes), _n_trees(n_trees), _n_passes(n_passes), _step(step), _criterion(criterion), + _use_aggregation(use_aggregation), _subsampling(subsampling), _dirichlet(dirichlet), _n_threads(n_threads), + _verbose(verbose), rand(seed) { // No iteration so far _iteration = 0; -// std::cout << "sizeof(float): " << sizeof(float) << std::endl; -// std::cout << "sizeof(double): " << sizeof(double) << std::endl; -// std::cout << "sizeof(uint8_t): " << sizeof(uint8_t) << std::endl; -// std::cout << "sizeof(uint16_t): " << sizeof(uint16_t) << std::endl; -// std::cout << "sizeof(uint32_t): " << sizeof(uint32_t) << std::endl; -// std::cout << "sizeof(long): " << sizeof(long) << std::endl; -// std::cout << "sizeof(ulong): " << sizeof(ulong) << std::endl; - create_trees(); } @@ -684,15 +628,22 @@ void OnlineForestClassifier::fit(const SArrayDouble2dPtr features, _features = features; _labels = labels; + // set_n_features(n_features); - for (uint32_t i = 0; i < n_samples; ++i) { - for (TreeClassifier &tree : trees) { - // Fit the tree online using the new data point - double label = (*labels)[i]; - check_label(label); - tree.fit(view_row(*features, i), (*labels)[i]); + + for(uint8_t pass = 0; pass < _n_passes; ++pass) { + for (uint32_t i = 0; i < n_samples; ++i) { + for (TreeClassifier &tree : trees) { + // Fit the tree online using the new data point + double label = (*labels)[i]; + check_label(label); + double U = rand.uniform(); + if (U <= _subsampling) { + tree.fit(view_row(*features, i), (*labels)[i]); + } + } + _iteration++; } - _iteration++; } // std::cout << "Done OnlineForestClassifier::fit" << std::endl; } @@ -732,6 +683,12 @@ void OnlineForestClassifier::clear() { _iteration = 0; } +void OnlineForestClassifier::print() { + for (TreeClassifier &tree: trees) { + tree.print(); + } +} + inline uint32_t OnlineForestClassifier::sample_feature() { return rand.uniform_int(static_cast(0), n_features() - 1); } @@ -760,6 +717,23 @@ inline double 
OnlineForestClassifier::sample_threshold(double left, double right return rand.uniform(left, right); } + +void OnlineForestClassifier::n_nodes(SArrayUIntPtr n_nodes_per_tree) { + uint8_t j = 0; + for (TreeClassifier& tree : trees) { + (*n_nodes_per_tree)[j] = tree.n_nodes(); + j++; + } +} + +void OnlineForestClassifier::n_leaves(SArrayUIntPtr n_leaves_per_tree) { + uint8_t j = 0; + for (TreeClassifier& tree : trees) { + (*n_leaves_per_tree)[j] = tree.n_leaves(); + j++; + } +} + //inline bool OnlineForestClassifier::verbose() const { // return _verbose; //} diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index 5e80b5fbe..cbc59c827 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -14,7 +14,11 @@ // TODO: set_feature_importances with a nullptr by default // TODO: subsample parameter, default 0.5 - +// TODO: tree aggregation +// TODO: subsampling in the columns and the rows +// TODO: memory optimization (a FeatureSplitter), maximum (sizeof(uint8_t) splits)), a set of current splits +// TODO: only binary features version ? +// TODO: enum class CriterionClassifier { log = 0, @@ -70,7 +74,7 @@ class NodeClassifier { // Computation of log( (e^a + e^b) / 2) in an overproof way inline static double log_sum_2_exp(const double a, const double b) { - // TODO if |a - b| > 50 skip + // TODO: if |a - b| > 50 skip if (a > b) { return a + std::log((1 + std::exp(b - a)) / 2); } else { @@ -102,8 +106,10 @@ class NodeClassifier { inline uint32_t n_features() const; // Number of classes inline uint8_t n_classes() const; - // Step to use for aggrgation + // Step to use for aggregation inline double step() const; + // + inline double dirichlet() const; // Print of the node void print(); @@ -127,6 +133,7 @@ class NodeClassifier { inline NodeClassifier &set_features_max(const ArrayDouble &features_max); inline uint32_t n_samples() const; inline NodeClassifier &set_n_samples(uint32_t n_samples); + inline bool use_aggregation() const; inline double weight() const; inline NodeClassifier &set_weight(double weight); inline double weight_tree() const; @@ -178,6 +185,7 @@ class TreeClassifier { inline uint32_t n_nodes() const; uint32_t n_leaves() const; inline double step() const; + inline double dirichlet() const; void print() { std::cout << "Tree(n_nodes: " << _n_nodes << std::endl; @@ -192,6 +200,7 @@ class TreeClassifier { } inline CriterionClassifier criterion() const; + inline bool use_aggregation() const; NodeClassifier &node(uint32_t index) { return nodes[index]; @@ -208,12 +217,18 @@ class OnlineForestClassifier { uint8_t _n_classes; // Number of Trees in the forest uint32_t _n_trees; + // + uint8_t _n_passes; // Step-size used for aggregation double _step; // CriterionClassifier used for splitting (not used for now) CriterionClassifier _criterion; // bool _use_aggregation; + // + double _subsampling; + // + double _dirichlet; // Number of threads to use for parallel growing of trees int32_t _n_threads; // Seed for random number generation @@ -237,10 +252,10 @@ class OnlineForestClassifier { SArrayDoublePtr _labels; public: - OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, double step = 1.0, + OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, uint8_t n_passes = 1, double step = 1.0, CriterionClassifier criterion = CriterionClassifier::log, - bool use_aggregation = true, int32_t n_threads = 1, - int seed = 0, bool verbose = false); + bool use_aggregation = true, double 
subsampling = 1, double dirichlet = 0.5, + int32_t n_threads = 1, int seed = 0, bool verbose = false); virtual ~OnlineForestClassifier(); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); @@ -261,11 +276,7 @@ class OnlineForestClassifier { return _step; } - void print() { - for (TreeClassifier &tree: trees) { - tree.print(); - } - } + void print(); inline uint32_t n_samples() const { if (_iteration > 0) { @@ -287,6 +298,10 @@ class OnlineForestClassifier { return _n_classes; } + inline bool use_aggregation() const { + return _use_aggregation; + } + OnlineForestClassifier &set_n_classes(uint8_t n_classes) { if (_iteration == 0) { _n_classes = n_classes; @@ -304,13 +319,11 @@ class OnlineForestClassifier { } else { TICK_ERROR("Wrong number of features: started to train with " + std::to_string(_n_features) + " features, but received " + std::to_string(n_features) + " afterwards"); - } } } inline void check_label(double label) const { - double iptr; double fptr = std::modf(label, &iptr); if(fptr != 0) { @@ -376,6 +389,14 @@ class OnlineForestClassifier { _feature_importances = feature_importances; } + inline double dirichlet() const { + return _dirichlet; + } + + void n_nodes(SArrayUIntPtr n_nodes_per_tree); + + void n_leaves(SArrayUIntPtr n_leaves_per_tree); + // inline bool verbose() const; // inline OnlineForestClassifier &set_verbose(bool verbose); }; diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i index d252f90ce..448d884cc 100644 --- a/tick/inference/swig/online_forest_classifier.i +++ b/tick/inference/swig/online_forest_classifier.i @@ -15,10 +15,10 @@ enum class CriterionClassifier { class OnlineForestClassifier { public: - OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, double step = 1.0, + OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, uint8_t n_passes = 1, double step = 1.0, CriterionClassifier criterion = CriterionClassifier::log, - bool use_aggregation = true, int32_t n_threads = 1, - int seed = 0, bool verbose = false); + bool use_aggregation = true, double subsampling=1, double dirichlet=0.5, + int32_t n_threads = 1, int seed = 0, bool verbose = false); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions); @@ -45,6 +45,10 @@ class OnlineForestClassifier { OnlineForestClassifier &set_criterion(CriterionClassifier criterion); int seed() const; OnlineForestClassifier &set_seed(int seed); + + void n_nodes(SArrayUIntPtr n_nodes_per_tree); + void n_leaves(SArrayUIntPtr n_leaves_per_tree); + // bool verbose() const; // OnlineForestRegressor &set_verbose(bool verbose); diff --git a/tick/inference/tests/online_forest_classifier_test.py b/tick/inference/tests/online_forest_classifier_test.py new file mode 100644 index 000000000..16d0adb7b --- /dev/null +++ b/tick/inference/tests/online_forest_classifier_test.py @@ -0,0 +1,49 @@ +# License: BSD 3 clause + +import unittest +from tick.inference.tests.inference import InferenceTest + +from sklearn.datasets import make_moons, make_classification, make_circles + +from tick.inference import OnlineForestClassifier + + +# Test + +class Test(InferenceTest): + + def test_online_forest_n_features_differs(self): + n_classes = 2 + n_trees = 20 + + X, y = make_classification(n_samples=n_samples, n_features=10, + n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + rng = np.random.RandomState(2) + X += 2 * 
rng.uniform(size=X.shape) + + of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, + step=1., + use_aggregation=True) + + of.fit(X, y) + + X, y = make_classification(n_samples=n_samples, n_features=10, + n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + + of.fit(X, y) + + X, y = make_classification(n_samples=n_samples, n_features=3, + n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + + + def test_online_forest_n_classes_differs(self): + pass + +if __name__ == "__main__": + unittest.main() From dfbfa1520522b202dd9fcafed12cc9b14201919e Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Sat, 30 Dec 2017 11:30:17 +0100 Subject: [PATCH 29/32] ... --- online_forest_selection.py | 154 ++++++++++++++++++ tick/inference/online_forest_classifier.py | 20 ++- .../src/online_forest_classifier.cpp | 21 +-- tick/inference/src/online_forest_classifier.h | 17 +- .../inference/swig/online_forest_classifier.i | 21 ++- 5 files changed, 200 insertions(+), 33 deletions(-) create mode 100644 online_forest_selection.py diff --git a/online_forest_selection.py b/online_forest_selection.py new file mode 100644 index 000000000..9783008a8 --- /dev/null +++ b/online_forest_selection.py @@ -0,0 +1,154 @@ +from tick.simulation import SimuLogReg, weights_sparse_gauss +from sklearn.model_selection import train_test_split +import numpy as np +from tick.inference import OnlineForestClassifier +from matplotlib.colors import ListedColormap + +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.datasets import make_moons, make_classification, make_circles +from sklearn.metrics import roc_auc_score +import matplotlib.pyplot as plt + +from tick.simulation import weights_sparse_exp, SimuLogReg + +np.set_printoptions(precision=2) + +# w0 = weights_sparse_gauss(n_features, nnz=2) +# X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() + + +n_samples = 10000 +n_features = 50 +n_classes = 2 + + +w0 = weights_sparse_exp(n_features, nnz=10) +X, y = SimuLogReg(weights=w0, intercept=None, n_samples=n_samples).simulate() +y = (y + 1) / 2 + +path = '/Users/stephane.gaiffas/Downloads/' + + +clf = OnlineForestClassifier(n_classes=n_classes, n_trees=50, seed=123, + step=1., use_aggregation=True) + +clf.fit(X, y) + + +# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, +# seed=123, use_aggregation=True) +# +# of.fit(X, y) + +# print("n_nodes:", of.n_nodes()) +# print("n_leaves:", of.n_leaves()) +# print(of.predict_proba(X)) +# print("step: ", of._forest.step()) + +# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=1., +# seed=123, use_aggregation=True, n_passes=1) +# +# of.fit(X, y) + +# print("n_nodes:", of.n_nodes()) +# print("n_leaves:", of.n_leaves()) +# print(of.predict_proba(X)) +# print("step: ", of._forest.step()) + + +# exit(0) + + +# X_train, X_test, y_train, y_test = \ +# train_test_split(X, y, test_size=.4, random_state=42) +# +# clf.fit(X_train, y_train) + +# clf.predict(X_test) + +# exit(0) + +# clf.print() + +X, y = make_classification(n_samples=n_samples, n_features=n_features, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) +linearly_separable = (X, y) + +datasets = [ + make_moons(n_samples=n_samples, noise=0.3, random_state=0), + make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), + 
linearly_separable +] + +n_trees = 10 + +of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, + seed=123, use_aggregation=True) +seed = 123 + + +params = [ + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.1}, + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.5}, + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 2}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.2, 'n_passes': 5}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 10}, + # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 0.1, 'n_passes': 10}, + # + # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, +] + + +def toto(kkk): + return "OF(T: " \ + + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ + + ', P: ' + str(kkk['n_passes']) + ', di: ' + str(kkk['dirichlet']) \ + + ")" + # return "OF(A: " + str(kkk['use_aggregation']) + ", T: " \ + # + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ + # + ', P: ' + str(kkk['n_passes']) + ")" + + +names = list(toto(kw) for kw in params) + ["KNN", "ET", "BRF"] + +classifiers = list( + OnlineForestClassifier(n_classes=n_classes, seed=123, step=1., **kw) + for kw in params +) + +classifiers += [ + KNeighborsClassifier(n_neighbors=5), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) +] + +# names = [ +# "OF(agg, n_passes=1)", +# "OF(agg, n_passes=5)", +# "OF(agg, n_passes=10)", +# "OF(no agg., n_passes=1)", +# "KNN (k=5)", +# "ET", +# "BRF" +# ] + +plot_decision_classification(classifiers, datasets, names) + +# plt.savefig('decisions.pdf') + +plt.show() diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py index 22c7d2c7e..2a9f02eeb 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -72,6 +72,7 @@ class OnlineForestClassifier(ABC, Base): 'verbose': {'writable': True, 'cpp_setter': 'set_verbose'}, 'warm_start': {'writable': True, 'cpp_setter': 'set_warm_start'}, 'n_splits': {'writable': True, 'cpp_setter': 'set_n_splits'}, + 'dirichlet': {'writable': True, 'cpp_setter': 'set_dirichlet'} } _cpp_obj_name = "_forest" @@ -91,6 +92,7 @@ def __init__(self, n_classes: int, n_trees: int = 10, n_passes: int = 1, self._fitted = False self.n_trees = n_trees self.n_passes = n_passes + self.n_features = None self.n_classes = n_classes self.step = step self.criterion = criterion @@ -102,12 +104,7 @@ def __init__(self, n_classes: int, n_trees: int = 10, 
n_passes: int = 1, if dirichlet is None: dirichlet = 1 / n_classes self.dirichlet = dirichlet - self._forest = _OnlineForestClassifier(n_classes, n_trees, n_passes, - step, - self._criterion, - self.use_aggregation, - subsampling, dirichlet, n_threads, - seed, verbose) + self._forest = None def set_data(self, X, y): X = safe_array(X) @@ -117,6 +114,17 @@ def set_data(self, X, y): def fit(self, X, y): X = safe_array(X) y = safe_array(y) + n_samples, n_features = X.shape + # TODO: check that sizes of X and y match + if self._forest is None: + self.n_features = n_features + _forest = _OnlineForestClassifier( + n_features, self.n_classes, self.n_trees, self.n_passes, + self.step, + self._criterion, self.use_aggregation, self.subsampling, + self.dirichlet, self.n_threads, self.seed, self.verbose + ) + self._set('_forest', _forest) self._set("_fitted", True) self._forest.fit(X, y) return self diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index f57a62dcd..9dda8b670 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -584,7 +584,8 @@ inline bool TreeClassifier::use_aggregation() const { * OnlineForestClassifier methods *********************************************************************************/ -OnlineForestClassifier::OnlineForestClassifier(uint8_t n_classes, +OnlineForestClassifier::OnlineForestClassifier(uint32_t n_features, + uint8_t n_classes, uint32_t n_trees, uint8_t n_passes, double step, @@ -595,7 +596,7 @@ OnlineForestClassifier::OnlineForestClassifier(uint8_t n_classes, int32_t n_threads, int seed, bool verbose) - : _n_classes(n_classes), _n_trees(n_trees), _n_passes(n_passes), _step(step), _criterion(criterion), + : _n_features(n_features), _n_classes(n_classes), _n_trees(n_trees), _n_passes(n_passes), _step(step), _criterion(criterion), _use_aggregation(use_aggregation), _subsampling(subsampling), _dirichlet(dirichlet), _n_threads(n_threads), _verbose(verbose), rand(seed) { // No iteration so far @@ -734,11 +735,11 @@ void OnlineForestClassifier::n_leaves(SArrayUIntPtr n_leaves_per_tree) { } } -//inline bool OnlineForestClassifier::verbose() const { -// return _verbose; -//} -// -//inline OnlineForestClassifier &OnlineForestClassifier::set_verbose(bool verbose) { -// _verbose = verbose; -// return *this; -//} +bool OnlineForestClassifier::verbose() const { + return _verbose; +} + +OnlineForestClassifier &OnlineForestClassifier::set_verbose(bool verbose) { + _verbose = verbose; + return *this; +} diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index cbc59c827..23dc96076 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -213,10 +213,12 @@ class TreeClassifier { class OnlineForestClassifier { private: + // Number of features + uint32_t _n_features; // Number of classes in the classification problem uint8_t _n_classes; // Number of Trees in the forest - uint32_t _n_trees; + uint8_t _n_trees; // uint8_t _n_passes; // Step-size used for aggregation @@ -235,8 +237,6 @@ class OnlineForestClassifier { int _seed; // Verbose things or not bool _verbose; - // Number of features. 
-  uint32_t _n_features;
   // Iteration counter
   uint32_t _iteration;
   // The list of trees in the forest
@@ -252,7 +252,7 @@ class OnlineForestClassifier {
   SArrayDoublePtr _labels;
 
  public:
-  OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, uint8_t n_passes = 1, double step = 1.0,
+  OnlineForestClassifier(uint32_t n_features, uint8_t n_classes, uint32_t n_trees, uint8_t n_passes = 1, double step = 1.0,
                          CriterionClassifier criterion = CriterionClassifier::log,
                          bool use_aggregation = true, double subsampling = 1, double dirichlet = 0.5,
                          int32_t n_threads = 1, int seed = 0, bool verbose = false);
@@ -276,6 +276,11 @@ class OnlineForestClassifier {
     return _step;
   }
 
+  inline OnlineForestClassifier& set_step(const double step) {
+    _step = step;
+    return *this;
+  }
+
   void print();
 
   inline uint32_t n_samples() const {
@@ -397,8 +402,8 @@ class OnlineForestClassifier {
 
   void n_leaves(SArrayUIntPtr n_leaves_per_tree);
 
-// inline bool verbose() const;
-// inline OnlineForestClassifier &set_verbose(bool verbose);
+  bool verbose() const;
+  OnlineForestClassifier &set_verbose(bool verbose);
 };
 
 #endif //TICK_ONLINE_FOREST_CLASSIFIER_H
diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i
index 448d884cc..267489e8a 100644
--- a/tick/inference/swig/online_forest_classifier.i
+++ b/tick/inference/swig/online_forest_classifier.i
@@ -15,42 +15,41 @@ enum class CriterionClassifier {
 class OnlineForestClassifier {
  public:
-  OnlineForestClassifier(uint8_t n_classes, uint32_t n_trees, uint8_t n_passes = 1, double step = 1.0,
+  OnlineForestClassifier(uint32_t n_features, uint8_t n_classes, uint32_t n_trees,
+                         uint8_t n_passes = 1, double step = 1.0,
                          CriterionClassifier criterion = CriterionClassifier::log,
                          bool use_aggregation = true, double subsampling=1, double dirichlet=0.5,
                          int32_t n_threads = 1, int seed = 0, bool verbose = false);
 
   void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels);
   void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions);
-  void clear();
-
-  inline double step() const;
 
   void print();
 
-  ulong n_samples() const;
-  ulong n_features() const;
+  uint32_t n_samples() const;
+  uint32_t n_features() const;
   uint8_t n_classes() const;
-  OnlineForestClassifier & set_n_classes(uint8_t n_classes);
-  ulong n_classes() const;
-  // OnlineForestClassifier &set_n_features(ulong n_features);
+  inline double step() const;
+  inline OnlineForestClassifier& set_step(const double step);
 
   uint32_t n_trees() const;
   OnlineForestClassifier &set_n_trees(uint32_t n_trees);
 
   int32_t n_threads() const;
   OnlineForestClassifier &set_n_threads(int32_t n_threads);
+
   CriterionClassifier criterion() const;
   OnlineForestClassifier &set_criterion(CriterionClassifier criterion);
+
   int seed() const;
   OnlineForestClassifier &set_seed(int seed);
 
   void n_nodes(SArrayUIntPtr n_nodes_per_tree);
   void n_leaves(SArrayUIntPtr n_leaves_per_tree);
 
-  // bool verbose() const;
-  // OnlineForestRegressor &set_verbose(bool verbose);
+  bool verbose() const;
+  OnlineForestClassifier &set_verbose(bool verbose);
 
   void set_feature_importances(const ArrayDouble &feature_importances);
 };

From 2ae00cc268ae0a4980762101c191c120075cfb1b Mon Sep 17 00:00:00 2001
From: Stephane Gaiffas
Date: Sat, 30 Dec 2017 11:32:47 +0100
Subject: [PATCH 30/32] ...
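This patch only moves the inline accessor definitions out of the header and into the .cpp file. The guard on the fitted-only accessors is unchanged; in Python terms the pattern is simply (a sketch of the idea, not the tick API):

    class GuardedAccessors:
        def __init__(self):
            self._iteration = 0  # incremented once per sample seen by fit()

        @property
        def n_samples(self):
            # Mirrors the TICK_ERROR guard in the diff below: the attribute
            # is only meaningful once fit() has seen at least one sample
            if self._iteration == 0:
                raise ValueError(
                    "You must call ``fit`` before asking for ``n_samples``.")
            return self._iteration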
---
 .../src/online_forest_classifier.cpp          | 31 +++++++++++++++++++
 tick/inference/src/online_forest_classifier.h | 31 +++----------------
 2 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp
index 9dda8b670..b29cbe8d9 100644
--- a/tick/inference/src/online_forest_classifier.cpp
+++ b/tick/inference/src/online_forest_classifier.cpp
@@ -743,3 +743,34 @@ OnlineForestClassifier &OnlineForestClassifier::set_verbose(bool verbose) {
   _verbose = verbose;
   return *this;
 }
+
+double OnlineForestClassifier::step() const {
+  return _step;
+}
+
+OnlineForestClassifier &OnlineForestClassifier::set_step(const double step) {
+  _step = step;
+  return *this;
+}
+
+uint32_t OnlineForestClassifier::n_samples() const {
+  if (_iteration > 0) {
+    return _iteration;
+  } else {
+    TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.")
+  }
+}
+
+uint32_t OnlineForestClassifier::n_features() const {
+  if (_iteration > 0) {
+    return _n_features;
+  } else {
+    TICK_ERROR("You must call ``fit`` before asking for ``n_features``.")
+  }
+}
+
+uint8_t OnlineForestClassifier::n_classes() const {
+  return _n_classes;
+}
diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h
index 23dc96076..73f57721f 100644
--- a/tick/inference/src/online_forest_classifier.h
+++ b/tick/inference/src/online_forest_classifier.h
@@ -272,36 +272,15 @@ class OnlineForestClassifier {
 
   void clear();
 
-  inline double step() const {
-    return _step;
-  }
-
-  inline OnlineForestClassifier& set_step(const double step) {
-    _step = step;
-    return *this;
-  }
+  double step() const;
+  OnlineForestClassifier &set_step(const double step);
 
   void print();
 
-  inline uint32_t n_samples() const {
-    if (_iteration > 0) {
-      return _iteration;
-    } else {
-      TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.")
-    }
-  }
-
-  inline uint32_t n_features() const {
-    if (_iteration > 0) {
-      return _n_features;
-    } else {
-      TICK_ERROR("You must call ``fit`` before asking for ``n_features``.")
-    }
-  }
+  uint32_t n_samples() const;
+  uint32_t n_features() const;
 
-  inline uint8_t n_classes() const {
-    return _n_classes;
-  }
+  uint8_t n_classes() const;
 
   inline bool use_aggregation() const {
     return _use_aggregation;

From 0eaabdf769045fff59d1dbe09e0bad04aa0dbf0a Mon Sep 17 00:00:00 2001
From: Stephane Gaiffas
Date: Tue, 2 Jan 2018 18:02:25 +0100
Subject: [PATCH 31/32] ...
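Much of this diff is whitespace and formatting; the logic worth calling out is TreeClassifier::extend_range below, which measures how far a new sample falls outside a node's bounding box. In numpy terms the per-coordinate computation is (a sketch, assuming features_min and features_max hold the node's current range):

    import numpy as np


    def range_extension(x_t, features_min, features_max):
        x_t = np.asarray(x_t, dtype=float)
        features_min = np.asarray(features_min, dtype=float)
        features_max = np.asarray(features_max, dtype=float)
        # Per-coordinate overshoot of x_t outside [features_min, features_max];
        # zero whenever the coordinate already lies inside the node's range
        extension = np.maximum(features_min - x_t, 0.) \
            + np.maximum(x_t - features_max, 0.)
        return extension, extension.sum()

When the sum is zero the sample lies inside the node's range and no split can be triggered, matching the `if (extensions_sum > 0)` branch guarding the split decision in the C++ code.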
--- .../src/online_forest_classifier.cpp | 202 +++++++++++++----- tick/inference/src/online_forest_classifier.h | 152 +++---------- .../inference/swig/online_forest_classifier.i | 1 - .../tests/online_forest_classifier_test.py | 1 + 4 files changed, 180 insertions(+), 176 deletions(-) diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp index b29cbe8d9..1e2fe4844 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -72,7 +72,7 @@ NodeClassifier &NodeClassifier::operator=(const NodeClassifier &node) { void NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) { _n_samples++; - if(use_aggregation()) { + if (use_aggregation()) { _weight -= step() * loss(y_t); } update_predict(y_t); @@ -110,7 +110,7 @@ void NodeClassifier::update_range(const ArrayDouble &x_t) { _features_min = x_t; _features_max = x_t; } else { - for(uint32_t j = 0; j < n_features(); ++j) { + for (uint32_t j = 0; j < n_features(); ++j) { double x_tj = x_t[j]; if (x_tj < _features_min[j]) { _features_min[j] = x_tj; @@ -127,8 +127,8 @@ double NodeClassifier::score(uint8_t c) const { return (_counts[c] + dirichlet()) / (_n_samples + dirichlet() * n_classes()); } -inline void NodeClassifier::predict(ArrayDouble& scores) const { - for (uint8_t c=0; c < n_classes(); ++c) { +inline void NodeClassifier::predict(ArrayDouble &scores) const { + for (uint8_t c = 0; c < n_classes(); ++c) { scores[c] = score(c); } } @@ -147,7 +147,7 @@ uint32_t NodeClassifier::n_features() const { return _tree.n_features(); } -uint8_t NodeClassifier::n_classes() const { +uint8_t NodeClassifier::n_classes() const { return _tree.n_classes(); } @@ -163,7 +163,7 @@ inline uint32_t NodeClassifier::parent() const { return _parent; } -inline NodeClassifier& NodeClassifier::set_parent(uint32_t parent) { +inline NodeClassifier &NodeClassifier::set_parent(uint32_t parent) { _parent = parent; return *this; } @@ -226,7 +226,7 @@ inline double NodeClassifier::features_min(const uint32_t j) const { return _features_min[j]; } -inline NodeClassifier & NodeClassifier::set_features_min(const ArrayDouble &features_min) { +inline NodeClassifier &NodeClassifier::set_features_min(const ArrayDouble &features_min) { _features_min = features_min; return *this; } @@ -235,7 +235,7 @@ inline double NodeClassifier::features_max(const uint32_t j) const { return _features_max[j]; } -inline NodeClassifier & NodeClassifier::set_features_max(const ArrayDouble &features_max) { +inline NodeClassifier &NodeClassifier::set_features_max(const ArrayDouble &features_max) { _features_max = features_max; return *this; } @@ -291,23 +291,25 @@ inline NodeClassifier &NodeClassifier::set_y_t(const double y_t) { void NodeClassifier::print() { std::cout << "Node(parent: " << _parent - << ", left: " << _left - << ", right: " << _right - << ", time: " << std::setprecision(2) << _time - << ", n_samples: " << _n_samples - << ", is_leaf: " << _is_leaf - << ", feature: " << _feature - << ", thresh: " << _threshold - << ", scores: [" << std::setprecision(2) << score(0) << ", " << std::setprecision(2) << score(1) << "]" - << ", counts: [" << std::setprecision(2) << _counts[0] << ", " << std::setprecision(2) << _counts[1] << "]"; - if (_n_samples > 0) { - std::cout << ", min: [" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) << _features_min[1] << "]" - << ", max: [" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) << 
_features_max[1] << "]"; + << ", left: " << _left + << ", right: " << _right + << ", time: " << std::setprecision(2) << _time + << ", n_samples: " << _n_samples + << ", is_leaf: " << _is_leaf + << ", feature: " << _feature + << ", thresh: " << _threshold + << ", scores: [" << std::setprecision(2) << score(0) << ", " << std::setprecision(2) << score(1) << "]" + << ", counts: [" << std::setprecision(2) << _counts[0] << ", " << std::setprecision(2) << _counts[1] << "]"; + if (_n_samples > 0) { + std::cout << ", min: [" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) + << _features_min[1] << "]" + << ", max: [" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) + << _features_max[1] << "]"; - } - std::cout << ", weight: " << _weight - << ", weight_tree: " << _weight_tree - << ")\n"; + } + std::cout << ", weight: " << _weight + << ", weight_tree: " << _weight_tree + << ")\n"; } /********************************************************************************* @@ -329,7 +331,7 @@ TreeClassifier::TreeClassifier(OnlineForestClassifier &forest) : forest(forest) void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { // std::cout << "Extending the range of: " << index << std::endl; NodeClassifier &current_node = node(node_index); - if(current_node.n_samples() == 0) { + if (current_node.n_samples() == 0) { // The node is a leaf with no sample point, so it does not have a range // In this case we just initialize the range with the given feature // This node will then be updated by the call to update_downwards in go_downwards @@ -339,11 +341,11 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c // std::cout << "Computing extension" << std::endl; ArrayDouble extension(n_features()); double extensions_sum = 0; - for(uint32_t j =0; j < n_features(); ++j) { + for (uint32_t j = 0; j < n_features(); ++j) { double x_tj = x_t[j]; double feature_min_j = current_node.features_min(j); double feature_max_j = current_node.features_max(j); - if(x_tj < feature_min_j) { + if (x_tj < feature_min_j) { extension[j] = feature_min_j - x_tj; extensions_sum += feature_min_j - x_tj; } else { @@ -360,7 +362,7 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c // std::cout << "... Done computing extension."
<< std::endl; // If the sample x_t extends the current range of the node - if(extensions_sum > 0) { + if (extensions_sum > 0) { // std::cout << "Extension non-zero, considering the possibility of a split" << std::endl; bool do_split; double time = current_node.time(); @@ -396,7 +398,7 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c // Create new nodes uint32_t left_new = add_node(node_index, time + T); uint32_t right_new = add_node(node_index, time + T); - if(is_right_extension) { + if (is_right_extension) { // std::cout << "extension is on the right" << std::endl; threshold = forest.sample_threshold(node(node_index).features_max(feature), x_t[feature]); // std::cout << "sample inside the extension the threshold: " << threshold << std::endl; @@ -411,7 +413,7 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c // std::cout << "Let's update the right child" << std::endl; node(right_new).set_parent(node_index).set_time(time + T); // We must tell the old children that they have a new parent, if the current node is not a leaf - if(!node(node_index).is_leaf()) { + if (!node(node_index).is_leaf()) { // std::cout << "The current node is not a leaf, so let's not forget to update the old children" << std::endl; node(node(node_index).left()).set_parent(left_new); node(node(node_index).right()).set_parent(left_new); @@ -423,7 +425,7 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c node(right_new) = node(node_index); node(right_new).set_parent(node_index).set_time(time + T); node(left_new).set_parent(node_index).set_time(time + T); - if(!node(node_index).is_leaf()) { + if (!node(node_index).is_leaf()) { node(node(node_index).left()).set_parent(right_new); node(node(node_index).right()).set_parent(right_new); } @@ -439,7 +441,6 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c // std::cout << "...Done extending the range." << std::endl; } - uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { // Find the leaf that contains the sample // Start at the root.
Index of the root is always 0 @@ -493,14 +494,26 @@ inline uint32_t TreeClassifier::n_nodes() const { uint32_t TreeClassifier::n_leaves() const { uint32_t n_leaves = 0; - for(const NodeClassifier &node: nodes) { - if(node.is_leaf()) { + for (const NodeClassifier &node: nodes) { + if (node.is_leaf()) { ++n_leaves; } } return n_leaves; } +void TreeClassifier::print() { + std::cout << "Tree(n_nodes: " << _n_nodes << std::endl; + std::cout << " "; + uint32_t index = 0; + for (NodeClassifier &node : nodes) { + std::cout << "index: " << index << " "; + node.print(); + index++; + } + std::cout << ")" << std::endl; +} + void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { // TODO: Test that the size does not change within successive calls to fit // std::cout << "------------------------------------------" << std::endl; // std::cout << "iteration: " << iteration << std::endl; // std::cout << "x_t: [" << std::setprecision(2) << x_t[0] << ", " << std::setprecision(2) << x_t[1] << "]" << std::endl; // print(); uint32_t leaf = go_downwards(x_t, y_t, false); - if(use_aggregation()) { + if (use_aggregation()) { go_upwards(leaf); } iteration++; } -void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool use_aggregation) { +void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble &scores, bool use_aggregation) { // std::cout << "TreeClassifier::predict" << std::endl; uint32_t leaf = go_downwards(x_t, 0., true); - if(!use_aggregation) { + if (!use_aggregation) { // std::cout << "Not using aggregation so using only the leaf's prediction" << std::endl; node(leaf).predict(scores); return; @@ -538,7 +551,7 @@ void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble& scores, bool u double w = std::exp(current_node.weight() - current_node.weight_tree()); // Get the predictions of the current node current_node.predict(pred_new); - for(uint8_t c = 0; c < n_classes(); ++c) { + for (uint8_t c = 0; c < n_classes(); ++c) { scores[c] = 0.5 * w * pred_new[c] + (1 - 0.5 * w) * scores[c]; } } @@ -586,7 +599,7 @@ inline bool TreeClassifier::use_aggregation() const { OnlineForestClassifier::OnlineForestClassifier(uint32_t n_features, uint8_t n_classes, - uint32_t n_trees, + uint8_t n_trees, uint8_t n_passes, double step, CriterionClassifier criterion, @@ -596,9 +609,18 @@ OnlineForestClassifier::OnlineForestClassifier(uint32_t n_features, int32_t n_threads, int seed, bool verbose) - : _n_features(n_features), _n_classes(n_classes), _n_trees(n_trees), _n_passes(n_passes), _step(step), _criterion(criterion), - _use_aggregation(use_aggregation), _subsampling(subsampling), _dirichlet(dirichlet), _n_threads(n_threads), - _verbose(verbose), rand(seed) { + : _n_features(n_features), + _n_classes(n_classes), + _n_trees(n_trees), + _n_passes(n_passes), + _step(step), + _criterion(criterion), + _use_aggregation(use_aggregation), + _subsampling(subsampling), + _dirichlet(dirichlet), + _n_threads(n_threads), + _verbose(verbose), + rand(seed) { // No iteration so far _iteration = 0; create_trees(); @@ -617,22 +639,22 @@ void OnlineForestClassifier::create_trees() { } void OnlineForestClassifier::fit(const SArrayDouble2dPtr features, - const SArrayDoublePtr labels) { + const SArrayDoublePtr labels) { uint32_t n_samples = static_cast<uint32_t>(features->n_rows()); uint32_t n_features = static_cast<uint32_t>(features->n_cols()); - if(_iteration == 0) { + if (_iteration == 0) { _n_features = n_features; } else { check_n_features(n_features, false); } + // TODO: remove this _features =
features; _labels = labels; - // set_n_features(n_features); - for(uint8_t pass = 0; pass < _n_passes; ++pass) { + for (uint8_t pass = 0; pass < _n_passes; ++pass) { for (uint32_t i = 0; i < n_samples; ++i) { for (TreeClassifier &tree : trees) { // Fit the tree online using the new data point @@ -675,8 +697,8 @@ void OnlineForestClassifier::predict(const SArrayDouble2dPtr features, scores_i /= _n_trees; } } else { - TICK_ERROR("You must call ``fit`` before ``predict``.") -} + TICK_ERROR("You must call ``fit`` before ``predict``.") + } } void OnlineForestClassifier::clear() { @@ -702,7 +724,7 @@ inline double OnlineForestClassifier::sample_exponential(double intensity) { return rand.exponential(intensity); } -inline uint32_t OnlineForestClassifier::sample_feature(const ArrayDouble & prob) { +inline uint32_t OnlineForestClassifier::sample_feature(const ArrayDouble &prob) { // ArrayDouble my_prob = prob; // for(uint32_t j = 0; j < n_features(); ++j) { // // my_prob[j] *= _feature_importances[j]; // } // my_prob /= my_prob.sum(); @@ -718,10 +740,9 @@ inline double OnlineForestClassifier::sample_threshold(double left, double right return rand.uniform(left, right); } - void OnlineForestClassifier::n_nodes(SArrayUIntPtr n_nodes_per_tree) { uint8_t j = 0; - for (TreeClassifier& tree : trees) { + for (TreeClassifier &tree : trees) { (*n_nodes_per_tree)[j] = tree.n_nodes(); j++; } @@ -729,7 +750,7 @@ void OnlineForestClassifier::n_nodes(SArrayUIntPtr n_nodes_per_tree) { void OnlineForestClassifier::n_leaves(SArrayUIntPtr n_leaves_per_tree) { uint8_t j = 0; - for (TreeClassifier& tree : trees) { + for (TreeClassifier &tree : trees) { (*n_leaves_per_tree)[j] = tree.n_leaves(); j++; } @@ -744,17 +765,19 @@ OnlineForestClassifier &OnlineForestClassifier::set_verbose(bool verbose) { return *this; } +bool OnlineForestClassifier::use_aggregation() const { + return _use_aggregation; +} double OnlineForestClassifier::step() const { return _step; } -OnlineForestClassifier& OnlineForestClassifier::set_step(const double step) { +OnlineForestClassifier &OnlineForestClassifier::set_step(const double step) { _step = step; return *this; } - uint32_t OnlineForestClassifier::n_samples() const { if (_iteration > 0) { return _iteration; @@ -763,7 +786,7 @@ uint32_t OnlineForestClassifier::n_samples() const { } } -inline uint32_t OnlineForestClassifier::n_features() const { +uint32_t OnlineForestClassifier::n_features() const { if (_iteration > 0) { return _n_features; } else { @@ -771,6 +794,75 @@ } } +void OnlineForestClassifier::check_n_features(uint32_t n_features, bool predict) const { + if (n_features != _n_features) { + if (predict) { + TICK_ERROR("Wrong number of features: trained with " + std::to_string(_n_features) + + " features, but received " + std::to_string(n_features) + " features for prediction"); + } else { + TICK_ERROR("Wrong number of features: started to train with " + std::to_string(_n_features) + + " features, but received " + std::to_string(n_features) + " afterwards"); + } + } +} + +void OnlineForestClassifier::check_label(double label) const { + double iptr; + double fptr = std::modf(label, &iptr); + if (fptr != 0) { + TICK_ERROR("Wrong label type: received " + std::to_string(label) + " for a classification problem"); + } + if ((label < 0) || (label >= _n_classes)) { + TICK_ERROR("Wrong label value: received " + std::to_string(label) + " while training for classification with " + + std::to_string(_n_classes) + " classes."); + } +} + uint8_t
OnlineForestClassifier::n_classes() const { return _n_classes; } + +uint8_t OnlineForestClassifier::n_trees() const { + return _n_trees; +} + +int32_t OnlineForestClassifier::n_threads() const { + return _n_threads; +} + +CriterionClassifier OnlineForestClassifier::criterion() const { + return _criterion; +} + +int OnlineForestClassifier::seed() const { + return _seed; +} + +OnlineForestClassifier &OnlineForestClassifier::set_seed(int seed) { + _seed = seed; + rand.reseed(seed); + return *this; +} + +OnlineForestClassifier &OnlineForestClassifier::set_n_threads(int32_t n_threads) { + _n_threads = n_threads; + return *this; +} + +OnlineForestClassifier &OnlineForestClassifier::set_criterion(CriterionClassifier criterion) { + _criterion = criterion; + return *this; +} + +void OnlineForestClassifier::set_feature_importances(const ArrayDouble &feature_importances) { + _feature_importances = feature_importances; +} + +double OnlineForestClassifier::dirichlet() const { + return _dirichlet; +} + +OnlineForestClassifier &OnlineForestClassifier::set_dirichlet(const double dirichlet) { + _dirichlet = dirichlet; + return *this; +} diff --git a/tick/inference/src/online_forest_classifier.h index 73f57721f..df6439f7d 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -187,17 +187,7 @@ class TreeClassifier { inline double step() const; inline double dirichlet() const; - void print() { - std::cout << "Tree(n_nodes: " << _n_nodes << std::endl; - std::cout << " "; - uint32_t index = 0; - for (NodeClassifier &node : nodes) { - std::cout << "index: " << index << " "; - node.print(); - index++; - } - std::cout << ")" << std::endl; - } + void print(); inline CriterionClassifier criterion() const; inline bool use_aggregation() const; @@ -219,7 +209,7 @@ class OnlineForestClassifier { uint8_t _n_classes; // Number of Trees in the forest uint8_t _n_trees; - // + // Number of passes over each given batch of data uint8_t _n_passes; // Step-size used for aggregation double _step; @@ -251,11 +241,22 @@ class OnlineForestClassifier { SArrayDouble2dPtr _features; SArrayDoublePtr _labels; + void check_n_features(uint32_t n_features, bool predict) const; + inline void check_label(double label) const; + public: - OnlineForestClassifier(uint32_t n_features, uint8_t n_classes, uint32_t n_trees, uint8_t n_passes = 1, double step = 1.0, + OnlineForestClassifier(uint32_t n_features, + uint8_t n_classes, + uint8_t n_trees, + uint8_t n_passes = 1, + double step = 1.0, CriterionClassifier criterion = CriterionClassifier::log, - bool use_aggregation = true, double subsampling = 1, double dirichlet = 0.5, - int32_t n_threads = 1, int seed = 0, bool verbose = false); + bool use_aggregation = true, + double subsampling = 1, + double dirichlet = 0.5, + int32_t n_threads = 1, + int seed = 0, + bool verbose = false); virtual ~OnlineForestClassifier(); void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); @@ -263,126 +264,37 @@ class OnlineForestClassifier { inline uint32_t sample_feature(); inline uint32_t sample_feature(const ArrayDouble &prob); - inline uint32_t sample_feature_bis(); - inline double sample_exponential(double intensity); - inline double sample_threshold(double left, double right); void clear(); - - double step() const; - OnlineForestClassifier& set_step(const double step); - void print(); uint32_t n_samples() const; uint32_t n_features() const; - uint8_t n_classes() const; - inline bool
use_aggregation() const { - return _use_aggregation; - } - - OnlineForestClassifier &set_n_classes(uint8_t n_classes) { - if (_iteration == 0) { - _n_classes = n_classes; - } else { - TICK_ERROR("OnlineForest::set_n_classes can be called only once !") - } - return *this; - } - - inline void check_n_features(uint32_t n_features, bool predict) const { - if (n_features != _n_features) { - if(predict) { - TICK_ERROR("Wrong number of features: trained with " + std::to_string(_n_features) - + " features, but received " + std::to_string(n_features) + " features for prediction"); - } else { - TICK_ERROR("Wrong number of features: started to train with " + std::to_string(_n_features) - + " features, but received " + std::to_string(n_features) + " afterwards"); - } - } - } - - inline void check_label(double label) const { - double iptr; - double fptr = std::modf(label, &iptr); - if(fptr != 0) { - TICK_ERROR("Wrong label type: received " + std::to_string(label) + " for a classification problem"); - } - if ((label < 0) || (label >= _n_classes) ) { - TICK_ERROR("Wrong label value: received " + std::to_string(label) + " while training for classification with " - + std::to_string(_n_classes) + " classes."); - } - } -/* - inline OnlineForestClassifier &set_n_features(uint32_t n_features) { - if (_iteration == 0) { - - } else { - if (n_features != _n_features) { - TICK_ERROR("Wrong number of features: started to train with " + std::to_string(_n_features) - + " features, but received " + std::to_string(n_features) + " afterwards"); - } - } - return *this; - } -*/ - - inline uint32_t n_trees() const { - return _n_trees; - } - - inline OnlineForestClassifier &set_n_trees(uint32_t n_trees) { - _n_trees = n_trees; - return *this; - } - - inline int32_t n_threads() const { - return _n_threads; - } - - inline CriterionClassifier criterion() const { - return _criterion; - } - - inline int seed() const { - return _seed; - } - - inline OnlineForestClassifier &set_seed(int seed) { - _seed = seed; - rand.reseed(seed); - return *this; - } - - OnlineForestClassifier &set_n_threads(int32_t n_threads) { - _n_threads = n_threads; - return *this; - } - - inline OnlineForestClassifier &set_criterion(CriterionClassifier criterion) { - _criterion = criterion; - return *this; - } - - inline void set_feature_importances(const ArrayDouble &feature_importances) { - _feature_importances = feature_importances; - } - - inline double dirichlet() const { - return _dirichlet; - } + uint8_t n_trees() const; + bool use_aggregation() const; + double step() const; + OnlineForestClassifier &set_step(const double step); + double dirichlet() const; + OnlineForestClassifier &set_dirichlet(const double dirichlet); + bool verbose() const; + OnlineForestClassifier &set_verbose(bool verbose); + CriterionClassifier criterion() const; + OnlineForestClassifier &set_criterion(CriterionClassifier criterion); + int32_t n_threads() const; + OnlineForestClassifier &set_n_threads(int32_t n_threads); + int seed() const; + OnlineForestClassifier &set_seed(int seed); void n_nodes(SArrayUIntPtr n_nodes_per_tree); - void n_leaves(SArrayUIntPtr n_leaves_per_tree); - bool verbose() const; - OnlineForestClassifier &set_verbose(bool verbose); + void n_leaves(SArrayUIntPtr n_leaves_per_tree); + void set_feature_importances(const ArrayDouble &feature_importances); + }; #endif //TICK_ONLINE_FOREST_CLASSIFIER_H diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i index 267489e8a..6d5d2172f 100644 --- a/tick/inference/swig/online_forest_classifier.i +++
b/tick/inference/swig/online_forest_classifier.i @@ -34,7 +34,6 @@ class OnlineForestClassifier { inline OnlineForestClassifier& set_step(const double step); uint32_t n_trees() const; - OnlineForestClassifier &set_n_trees(uint32_t n_trees); int32_t n_threads() const; OnlineForestClassifier &set_n_threads(int32_t n_threads); diff --git a/tick/inference/tests/online_forest_classifier_test.py b/tick/inference/tests/online_forest_classifier_test.py index 16d0adb7b..9f36faa34 100644 --- a/tick/inference/tests/online_forest_classifier_test.py +++ b/tick/inference/tests/online_forest_classifier_test.py @@ -13,6 +13,7 @@ class Test(InferenceTest): def test_online_forest_n_features_differs(self): + n_samples = 1000 n_classes = 2 n_trees = 20 From da70e9070a9f731e871e7c2b058e6078203756d7 Mon Sep 17 00:00:00 2001 From: Stephane Gaiffas Date: Fri, 12 Jan 2018 21:28:23 +0100 Subject: [PATCH 32/32] .. --- online_forest_selection.py | 161 ++++-------------- tick/inference/online_forest_classifier.py | 6 + .../src/online_forest_classifier.cpp | 68 ++++---- tick/inference/src/online_forest_classifier.h | 17 +- .../inference/swig/online_forest_classifier.i | 5 +- 5 files changed, 92 insertions(+), 165 deletions(-) diff --git a/online_forest_selection.py b/online_forest_selection.py index 9783008a8..3ab4ab6d4 100644 --- a/online_forest_selection.py +++ b/online_forest_selection.py @@ -1,154 +1,53 @@ -from tick.simulation import SimuLogReg, weights_sparse_gauss + from sklearn.model_selection import train_test_split +from sklearn.metrics import roc_auc_score import numpy as np from tick.inference import OnlineForestClassifier -from matplotlib.colors import ListedColormap - -from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier -from sklearn.neighbors import KNeighborsClassifier -from sklearn.datasets import make_moons, make_classification, make_circles -from sklearn.metrics import roc_auc_score -import matplotlib.pyplot as plt - from tick.simulation import weights_sparse_exp, SimuLogReg +from sklearn.ensemble import RandomForestClassifier np.set_printoptions(precision=2) -# w0 = weights_sparse_gauss(n_features, nnz=2) -# X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() - - -n_samples = 10000 -n_features = 50 +n_samples = 30000 +n_features = 30 n_classes = 2 +nnz = 5 +w0 = np.zeros(n_features) +w0[:nnz] = 1 -w0 = weights_sparse_exp(n_features, nnz=10) -X, y = SimuLogReg(weights=w0, intercept=None, n_samples=n_samples).simulate() -y = (y + 1) / 2 +# w0 = weights_sparse_exp(n_features, nnz=nnz) -path = '/Users/stephane.gaiffas/Downloads/' +X, y = SimuLogReg(weights=w0, intercept=None, n_samples=n_samples, + cov_corr=0.1).simulate() +y = (y + 1) / 2 +X_train, X_test, y_train, y_test = train_test_split(X, y) -clf = OnlineForestClassifier(n_classes=n_classes, n_trees=50, seed=123, +rf = RandomForestClassifier(n_estimators=10, criterion="entropy") +of = OnlineForestClassifier(n_classes=n_classes, n_trees=10, seed=123, step=1., use_aggregation=True) -clf.fit(X, y) - - -# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, -# seed=123, use_aggregation=True) -# -# of.fit(X, y) - -# print("n_nodes:", of.n_nodes()) -# print("n_leaves:", of.n_leaves()) -# print(of.predict_proba(X)) -# print("step: ", of._forest.step()) - -# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=1., -# seed=123, use_aggregation=True, n_passes=1) -# -# of.fit(X, y) - -# print("n_nodes:", of.n_nodes()) -# print("n_leaves:", of.n_leaves()) -#
print(of.predict_proba(X)) -# print("step: ", of._forest.step()) + +print(X_train.shape, y_train.shape, X_test.shape, y_test.shape) -# exit(0) +rf.fit(X_train, y_train) +of.fit(X_train, y_train) -# X_train, X_test, y_train, y_test = \ -# train_test_split(X, y, test_size=.4, random_state=42) -# -# clf.fit(X_train, y_train) +y_pred_rf = rf.predict_proba(X_test)[:, 1] +y_pred_of = of.predict_proba(X_test)[:, 1] -# clf.predict(X_test) -# exit(0) - -# clf.print() +print("AUC rf=", roc_auc_score(y_test, y_pred_rf)) +print("AUC of=", roc_auc_score(y_test, y_pred_of)) -X, y = make_classification(n_samples=n_samples, n_features=n_features, n_redundant=0, - n_informative=2, random_state=1, - n_clusters_per_class=1) -rng = np.random.RandomState(2) -X += 2 * rng.uniform(size=X.shape) -linearly_separable = (X, y) - -datasets = [ - make_moons(n_samples=n_samples, noise=0.3, random_state=0), - make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), - linearly_separable -] - -n_trees = 10 - -of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, - seed=123, use_aggregation=True) -seed = 123 - - -params = [ - {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.1}, - {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.5}, - {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 2}, - # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1, 'n_passes': 1}, - # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.2, 'n_passes': 5}, - # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 10}, - # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, - # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 0.1, 'n_passes': 10}, - # - # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, - # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, - # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, - # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, - # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, - # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 20}, - # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, - # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, - # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, - # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, -] - - -def toto(kkk): - return "OF(T: " \ - + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ - + ', P: ' + str(kkk['n_passes']) + ', di: ' + str(kkk['dirichlet']) \ - + ")" - # return "OF(A: " + str(kkk['use_aggregation']) + ", T: " \ - # + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ - # + ', P: ' + str(kkk['n_passes']) + ")" - - -names = list(toto(kw) for kw in params) + ["KNN", "ET", "BRF"] - -classifiers = list( - OnlineForestClassifier(n_classes=n_classes, seed=123, step=1., **kw) - for kw in params -) +import matplotlib.pyplot as plt -classifiers += [ - KNeighborsClassifier(n_neighbors=5), - ExtraTreesClassifier(n_estimators=n_trees), - RandomForestClassifier(n_estimators=n_trees) -] - -# names = [ -# "OF(agg, n_passes=1)", -# "OF(agg, n_passes=5)", -# "OF(agg, n_passes=10)", -# "OF(no agg., n_passes=1)", -# "KNN (k=5)", -# "ET", -# "BRF" -# ] -
-plot_decision_classification(classifiers, datasets, names) - -# plt.savefig('decisions.pdf') +plt.subplot(1, 3, 1) +plt.stem(rf.feature_importances_) +plt.subplot(1, 3, 2) +plt.stem(of.feature_importances) +plt.subplot(1, 3, 3) +plt.stem(w0) plt.show() +# print(clf.feature_importances) \ No newline at end of file diff --git a/tick/inference/online_forest_classifier.py index 2a9f02eeb..25b9be29f 100644 --- a/tick/inference/online_forest_classifier.py +++ b/tick/inference/online_forest_classifier.py @@ -200,3 +200,9 @@ def criterion(self, value): def set_feature_importances(self, feature_importances): self._forest.set_feature_importances(feature_importances) + + @property + def feature_importances(self): + feature_importances = np.empty(self.n_features) + self._forest.get_feature_importances(feature_importances) + return feature_importances diff --git a/tick/inference/src/online_forest_classifier.cpp index 1e2fe4844..b75a5d497 100644 --- a/tick/inference/src/online_forest_classifier.cpp +++ b/tick/inference/src/online_forest_classifier.cpp @@ -70,12 +70,16 @@ NodeClassifier &NodeClassifier::operator=(const NodeClassifier &node) { return *this; } -void NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) { +double NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) { _n_samples++; + // double loss_t = loss(y_t); if (use_aggregation()) { _weight -= step() * loss(y_t); } update_predict(y_t); + // We return the loss before updating the predictor of the node in order to + // update the feature importance in TreeClassifier::go_downwards + return loss(y_t); } bool NodeClassifier::is_same(const ArrayDouble &x_t) { @@ -325,7 +329,8 @@ TreeClassifier::TreeClassifier(const TreeClassifier &&tree) TreeClassifier::TreeClassifier(OnlineForestClassifier &forest) : forest(forest) { // TODO: pre-allocate the vector to make things faster ? add_node(0); - + feature_importances_ = ArrayDouble(forest.n_features()); + feature_importances_.fill(1.); } void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { // std::cout << "Extending the range of: " << index << std::endl; NodeClassifier &current_node = node(node_index); if (current_node.n_samples() == 0) { // The node is a leaf with no sample point, so it does not have a range - // In this case we just initialize the range with the given feature + // In this case we just initialize the range with the given feature. // This node will then be updated by the call to update_downwards in go_downwards current_node.set_features_min(x_t); current_node.set_features_max(x_t); @@ -442,21 +447,22 @@ void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, c } uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { - // Find the leaf that contains the sample - // Start at the root. Index of the root is always 0 - // If predict == true, this call to find_leaf is for - // prediction only, so that no leaf update and splits can be done -// std::cout << "Going downwards" << std::endl; + // Find the leaf that contains the sample. Start at the root. Index of the root is always 0. + // If predict == true, this is for prediction only, so no leaf update and splits can be done.
uint32_t index_current_node = 0; bool is_leaf = false; + double loss_t = 0; + uint32_t feature = 0; + while (!is_leaf) { - // Get the current node - // NodeClassifier &current_node = node(index_current_node); if (!predict) { // Extend the range and eventually split the current node extend_range(index_current_node, x_t, y_t); - // Update the current node - node(index_current_node).update_downwards(x_t, y_t); + // Update the current node. We get the loss for this point before the node update + // to compute feature importance below + NodeClassifier& current_node = node(index_current_node); + feature = current_node.feature(); + loss_t = current_node.update_downwards(x_t, y_t); } // Is the node a leaf ? NodeClassifier &current_node = node(index_current_node); @@ -467,12 +473,17 @@ uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool p } else { index_current_node = current_node.right(); } + if(!predict) { + // Compute the difference with the loss of the child + loss_t -= node(index_current_node).loss(y_t); + feature_importances_[feature] += loss_t; + } } } -// std::cout << "...Done going downwards." << std::endl; return index_current_node; } + void TreeClassifier::go_upwards(uint32_t leaf_index) { // std::cout << "Going upwards" << std::endl; uint32_t current = leaf_index; @@ -515,11 +526,6 @@ void TreeClassifier::print() { } void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { - // TODO: Test that the size does not change within successive calls to fit - // std::cout << "------------------------------------------" << std::endl; - // std::cout << "iteration: " << iteration << std::endl; - // std::cout << "x_t: [" << std::setprecision(2) << x_t[0] << ", " << std::setprecision(2) << x_t[1] << "]" << std::endl; - // print(); uint32_t leaf = go_downwards(x_t, y_t, false); if (use_aggregation()) { go_upwards(leaf); @@ -716,9 +722,9 @@ inline uint32_t OnlineForestClassifier::sample_feature() { return rand.uniform_int(static_cast<uint32_t>(0), n_features() - 1); } -inline uint32_t OnlineForestClassifier::sample_feature_bis() { - return rand.discrete(_feature_importances); -} +//inline uint32_t OnlineForestClassifier::sample_feature_bis() { +// return rand.discrete(_feature_importances); +//} inline double OnlineForestClassifier::sample_exponential(double intensity) { return rand.exponential(intensity); @@ -787,11 +793,7 @@ uint32_t OnlineForestClassifier::n_samples() const { } uint32_t OnlineForestClassifier::n_features() const { - if (_iteration > 0) { - return _n_features; - } else { - TICK_ERROR("You must call ``fit`` before asking for ``n_features``.") - } + return _n_features; } void OnlineForestClassifier::check_n_features(uint32_t n_features, bool predict) const { @@ -854,9 +856,9 @@ OnlineForestClassifier &OnlineForestClassifier::set_criterion(CriterionClassifie return *this; } -void OnlineForestClassifier::set_feature_importances(const ArrayDouble &feature_importances) { - _feature_importances = feature_importances; -} +//void OnlineForestClassifier::set_feature_importances(const ArrayDouble &feature_importances) { +// _feature_importances = feature_importances; +//} double OnlineForestClassifier::dirichlet() const { return _dirichlet; } @@ -866,3 +868,11 @@ OnlineForestClassifier &OnlineForestClassifier::set_dirichlet(const double diric _dirichlet = dirichlet; return *this; } + +void OnlineForestClassifier::get_feature_importances(SArrayDoublePtr feature_importances) { + feature_importances->fill(0); + const double a = static_cast<double>(1) / n_trees(); + for (TreeClassifier &tree :
trees) { + feature_importances->mult_incr(tree.feature_importances(), a); + } +} diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h index df6439f7d..10207ed0e 100644 --- a/tick/inference/src/online_forest_classifier.h +++ b/tick/inference/src/online_forest_classifier.h @@ -83,7 +83,7 @@ class NodeClassifier { } // Update to apply to a node when going forward in the tree (towards leaves) - void update_downwards(const ArrayDouble &x_t, const double y_t); + double update_downwards(const ArrayDouble &x_t, const double y_t); // Update to apply to a node when going upward in the tree (towards the root) void update_upwards(); // Update the prediction of the label @@ -102,6 +102,7 @@ class NodeClassifier { // Get node at index in the tree inline NodeClassifier &node(uint32_t index) const; + // Get number of features inline uint32_t n_features() const; // Number of classes @@ -165,6 +166,8 @@ class TreeClassifier { // Add nodes in the tree uint32_t add_node(uint32_t parent, double time = 0); + ArrayDouble feature_importances_; + void extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t); uint32_t go_downwards(const ArrayDouble &x_t, double y_t, bool predict); @@ -195,6 +198,10 @@ class TreeClassifier { NodeClassifier &node(uint32_t index) { return nodes[index]; } + + inline ArrayDouble& feature_importances() { + return feature_importances_; + } }; /********************************************************************************* @@ -234,7 +241,7 @@ class OnlineForestClassifier { // Random number generator for feature and threshold sampling Rand rand; - ArrayDouble _feature_importances; + // ArrayDouble _feature_importances; // Create trees void create_trees(); @@ -264,7 +271,7 @@ class OnlineForestClassifier { inline uint32_t sample_feature(); inline uint32_t sample_feature(const ArrayDouble &prob); - inline uint32_t sample_feature_bis(); + // inline uint32_t sample_feature_bis(); inline double sample_exponential(double intensity); inline double sample_threshold(double left, double right); @@ -293,7 +300,9 @@ class OnlineForestClassifier { void n_nodes(SArrayUIntPtr n_nodes_per_tree); void n_leaves(SArrayUIntPtr n_leaves_per_tree); - void set_feature_importances(const ArrayDouble &feature_importances); + // void set_feature_importances(const ArrayDouble &feature_importances); + + void get_feature_importances(SArrayDoublePtr feature_importances); }; diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i index 6d5d2172f..02b0c085a 100644 --- a/tick/inference/swig/online_forest_classifier.i +++ b/tick/inference/swig/online_forest_classifier.i @@ -50,5 +50,8 @@ class OnlineForestClassifier { bool verbose() const; OnlineForestRegressor &set_verbose(bool verbose); - void set_feature_importances(const ArrayDouble &feature_importances); + // void set_feature_importances(const ArrayDouble &feature_importances); + + void get_feature_importances(SArrayDoublePtr feature_importances); + };
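
The extend_range hunks in patch 01/32 contain the Mondrian-style core of the online tree: when a sample x_t lands outside a node's current feature box, a split candidate is drawn at an exponential time whose intensity is the total overshoot, on a feature sampled proportionally to its own overshoot, with a threshold uniform inside the stretched interval. The NumPy sketch below restates only this sampling step for review; the node dict and rng arguments are illustrative stand-ins, not the tick API, and the acceptance test against the child node's time (the do_split branch of the C++ code) is omitted.

import numpy as np

def sample_extension_split(node, x_t, rng):
    # Per-feature overshoot of x_t outside the node's [features_min, features_max] box
    extension = (np.maximum(node['features_min'] - x_t, 0.)
                 + np.maximum(x_t - node['features_max'], 0.))
    extensions_sum = extension.sum()
    if extensions_sum == 0:
        return None  # x_t is inside the box: no range-driven split candidate
    # Exponential split time with intensity extensions_sum
    # (cf. sample_exponential; numpy's scale parameter is 1 / intensity)
    T = rng.exponential(1. / extensions_sum)
    # Feature drawn proportionally to its overshoot (cf. sample_feature(prob))
    feature = rng.choice(x_t.size, p=extension / extensions_sum)
    # Threshold uniform inside the stretched part of the range (cf. sample_threshold)
    if x_t[feature] > node['features_max'][feature]:
        threshold = rng.uniform(node['features_max'][feature], x_t[feature])
    else:
        threshold = rng.uniform(x_t[feature], node['features_min'][feature])
    return T, feature, threshold

# Example: a unit box stretched to the right on feature 0
rng = np.random.default_rng(123)
node = {'features_min': np.zeros(2), 'features_max': np.ones(2)}
print(sample_extension_split(node, np.array([1.5, 0.5]), rng))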
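The predict hunk of patch 01/32 mixes the predictions of all nodes on the sample's path with the rule scores = 0.5 * w * pred_node + (1 - 0.5 * w) * scores, where w = exp(weight - weight_tree) compares a node's own aggregated performance (weight, decreased by step() * loss(y_t) in update_downwards) with that of the best subtree below it (weight_tree, maintained in go_upwards). A minimal sketch of that recursion, assuming the path is given leaf first as plain dicts rather than tick objects:

import numpy as np

def aggregated_scores(path):
    # path[0] is the leaf reached by go_downwards; the rest are its ancestors
    scores = np.asarray(path[0]['scores'], dtype=float)
    for node in path[1:]:
        # Relative weight of the node's own predictor against the best
        # subtree rooted at it, as in TreeClassifier::predict
        w = np.exp(node['weight'] - node['weight_tree'])
        scores = 0.5 * w * np.asarray(node['scores']) + (1. - 0.5 * w) * scores
    return scores

The per-node 'scores' stand for the Dirichlet-smoothed class frequencies computed by NodeClassifier::score, namely (count_c + dirichlet) / (n_samples + dirichlet * n_classes).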
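Patch 32/32 turns this machinery into a feature selection tool: go_downwards now credits feature_importances_[feature] with the loss difference between a node and the child the sample is routed to, get_feature_importances averages these per-tree arrays with weight 1 / n_trees, and the Python wrapper exposes the result as the feature_importances property. The snippet below is condensed from the updated online_forest_selection.py and shows the intended use; on this simulated logistic design, the large importances should line up with the support of w0:

import numpy as np
from tick.simulation import SimuLogReg
from tick.inference import OnlineForestClassifier

n_features, nnz = 30, 5
w0 = np.zeros(n_features)
w0[:nnz] = 1  # only the first nnz features carry signal
X, y = SimuLogReg(weights=w0, intercept=None, n_samples=30000,
                  cov_corr=0.1).simulate()
y = (y + 1) / 2  # map labels to {0, 1}, as required by check_label

of = OnlineForestClassifier(n_classes=2, n_trees=10, seed=123,
                            step=1., use_aggregation=True)
of.fit(X, y)
print(of.feature_importances)  # one averaged importance per feature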