Skip to content

Commit 91cbc5e

Browse files
authored
Implement Random Forest as a sub case to EnsembleLearner (#410)
* Implement Random Forest as a subset of Bagging with an additional constraint on predictors (or features) to be used. Introduce Dataset bootstrap implementation with indices denoted as '_with_indices' methods. * cargo fmt * clippy is far happier now :) * Add type alias for Random Forest as an EnsembleLearner with model type DecisionTree. * fix clippy code quality check * Add unit tests for bootstrap with indices and random forest * 📜 Add docs and example for RandomForest type alias. * lint
1 parent 3f2c202 commit 91cbc5e

File tree

8 files changed

+345
-48
lines changed

8 files changed

+345
-48
lines changed

algorithms/linfa-ensemble/README.md

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,39 @@
1515
You can find examples in the `examples/` directory. To run a bootstrap aggregation for an ensemble of decision trees (a Random Forest) use:
1616

1717
```bash
18-
$ cargo run --example randomforest_iris --release
18+
$ cargo run --example ensemble_iris --release
1919
```
2020

21+
The expected output should be
22+
```commandline
23+
An example using Bagging with Decision Tree on Iris Dataset
24+
Final Predictions:
25+
[0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 2, 0], shape=[30], strides=[1], layout=CFcf (0xf), const ndim=1
26+
27+
classes | 0 | 1 | 2
28+
0 | 11 | 0 | 0
29+
1 | 0 | 7 | 1
30+
2 | 0 | 1 | 10
31+
32+
Test accuracy: 93.333336
33+
with default Decision Tree params,
34+
Ensemble Size: 100,
35+
Bootstrap Proportion: 0.7
36+
Feature selection proportion: 1
37+
38+
An example using a Random Forest on Iris Dataset
39+
Final Predictions:
40+
[0, 1, 0, 1, 1, 2, 2, 1, 0, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 2, 0], shape=[30], strides=[1], layout=CFcf (0xf), const ndim=1
41+
42+
classes | 0 | 1 | 2
43+
0 | 11 | 0 | 0
44+
1 | 0 | 8 | 0
45+
2 | 0 | 1 | 10
46+
47+
Test accuracy: 96.666664
48+
with default Decision Tree params,
49+
Ensemble Size: 100,
50+
Bootstrap Proportion: 0.7
51+
Feature selection proportion: 0.2
52+
```
2153

algorithms/linfa-ensemble/examples/bagging_iris.rs

Lines changed: 0 additions & 35 deletions
This file was deleted.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
use linfa::prelude::{Fit, Predict, ToConfusionMatrix};
2+
use linfa_ensemble::{EnsembleLearnerParams, RandomForestParams};
3+
use linfa_trees::DecisionTree;
4+
use ndarray_rand::rand::SeedableRng;
5+
use rand::rngs::SmallRng;
6+
7+
fn ensemble_learner(ensemble_size: usize, bootstrap_proportion: f64) {
8+
// Load dataset
9+
let mut rng = SmallRng::seed_from_u64(42);
10+
let (train, test) = linfa_datasets::iris()
11+
.shuffle(&mut rng)
12+
.split_with_ratio(0.8);
13+
14+
// Train ensemble learner model
15+
let model = EnsembleLearnerParams::new_fixed_rng(DecisionTree::params(), rng)
16+
.ensemble_size(ensemble_size)
17+
.bootstrap_proportion(bootstrap_proportion)
18+
.fit(&train)
19+
.unwrap();
20+
21+
// Return highest ranking predictions
22+
let final_predictions_ensemble = model.predict(&test);
23+
println!("Final Predictions: \n{final_predictions_ensemble:?}");
24+
25+
let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap();
26+
27+
println!("{cm:?}");
28+
println!("Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {ensemble_size},\n Bootstrap Proportion: {bootstrap_proportion}.\n",
29+
100.0 * cm.accuracy());
30+
}
31+
32+
fn random_forest(ensemble_size: usize, bootstrap_proportion: f64, feature_proportion: f64) {
33+
let mut rng = SmallRng::seed_from_u64(42);
34+
let (train, test) = linfa_datasets::iris()
35+
.shuffle(&mut rng)
36+
.split_with_ratio(0.8);
37+
38+
// Train ensemble learner model
39+
let model = RandomForestParams::new_fixed_rng(DecisionTree::params(), rng)
40+
.ensemble_size(ensemble_size)
41+
.bootstrap_proportion(bootstrap_proportion)
42+
.feature_proportion(feature_proportion)
43+
.fit(&train)
44+
.unwrap();
45+
46+
// Return highest ranking predictions
47+
let final_predictions_ensemble = model.predict(&test);
48+
println!("Final Predictions: \n{final_predictions_ensemble:?}");
49+
50+
let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap();
51+
52+
println!("{cm:?}");
53+
println!("Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {ensemble_size},\n Bootstrap Proportion: {bootstrap_proportion}\n Feature selection proportion: {feature_proportion}.\n",
54+
100.0 * cm.accuracy());
55+
}
56+
57+
fn main() {
58+
// This is an example bagging with decision tree
59+
println!("An example using Bagging with Decision Tree on Iris Dataset");
60+
ensemble_learner(100, 0.7);
61+
// This is basically a Random Forest ensemble
62+
println!("An example using a Random Forest on Iris Dataset");
63+
random_forest(100, 0.7, 0.2);
64+
}

algorithms/linfa-ensemble/src/algorithm.rs

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,31 @@ use linfa::{
55
traits::*,
66
DatasetBase,
77
};
8+
use linfa_trees::DecisionTree;
89
use ndarray::{Array2, Axis, Zip};
910
use rand::Rng;
1011
use std::{cmp::Eq, collections::HashMap, hash::Hash};
1112

13+
pub type RandomForest<F, L> = EnsembleLearner<DecisionTree<F, L>>;
14+
1215
pub struct EnsembleLearner<M> {
1316
pub models: Vec<M>,
17+
pub model_features: Vec<Vec<usize>>,
1418
}
1519

1620
impl<M> EnsembleLearner<M> {
1721
// Generates prediction iterator returning predictions from each model
1822
pub fn generate_predictions<'b, R: Records, T>(
1923
&'b self,
20-
x: &'b R,
24+
x: &'b [R],
2125
) -> impl Iterator<Item = T> + 'b
2226
where
2327
M: Predict<&'b R, T>,
2428
{
25-
self.models.iter().map(move |m| m.predict(x))
29+
self.models
30+
.iter()
31+
.zip(x.iter())
32+
.map(move |(m, sub_data)| m.predict(sub_data))
2633
}
2734
}
2835

@@ -40,7 +47,12 @@ where
4047
"The number of data points must match the number of outputs."
4148
);
4249

43-
let predictions = self.generate_predictions(x);
50+
let sub_datas = self
51+
.model_features
52+
.iter()
53+
.map(|feat| x.select(Axis(1), feat))
54+
.collect::<Vec<_>>();
55+
let predictions = self.generate_predictions(&sub_datas);
4456

4557
// prediction map has same shape as y_array, but the elements are maps
4658
let mut prediction_maps = y_array.map(|_| HashMap::new());
@@ -81,23 +93,30 @@ where
8193
&self,
8294
dataset: &DatasetBase<Array2<D>, T>,
8395
) -> core::result::Result<Self::Object, Error> {
84-
let mut models = Vec::new();
96+
let mut models = Vec::with_capacity(self.ensemble_size);
97+
let mut model_features = Vec::with_capacity(self.ensemble_size);
8598
let mut rng = self.rng.clone();
8699

100+
// Compute dataset and the subset of features ratio to be selected
87101
let dataset_size =
88102
((dataset.records.nrows() as f64) * self.bootstrap_proportion).ceil() as usize;
103+
let n_feat = dataset.records.ncols();
104+
let n_sub = ((n_feat as f64) * self.feature_proportion).ceil() as usize;
89105

90-
let iter = dataset.bootstrap_samples(dataset_size, &mut rng);
91-
92-
for train in iter {
106+
let iter = dataset.bootstrap_with_indices((dataset_size, n_sub), &mut rng);
107+
for (train, _, feature_selected) in iter {
93108
let model = self.model_params.fit(&train).unwrap();
94109
models.push(model);
110+
model_features.push(feature_selected);
95111

96112
if models.len() == self.ensemble_size {
97113
break;
98114
}
99115
}
100116

101-
Ok(EnsembleLearner { models })
117+
Ok(EnsembleLearner {
118+
models,
119+
model_features,
120+
})
102121
}
103122
}

algorithms/linfa-ensemble/src/hyperparams.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use linfa::{
22
error::{Error, Result},
33
ParamGuard,
44
};
5+
use linfa_trees::DecisionTreeParams;
56
use rand::rngs::ThreadRng;
67
use rand::Rng;
78

@@ -11,6 +12,8 @@ pub struct EnsembleLearnerValidParams<P, R> {
1112
pub ensemble_size: usize,
1213
/// The proportion of the total number of training samples that should be given to each model for training
1314
pub bootstrap_proportion: f64,
15+
/// The proportion of the total number of training feature that should be given to each model for training
16+
pub feature_proportion: f64,
1417
/// The model parameters for the base model
1518
pub model_params: P,
1619
pub rng: R,
@@ -19,6 +22,8 @@ pub struct EnsembleLearnerValidParams<P, R> {
1922
#[derive(Clone, Copy, Debug, PartialEq)]
2023
pub struct EnsembleLearnerParams<P, R>(EnsembleLearnerValidParams<P, R>);
2124

25+
pub type RandomForestParams<F, L, R> = EnsembleLearnerParams<DecisionTreeParams<F, L>, R>;
26+
2227
impl<P> EnsembleLearnerParams<P, ThreadRng> {
2328
pub fn new(model_params: P) -> EnsembleLearnerParams<P, ThreadRng> {
2429
Self::new_fixed_rng(model_params, rand::thread_rng())
@@ -30,6 +35,7 @@ impl<P, R: Rng + Clone> EnsembleLearnerParams<P, R> {
3035
Self(EnsembleLearnerValidParams {
3136
ensemble_size: 1,
3237
bootstrap_proportion: 1.0,
38+
feature_proportion: 1.0,
3339
model_params,
3440
rng,
3541
})
@@ -44,6 +50,11 @@ impl<P, R: Rng + Clone> EnsembleLearnerParams<P, R> {
4450
self.0.bootstrap_proportion = proportion;
4551
self
4652
}
53+
54+
pub fn feature_proportion(mut self, proportion: f64) -> Self {
55+
self.0.feature_proportion = proportion;
56+
self
57+
}
4758
}
4859

4960
impl<P, R> ParamGuard for EnsembleLearnerParams<P, R> {
@@ -61,6 +72,11 @@ impl<P, R> ParamGuard for EnsembleLearnerParams<P, R> {
6172
"Ensemble size should be less than one, but was {}",
6273
self.0.ensemble_size
6374
)))
75+
} else if self.0.feature_proportion > 1.0 || self.0.feature_proportion <= 0.0 {
76+
Err(Error::Parameters(format!(
77+
"Feature proportion should be greater than zero and less than or equal to one, but was {}",
78+
self.0.feature_proportion
79+
)))
6480
} else {
6581
Ok(&self.0)
6682
}

algorithms/linfa-ensemble/src/lib.rs

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,19 @@
55
//!
66
//! ## Bootstrap Aggregation (aka Bagging)
77
//!
8-
//! A typical example of ensemble method is Bootstrapo AGgregation, which combines the predictions of
8+
//! A typical example of ensemble method is Bootstrap Aggregation, which combines the predictions of
99
//! several decision trees (see `linfa-trees`) trained on different samples subset of the training dataset.
1010
//!
11+
//! ## Random Forest
12+
//!
13+
//! A special case of Bootstrap Aggregation using decision trees (see `linfa-trees`) with random feature
14+
//! selection. A typical number of random predictors to be selected is $\sqrt{p}$ with $p$ being
15+
//! the number of available features.
16+
//!
1117
//! ## Reference
1218
//!
1319
//! * [Scikit-Learn User Guide](https://scikit-learn.org/stable/modules/ensemble.html)
20+
//! * [An Introduction to Statistical Learning](https://www.statlearning.com/)
1421
//!
1522
//! ## Example
1623
//!
@@ -32,15 +39,44 @@
3239
//!
3340
//! // Train the model on the iris dataset
3441
//! let bagging_model = EnsembleLearnerParams::new(DecisionTree::params())
35-
//! .ensemble_size(100)
36-
//! .bootstrap_proportion(0.7)
42+
//! .ensemble_size(100) // Number of Decision Tree to fit
43+
//! .bootstrap_proportion(0.7) // Select only 70% of the data via bootstrap
3744
//! .fit(&train)
3845
//! .unwrap();
3946
//!
4047
//! // Make predictions on the test set
4148
//! let predictions = bagging_model.predict(&test);
4249
//! ```
4350
//!
51+
//! This example shows how to train a Random Forest model using 100 decision trees,
52+
//! each trained on 70% of the training data (bootstrap sampling) and using only
53+
//! 30% of the available features.
54+
//!
55+
//! ```no_run
56+
//! use linfa::prelude::{Fit, Predict};
57+
//! use linfa_ensemble::RandomForestParams;
58+
//! use linfa_trees::DecisionTree;
59+
//! use ndarray_rand::rand::SeedableRng;
60+
//! use rand::rngs::SmallRng;
61+
//!
62+
//! // Load Iris dataset
63+
//! let mut rng = SmallRng::seed_from_u64(42);
64+
//! let (train, test) = linfa_datasets::iris()
65+
//! .shuffle(&mut rng)
66+
//! .split_with_ratio(0.8);
67+
//!
68+
//! // Train the model on the iris dataset
69+
//! let bagging_model = RandomForestParams::new(DecisionTree::params())
70+
//! .ensemble_size(100) // Number of Decision Tree to fit
71+
//! .bootstrap_proportion(0.7) // Select only 70% of the data via bootstrap
72+
//! .feature_proportion(0.3) // Select only 30% of the feature
73+
//! .fit(&train)
74+
//! .unwrap();
75+
//!
76+
//! // Make predictions on the test set
77+
//! let predictions = bagging_model.predict(&test);
78+
//! ```
79+
4480
mod algorithm;
4581
mod hyperparams;
4682

@@ -55,14 +91,35 @@ mod tests {
5591
use ndarray_rand::rand::SeedableRng;
5692
use rand::rngs::SmallRng;
5793

94+
#[test]
95+
fn test_random_forest_accuracy_on_iris_dataset() {
96+
let mut rng = SmallRng::seed_from_u64(42);
97+
let (train, test) = linfa_datasets::iris()
98+
.shuffle(&mut rng)
99+
.split_with_ratio(0.8);
100+
101+
let model = RandomForestParams::new_fixed_rng(DecisionTree::params(), rng)
102+
.ensemble_size(100)
103+
.bootstrap_proportion(0.7)
104+
.feature_proportion(0.3)
105+
.fit(&train)
106+
.unwrap();
107+
108+
let predictions = model.predict(&test);
109+
110+
let cm = predictions.confusion_matrix(&test).unwrap();
111+
let acc = cm.accuracy();
112+
assert!(acc >= 0.9, "Expected accuracy to be above 90%, got {}", acc);
113+
}
114+
58115
#[test]
59116
fn test_ensemble_learner_accuracy_on_iris_dataset() {
60117
let mut rng = SmallRng::seed_from_u64(42);
61118
let (train, test) = linfa_datasets::iris()
62119
.shuffle(&mut rng)
63120
.split_with_ratio(0.8);
64121

65-
let model = EnsembleLearnerParams::new(DecisionTree::params())
122+
let model = EnsembleLearnerParams::new_fixed_rng(DecisionTree::params(), rng)
66123
.ensemble_size(100)
67124
.bootstrap_proportion(0.7)
68125
.fit(&train)

0 commit comments

Comments
 (0)