library(readxl)
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:MLmetrics':
## 
##     MAE, RMSE
library(tree)
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.1
library(ggplot2)
library(caretEnsemble)
## Warning: package 'caretEnsemble' was built under R version 3.6.1
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## Registered S3 method overwritten by 'cli':
##   method     from
##   print.tree tree
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x caretEnsemble::autoplot() masks ggplot2::autoplot()
## x dplyr::filter()           masks stats::filter()
## x dplyr::lag()              masks stats::lag()
## x purrr::lift()             masks caret::lift()
library(rio)
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library(viridis)
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 3.6.3
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:viridisLite':
## 
##     viridis.map
library(RColorBrewer)
library(ggthemes)
library(knitr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:rio':
## 
##     export
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(lime)
## Warning: package 'lime' was built under R version 3.6.1
## 
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
## 
##     explain
library(plotROC)
## Warning: package 'plotROC' was built under R version 3.6.1
library(pROC)
## Warning: package 'pROC' was built under R version 3.6.1
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:plotROC':
## 
##     ggroc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(SuperLearner)
## Warning: package 'SuperLearner' was built under R version 3.6.1
## Loading required package: nnls
## Super Learner
## Version: 2.0-25
## Package created on 2019-08-05
library(ipred)

Introduction

Ensemble machine learning methods use multiple learning algorithms to obtain better predictive performance than could be obtained from any of the constituent learning algorithms. Many of the popular modern machine learning algorithms are actually ensembles. For example, Random Forest and Gradient Boosting Machine (GBM) are both ensemble learners. Both bagging (e.g. Random Forest) and boosting (e.g. GBM) are methods for ensembling that take a collection of weak learners (e.g. decision tree) and form a single, strong learner. (https://h2o-release.s3.amazonaws.com/h2o/rel-wright/10/docs-website/h2o-docs/data-science/stacked-ensembles.html)
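To make the bagging idea concrete, here is a minimal sketch using the ipred package loaded above, fit on the built-in mtcars data purely as an illustration (it is not part of the ozone analysis): many bootstrapped regression trees are fit and their predictions are averaged.

# Bagging illustration only: 25 bootstrapped regression trees, predictions averaged.
# mtcars is just a stand-in data set for this sketch.
bag_fit <- ipred::bagging(mpg ~ ., data = mtcars, nbagg = 25)
head(predict(bag_fit, newdata = mtcars))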

Reading the data…

train = read.table('F:/MS.c_Statistics/Kansas_University/881 Data Statistical Learning I/Week 13/Data/summer_train.tsv', sep='\t', header=T)
test = read.table('F:/MS.c_Statistics/Kansas_University/881 Data Statistical Learning I/Week 13/Data/summer_test.tsv', sep='\t', header=T)

Building an initial Ensemble model

The initial model was built using the same methodology used in class. The selected methods were Decision Trees (tree), Linear Regression (lm) and Support Vector Machines (svm).

However, at this point some questions remain unanswered:

  1. Why are we using these specific methods and not other methodologies?
  2. Why are we using exactly three models, and not more or fewer?
  3. What is the contribution of each model to the ensemble?
  4. Is there a way to estimate the associated risk of each method?

In the following lines, we will show how to answer all of these questions by using the SuperLearner Methodology.

Using the methodology described in class, we have:

  1. Creating the models using the training data set.
mod1 = tree(O3 ~ ., data=train)
mod2 = lm(O3 ~ ., data=train)
mod3 = svm(O3 ~ ., data=train)
  2. Predicting with the newly created models using the training data set.
preds1 = predict(mod1, newdata=train)
preds2 = predict(mod2, newdata=train)
preds3 = predict(mod3, newdata=train)
  3. Ensembling the models.
mod4 = svm(O3 ~ preds1 + preds2  + preds3, data=train)
  4. Predicting with each model using the test data set.
preds1 = predict(mod1, newdata=test)
preds2 = predict(mod2, newdata=test)
preds3 = predict(mod3, newdata=test)
  5. Creating a data frame with the test predictions and the observed test response.
new_dat = data.frame(O3 = test$O3, preds1 = preds1, preds2 = preds2, preds3 = preds3)
  6. Calculating the MSE of the stacked model.
preds = predict(mod4, new_dat)
MSE(preds, test$O3)
## [1] 124.3213
preds = cbind(preds1, preds2, preds3)
preds = rowMeans(preds)
MSE(preds, test$O3)
## [1] 104.4042

As seen above, the MSE is 124.32 for the stacked (SVM) ensemble and 104.40 for a simple average of the three base models.
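For reference, MLmetrics::MSE() used above is simply the mean of the squared differences between the predictions and the observed values:

# Equivalent to MLmetrics::MSE(y_pred, y_true)
mse <- function(y_pred, y_true) mean((y_true - y_pred)^2)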

Super Learner: Introduction

The Stacked Ensemble method is a supervised ensemble machine learning algorithm that finds the optimal combination of a collection of prediction algorithms using a process called stacking. Stacked ensembles support regression, binary classification and multiclass classification.

There are several ensemble methods that are broadly labeled as stacking; however, the Super Learner ensemble is distinguished by its use of cross-validation to form what is called the "level-one" data. In other words, SuperLearner is an algorithm that uses cross-validation to estimate the performance of multiple machine learning models, or of the same model with different settings. It then creates an optimal weighted average of those models, also called an "ensemble", based on their held-out (cross-validated) performance.

(https://h2o-release.s3.amazonaws.com/h2o/rel-wright/10/docs-website/h2o-docs/data-science/stacked-ensembles.html) (https://www.datacamp.com/community/tutorials/ensemble-r-machine-learning)

Super Learner: The Idea

Leo Breiman, known for his work on classification and regression trees and random forests, formalized stacking in his 1996 paper on Stacked Regressions (Breiman 1996b). Although the idea originated in (Wolpert 1992) under the name “Stacked Generalizations”, the modern form of stacking that uses internal k-fold CV was Breiman’s contribution. (https://bradleyboehmke.github.io/HOML/stacking.html)

However, it wasn’t until 2007 that the theoretical background for stacking was developed, and also when the algorithm took on the cooler name, Super Learner (Van der Laan, Polley, and Hubbard 2007).

There are a few package implementations for model stacking in the R ecosystem. SuperLearner (Polley et al. 2019) provides the original Super Learner and includes a clean interface to 30+ algorithms. (https://bradleyboehmke.github.io/HOML/stacking.html)

We can train each of these models individually (see the code chunk below). However, to stack them later we need to do a few specific things:

1) All models must be trained on the same training set.
2) All models must be trained with the same number of CV folds.
3) All models must use the same fold assignment, so that the same observations are held out together (in H2O this is done with fold_assignment = "Modulo"; the SuperLearner package assigns the folds internally, as sketched below).
4) The cross-validated predictions from all of the models must be preserved (in H2O, by setting keep_cross_validation_predictions = TRUE). This "level-one" data is what is used to train the meta-learner of the ensemble.
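The two parameter names above come from H2O's stacked-ensemble implementation (the source quoted earlier); in the R SuperLearner package the same requirements are handled internally. A minimal, commented-out sketch under that assumption: cvControl sets the fold structure shared by every learner, and the fitted object stores the cross-validated level-one predictions in its Z component.

# Sketch only (sl_lib is defined further below): every learner is fit on the same
# X/Y with the same internal folds, controlled through cvControl.
# ex_fit <- SuperLearner(Y = train$O3, X = train[, -1],
#                        SL.library = c("SL.lm", "SL.svm"),
#                        cvControl = list(V = 10))
# head(ex_fit$Z)  # level-one (cross-validated) predictions used to fit the meta-learner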

SuperLearner makes it trivial to run many algorithms and use the best one or an ensemble. SuperLearner also automatically gives zero weight to models that do not contribute to the ensemble's predictive power, which leaves you free to experiment with numerous algorithms! Because of that, we are including more models than described in the original workflow provided in class. However, for future applications we should decide which algorithms we want to try before fitting a model, because the computation time can become an issue. For a small data set such as this one the impact is minimal, but larger data sets could be heavily affected.

Installing SuperLearner…

# install.packages("devtools")
#devtools::install_github("ecpolley/SuperLearner")
#install.packages("SuperLearner")

Training the model

The first step with SuperLearner is to create a "SuperLearner library" containing all the algorithms that we would like to assess.

set.seed(1)

sl_lib = c("SL.xgboost", "SL.randomForest", "SL.glmnet", "SL.nnet", "SL.ksvm", "SL.svm",
           "SL.caret", "SL.kernelKnn", "SL.rpartPrune", "SL.lm", "SL.mean", "SL.ridge", "SL.biglasso", "SL.ipredbagg" ) 

# This Super Learner library (sl_lib) includes a linear model ("SL.lm"), tree-based methods ("SL.rpartPrune", "SL.randomForest") and SVMs ("SL.ksvm", "SL.svm"), as in the chunk above. However, the SuperLearner R package allows us to include many more models.

Create automatic ensemble using the training data set.
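Based on the Call printed in the output below, the ensemble is fit on the training data roughly as follows (reconstructed, since the fitting chunk is not echoed):

# Fit the Super Learner on the training data: O3 is the response,
# the remaining columns of train are the predictors.
result.train <- SuperLearner(Y = train$O3, X = train[, -1], SL.library = sl_lib)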

Review performance of each algorithm and ensemble weights.

# Review performance of each algorithm and ensemble weights.
result.train
## 
## Call:  SuperLearner(Y = train$O3, X = train[, -1], SL.library = sl_lib) 
## 
## 
##                          Risk       Coef
## SL.xgboost_All      136.68867 0.00000000
## SL.randomForest_All  95.64607 0.00000000
## SL.glmnet_All       100.00935 0.00000000
## SL.nnet_All         121.14006 0.00000000
## SL.ksvm_All         101.23620 0.00000000
## SL.svm_All           93.12870 0.38873391
## SL.caret_All         96.76977 0.00000000
## SL.kernelKnn_All     96.39465 0.09361768
## SL.rpartPrune_All   111.32275 0.00000000
## SL.lm_All           100.08787 0.19883442
## SL.mean_All         123.07225 0.00000000
## SL.ridge_All        100.07529 0.00000000
## SL.biglasso_All     100.15688 0.00000000
## SL.ipredbagg_All     94.01648 0.31881399

A coefficient of zero means that an algorithm is no longer weighted as part of the ensemble. In other words, xgboost (eXtreme Gradient Boosting), Random Forest, glmnet (lasso/elastic-net regularization), nnet (Neural Networks), ksvm (support vector machine from kernlab), caret (Classification And REgression Training), rpartPrune (pruned decision trees), mean, ridge and biglasso (Extending Lasso Model Fitting to Big Data) are not contributing to the ensemble.

A non-zero coefficient means that an algorithm is weighted as part of the ensemble model. Here, svm (Support Vector Machine) is the method that contributes the most, with a coefficient of 0.39, followed by ipredbagg (bagging) with 0.32, lm (Linear Regression) with 0.20 and kernelKnn (kernel k-Nearest Neighbors) with 0.09.

It is important to remember that the best ensembles are not necessarily composed of the best-performing individual algorithms, but rather of the algorithms that best complement each other.

You will notice SuperLearner is calculating this risk for you and deciding on the optimal model mix that will reduce the error.

To understand each model's specific contribution to the ensemble and its variation, we can use SuperLearner's internal cross-validation function CV.SuperLearner(). Using external (a.k.a. nested) cross-validation we are able to estimate the accuracy of the ensemble itself.

Summarize and plot the performance of the individual algorithms, and compare them to the ensemble.

The cross-validation summary shows each model's average risk, its standard error, and the range (minimum and maximum) of the risk across the folds.
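Based on the Call shown in the summary below, result2.train would have been created roughly as follows (reconstructed, since the fitting chunk is not echoed):

# External (nested) cross-validation of the whole Super Learner with 5 folds.
result2.train <- CV.SuperLearner(Y = train$O3, X = train[, -1], V = 5, SL.library = sl_lib)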

summary(result2.train)
## 
## Call:  
## CV.SuperLearner(Y = train$O3, X = train[, -1], V = 5, SL.library = sl_lib) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  5 
## 
##            Algorithm     Ave     se     Min    Max
##        Super Learner  94.536 10.410  82.405 104.66
##          Discrete SL 101.164 11.573  83.410 113.20
##       SL.xgboost_All 139.368 14.752 105.882 173.18
##  SL.randomForest_All  98.430 11.385  73.259 118.09
##        SL.glmnet_All  98.130 10.154  78.547 106.45
##          SL.nnet_All 123.002 14.031 113.846 133.27
##          SL.ksvm_All 102.667 11.217  83.206 120.71
##           SL.svm_All  92.080 10.213  71.886 105.89
##         SL.caret_All 100.803 11.600  75.505 121.77
##     SL.kernelKnn_All  95.605 10.276  73.181 103.88
##    SL.rpartPrune_All 113.614 12.827  88.107 139.28
##            SL.lm_All  98.332 10.081  77.856 106.77
##          SL.mean_All 123.002 14.031 113.846 133.27
##         SL.ridge_All  98.275 10.125  78.359 106.61
##      SL.biglasso_All  97.941 10.255  77.991 106.75
##     SL.ipredbagg_All  96.492 11.027  82.070 113.20

Plotting this also produces a nice plot of the models used and their variation:

plot(result2.train) + theme_minimal()

It’s easy to see that Super Learner performs the best on average while xgboost (eXtreme Gradient Boosting) performs the worst and contains a lot of variation compared to the other models. The beauty of SuperLearner is that, if a model does not fit well or contribute much, it is just weighted to zero! There is no need to remove it and retrain unless you plan on retraining the model in the future. Just remember that proper model training involves cross validation of the entire model.

It is important to understand that we can try to improve the model's performance by tuning hyperparameters of some of the models in the ensemble. A method that is not weighted heavily in the ensemble may simply need better parameter settings, and even the methods that were selected can be improved. For example, we can improve bagging by increasing the nbagg parameter to 250 from the default of 25, as sketched below.
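A minimal sketch of that kind of tuning with SuperLearner's create.Learner() helper (the generated wrapper name, e.g. "SL.ipredbagg_1", and the refit with the extended library are illustrative only):

# Create a custom bagging learner with 250 bootstrap trees.
bagging_250 <- create.Learner("SL.ipredbagg", params = list(nbagg = 250))
bagging_250$names  # name(s) of the generated wrapper(s), e.g. "SL.ipredbagg_1"
# sl_lib2 <- c(sl_lib, bagging_250$names)  # then refit SuperLearner with sl_lib2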

Make Predictions with SuperLearner

With predict.SuperLearner() (the method behind the usual predict() for SuperLearner objects) we can easily make predictions on new data sets; note that the argument for new data is newdata.

predictions.train <- predict.SuperLearner(result.train, newdata = train)
head(predictions.train$library.predict) #individual library predictions
##   SL.xgboost_All SL.randomForest_All SL.glmnet_All SL.nnet_All SL.ksvm_All
## 1       29.53252            28.40715      30.88028     32.1974    27.37817
## 2       37.22233            35.63009      31.39095     32.1974    36.88945
## 3       29.90877            29.08707      31.42140     32.1974    28.58277
## 4       18.64757            23.86886      25.40855     32.1974    24.31104
## 5       30.51056            30.25966      29.76573     32.1974    26.89009
## 6       31.87774            30.56093      32.06034     32.1974    29.25576
##   SL.svm_All SL.caret_All SL.kernelKnn_All SL.rpartPrune_All SL.lm_All
## 1   26.97545     27.52597             28.7          26.84848  30.81041
## 2   34.77432     36.09243             33.1          26.84848  31.38506
## 3   30.44512     29.38293             28.6          32.58960  31.43141
## 4   26.24628     22.96010             21.8          26.84848  25.19740
## 5   24.95828     31.16237             30.4          26.84848  29.67821
## 6   29.03940     30.85843             31.0          32.58960  32.06846
##   SL.mean_All SL.ridge_All SL.biglasso_All SL.ipredbagg_All
## 1     32.1974     30.86928        31.01958         27.38266
## 2     32.1974     31.47323        31.40458         31.20291
## 3     32.1974     31.38599        31.40180         30.10857
## 4     32.1974     25.37937        25.82947         26.57383
## 5     32.1974     29.74906        29.93947         28.67596
## 6     32.1974     32.03807        32.04380         30.05366

This allows you to see how each model predicted each observation. This could be useful for debugging the model, or when fitting multiple models at once to see which one to use going forward.
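Under the default NNLS meta-learner, the ensemble prediction is simply the coefficient-weighted combination of these library predictions (the zero-weighted models drop out). A quick sanity check, assuming the default method:

# Combine the individual library predictions with the ensemble coefficients;
# for the default NNLS method this should match predictions.train$pred.
manual_pred <- as.numeric(predictions.train$library.predict %*% result.train$coef)
head(cbind(manual = manual_pred, superlearner = predictions.train$pred))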

MSE(predictions.train$pred, train$O3)
## [1] 71.34805

Test the model
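Based on the Call printed below, result.test is built by refitting the Super Learner on the test data, roughly as follows (reconstructed, since the fitting chunk is not echoed):

# Refit the Super Learner, this time using the test data as X and Y.
result.test <- SuperLearner(Y = test$O3, X = test[, -1], SL.library = sl_lib)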

Review performance of each algorithm and ensemble weights.

# Review performance of each algorithm and ensemble weights.
result.test
## 
## Call:  SuperLearner(Y = test$O3, X = test[, -1], SL.library = sl_lib) 
## 
## 
##                          Risk      Coef
## SL.xgboost_All      109.10187 0.0000000
## SL.randomForest_All  64.72646 0.0000000
## SL.glmnet_All        64.82395 0.0000000
## SL.nnet_All          69.76737 0.1265396
## SL.ksvm_All          64.28660 0.0000000
## SL.svm_All           59.57625 0.5619328
## SL.caret_All         65.21073 0.0000000
## SL.kernelKnn_All     64.41627 0.0000000
## SL.rpartPrune_All    79.85767 0.0000000
## SL.lm_All            64.71026 0.1758192
## SL.mean_All          75.01366 0.0000000
## SL.ridge_All         64.68603 0.0000000
## SL.biglasso_All      65.02591 0.0000000
## SL.ipredbagg_All     61.95305 0.1357084

Xgboost (eXtreme Gradient Boosting), Random Forest, glmnet (lasso/elastic-net regularization), ksvm (support vector machine from kernlab), caret (Classification And REgression Training), kernelKnn (kernel k-Nearest Neighbors), rpartPrune (pruned decision trees), mean, ridge and biglasso (Extending Lasso Model Fitting to Big Data) are not contributing to the ensemble.

Svm (Support Vector Machine) is again the method that contributes the most, with a coefficient of 0.56, followed by lm (Linear Regression) with 0.18, ipredbagg (bagging) with 0.14 and nnet (Neural Networks) with 0.13.

To understand each model's specific contribution to the ensemble and its variation, we can use SuperLearner's internal cross-validation function CV.SuperLearner(). Using external (a.k.a. nested) cross-validation we are able to estimate the accuracy of the ensemble itself.

Summarize and plot the performance of the individual algorithms, and compare them to the ensemble.

The cross-validation summary shows each model's average risk, its standard error, and the range (minimum and maximum) of the risk across the folds.
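Based on the Call shown in the summary below, result2.test would have been created roughly as follows (reconstructed, since the fitting chunk is not echoed):

# External (nested) 5-fold cross-validation of the Super Learner on the test data.
result2.test <- CV.SuperLearner(Y = test$O3, X = test[, -1], V = 5, SL.library = sl_lib)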

summary(result2.test)
## 
## Call:  
## CV.SuperLearner(Y = test$O3, X = test[, -1], V = 5, SL.library = sl_lib) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  5 
## 
##            Algorithm     Ave     se    Min     Max
##        Super Learner  61.313 11.191 46.557  80.825
##          Discrete SL  61.456 11.365 43.799  84.131
##       SL.xgboost_All 105.899 16.354 84.233 124.379
##  SL.randomForest_All  64.121 11.378 51.783  90.376
##        SL.glmnet_All  65.411 11.768 45.421  86.260
##          SL.nnet_All  74.745 13.853 51.141 102.228
##          SL.ksvm_All  66.671 11.481 50.389  80.960
##           SL.svm_All  60.318 10.884 43.799  78.443
##         SL.caret_All  65.490 11.445 53.421  94.784
##     SL.kernelKnn_All  62.584 11.424 42.676  84.131
##    SL.rpartPrune_All  71.874 10.719 57.506  87.384
##            SL.lm_All  65.432 11.727 44.889  85.926
##          SL.mean_All  74.745 13.854 51.141 102.228
##         SL.ridge_All  65.279 11.799 45.333  86.759
##      SL.biglasso_All  65.019 11.766 45.972  86.270
##     SL.ipredbagg_All  61.730 10.981 45.087  88.151

Plotting this also produces a nice plot of the models used and their variation:

plot(result2.test) + theme_minimal()

It’s easy to see that Super Learner performs the best on average while xgboost (eXtreme Gradient Boosting) performs the worst and contains a lot of variation compared to the other models.

Make Predictions with SuperLearner

With predict.SuperLearner() (the method behind the usual predict() for SuperLearner objects) we can easily make predictions on new data sets; note that the argument for new data is newdata.

predictions.test <- predict.SuperLearner(result.test, newdata = test)
head(predictions.test$library.predict) #individual library predictions
##   SL.xgboost_All SL.randomForest_All SL.glmnet_All SL.nnet_All SL.ksvm_All
## 1       29.02114            28.71975      29.55169    31.59585    28.75647
## 2       32.61366            33.32613      34.88132    31.59585    36.43442
## 3       32.51057            29.96637      27.22970    31.59585    32.13622
## 4       36.11968            34.75793      33.86258    31.59585    35.80895
## 5       26.85324            28.99332      29.90860    31.59585    30.54253
## 6       27.69489            28.03470      29.68722    31.59585    26.23507
##   SL.svm_All SL.caret_All SL.kernelKnn_All SL.rpartPrune_All SL.lm_All
## 1   27.99806     28.12000             24.9          28.48837  29.43213
## 2   36.25053     33.67883             32.4          34.37838  35.13458
## 3   28.85796     30.16677             27.1          28.48837  26.93331
## 4   34.76004     35.19980             32.9          37.20000  34.06367
## 5   30.39780     29.08570             29.5          28.48837  29.83482
## 6   25.11392     27.49823             26.6          28.26316  29.48818
##   SL.mean_All SL.ridge_All SL.biglasso_All SL.ipredbagg_All
## 1    31.59585     29.46058        29.50427         28.04265
## 2    31.59585     34.89660        34.97902         36.60591
## 3    31.59585     27.12396        27.11402         28.56493
## 4    31.59585     33.81420        33.93923         35.04225
## 5    31.59585     29.88731        29.87902         28.74903
## 6    31.59585     29.67075        29.61130         27.83919

Estimating the MSE

MSE(predictions.test$pred, test$O3)
## [1] 46.66526

Conclusions

  1. SuperLearner outperforms the traditional approach to ensembling models. The MSE on the test set was about 47, while the traditional approach gave more than 100.
  2. SuperLearner was easy to use and answered the key questions posed at the beginning of this work.
  3. Even for this small data set, SuperLearner requires some computational effort.
  4. Results with SuperLearner may vary from run to run, so it is worth setting a seed for reproducibility.