SuperLearner : Using Simulation Study and Real Dataset to Access the Performance of the SuperLearner

Install and load packages needed to run this notebook.

# You need the suggested packages to run this notebook 

pkgs <- c("SuperLearner", "quadprog","plsgenomics","kernlab","MASS","ggplot2",
          "nloptr","gam","gbm","nnet","polspline","nnls","BayesTree","glmnet",
          "randomForest","class","e1071","stepPlr","arm","party","spls","LogicReg",
          "SIS", "ipred","mlbench","rpart","caret","mda","earth", "cowplot",
          "tidyverse", "locfit", "DT")

# see what packages are currently installed
installed_pacakges <- row.names(installed.packages())
# loop over the needed packages
for(p in pkgs){
# check if package is installed
already_installed <- p %in% installed_pacakges
# if not already installed, install it
if(!already_installed){
install.packages(p)
}
# and load package
library(p, character.only = TRUE)
}

Simulation study

Define variables for Simulation

Create 4 simulated datasets to be used in training the model.
Peturb each response with some random noise.
Obtain plots of the four simulated data.

set.seed(123) # for reproducibility
n <- 100 # sample size
x <-  runif(n, min = -4,max = 4) # single covariate 
u <- rnorm(n) # random noise

# Response variables
y1 <-  (-2*(if_else(x < -3, 1, 0)) + 2.55*(if_else(x > -2, 1, 0)) -
      2*(if_else(x > 0, 1, 0)) + 4*(if_else(x > 2, 1, 0)) - 
      1*(if_else(x > 3, 1, 0)) + u )

y2 <-  6 + (0.4*x)-(0.36*x*x) + 0.005*(x^3) + u

y3 <-  2.83*sin((pi/2) * x) + u

y4 <-  4*sin(3*pi*x)*(if_else(x > 0, 1, 0)) + u 

# Put variables in a dataframe
df <- data.frame(cbind(x, y1, y2, y3, y4))

# Display first 5 rows of dataframe
datatable(df,class = 'cell-border stripe', caption = 'Table 1: Simulated Data.')%>%
  formatRound(c("x", "y1", "y2", "y3", "y4"), 2)

summary(df) # Display summary statistics

##        x                  y1                y2               y3         
##  Min.   :-3.99500   Min.   :-4.0532   Min.   :-1.919   Min.   :-4.4299  
##  1st Qu.:-2.03624   1st Qu.:-0.1382   1st Qu.: 3.224   1st Qu.:-1.9180  
##  Median :-0.26903   Median : 1.7551   Median : 4.537   Median :-0.4467  
##  Mean   :-0.01153   Mean   : 1.6133   Mean   : 4.090   Mean   :-0.1476  
##  3rd Qu.: 2.04377   3rd Qu.: 3.4698   3rd Qu.: 5.516   3rd Qu.: 1.6085  
##  Max.   : 3.95416   Max.   : 5.6468   Max.   : 7.968   Max.   : 4.0240  
##        y4         
##  Min.   :-5.0855  
##  1st Qu.:-1.4760  
##  Median :-0.1051  
##  Mean   :-0.3917  
##  3rd Qu.: 0.9332  
##  Max.   : 3.6745

# Obtain plots of four simulation sets

plot1 <- ggplot(df, aes(x=x, y=y1)) + 
  geom_point() + 
  labs(title = "Simulation 1")

plot2 <- ggplot(df, aes(x=x, y=y2)) + 
  geom_point() + 
  labs(title = "Simulation 2")

plot3 <- ggplot(df, aes(x=x, y=y3)) + 
  geom_point() + 
  labs(title = "Simulation 3")

plot4 <- ggplot(df, aes(x=x, y=y4)) + 
  geom_point() + 
  labs(title = "Simulation 4")

plot_grid(plot1, plot2, plot3, plot4, labels = "AUTO")

Fit the Superlearner

Define a library of algorithms to be used. This will depend on whether the task at hand is a (binary) classification or regression problem.
Use the defined library of algorithms in the call of Superlearner function for each simluated data.
Obtain plots of the four simulated data.

# Specify a library of algorithms
SL.library <- c("SL.nnet", "SL.glm", "SL.randomForest", "SL.glm.interaction",
                "SL.gam", "SL.polymars", "SL.loess", "SL.mean")

 
# Run the SuperLearner function to obtain predicted values and 
# CV Risk for  each of the algorithms in the library defined above.

# Risk is defined by default as MSE
set.seed(1)
fit.SL1 <- SuperLearner(Y=df[,2],X=data.frame(df[,1]),SL.library=SL.library, 
                       family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL1

## 
## Call:  
## SuperLearner(Y = df[, 2], X = data.frame(df[, 1]), family = gaussian(),  
##     SL.library = SL.library, method = "method.NNLS", verbose = TRUE) 
## 
## 
##                            Risk        Coef
## SL.nnet_All            2.150153 0.003471064
## SL.glm_All             3.068761 0.000000000
## SL.randomForest_All    1.091711 0.741462952
## SL.glm.interaction_All 3.068761 0.000000000
## SL.gam_All             2.728809 0.021855334
## SL.polymars_All        1.397295 0.166526392
## SL.loess_All           2.044233 0.066684259
## SL.mean_All            5.154050 0.000000000

fit.SL2 <- SuperLearner(Y=df[,3],X=data.frame(df[,1]),SL.library=SL.library, 
                       family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL2

## 
## Call:  
## SuperLearner(Y = df[, 3], X = data.frame(df[, 1]), family = gaussian(),  
##     SL.library = SL.library, method = "method.NNLS", verbose = TRUE) 
## 
## 
##                             Risk      Coef
## SL.nnet_All            1.1464044 0.2201477
## SL.glm_All             3.6321501 0.0000000
## SL.randomForest_All    1.0865861 0.1986596
## SL.glm.interaction_All 3.6321501 0.0000000
## SL.gam_All             1.2898621 0.0000000
## SL.polymars_All        0.9725886 0.4419255
## SL.loess_All           1.0161857 0.1392672
## SL.mean_All            4.4187054 0.0000000

fit.SL3 <- SuperLearner(Y=df[,4],X=data.frame(df[,1]),SL.library=SL.library, 
                       family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL3

## 
## Call:  
## SuperLearner(Y = df[, 4], X = data.frame(df[, 1]), family = gaussian(),  
##     SL.library = SL.library, method = "method.NNLS", verbose = TRUE) 
## 
## 
##                            Risk       Coef
## SL.nnet_All            2.645687 0.00000000
## SL.glm_All             3.841096 0.00000000
## SL.randomForest_All    1.157085 0.39924907
## SL.glm.interaction_All 3.841096 0.02586760
## SL.gam_All             3.378688 0.00000000
## SL.polymars_All        1.141151 0.41216745
## SL.loess_All           2.450159 0.01753173
## SL.mean_All            4.570271 0.14518415

fit.SL4 <- SuperLearner(Y=df[,5],X=data.frame(df[,1]),SL.library=SL.library, 
                       family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL4

## 
## Call:  
## SuperLearner(Y = df[, 5], X = data.frame(df[, 1]), family = gaussian(),  
##     SL.library = SL.library, method = "method.NNLS", verbose = TRUE) 
## 
## 
##                            Risk Coef
## SL.nnet_All            4.370145    0
## SL.glm_All             3.978797    0
## SL.randomForest_All    2.774770    1
## SL.glm.interaction_All 3.978797    0
## SL.gam_All             4.039379    0
## SL.polymars_All        4.454660    0
## SL.loess_All           4.317657    0
## SL.mean_All            4.090017    0

Currently, we don’t have an estimate of risk of the SuperLearner.
We are only hopeful that the SuperLearner is doing a good job. That’s minimizing the risk (MSE) as compared to the individual algorithms in its library or improving the best single algorithm.
To this end we obtain(simulate) a different data set under the same conditions as before, to evaluate the performance of the SuperLearner ensemble itself through an “external” layer of cross-validation.

-We generate a separate set of sample that we don’t use to fit the SuperLearner, which allows it to be a good estimate of the SuperLearner’s performance on data it hasn’t seen.

# Generate n = 100 sample for SuperLearner Cross Validation. It takes too much time and memory to run this so I used a smaller sample compared to the text. van der Laan et al 2011 reports that it may take three times the time taken to fit the superlearner. 
set.seed(504)

n_SL = 100
x_SL <-  runif(n_SL, min = -4,max = 4) # single covariate 
u_SL <- rnorm(n_SL) # random noise

y1_SL <-  (-2*(if_else(x_SL < -3, 1, 0)) + 2.55*(if_else(x_SL > -2, 1, 0)) -
      2*(if_else(x_SL > 0, 1, 0)) + 4*(if_else(x_SL > 2, 1, 0)) - 
      1*(if_else(x_SL > 3, 1, 0)) + u_SL )

y2_SL <-  6 + (0.4*x_SL)-(0.36*x_SL*x_SL) + 0.005*(x_SL^3) + u_SL

y3_SL <-  2.83*sin((pi/2) * x_SL) + u_SL

y4_SL <-  4*sin(3*pi*x_SL)*(if_else(x_SL > 0, 1, 0)) + u_SL 

df_SL <- data.frame(cbind(x_SL,y1_SL, y2_SL, y3_SL, y4_SL))

# Data
datatable(df_SL,class = 'cell-border stripe', caption = 'Table 2: Simulated Data for SuperLearner CV.')%>%
  formatRound(c("x_SL", "y1_SL", "y2_SL", "y3_SL", "y4_SL"), 2)

Evaluate the SuperLearner itself using Cross Validation

Theory says we do well eventually, but did we do well in these data?
The risk of the super learner can be estimated using cross-validation.
We will use a 10 fold CV though the text suggested 20.
Also, we will obtain plots for each of the four simulations.

# Run the cross-validated super learner to obtain its CV risk
set.seed(1)

fitSL1.CV <- CV.SuperLearner(Y=df_SL[,2],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
                            verbose = TRUE, method = "method.NNLS", family = gaussian())
summary(fitSL1.CV)

## 
## Call:  
## CV.SuperLearner(Y = df_SL[, 2], X = data.frame(df_SL[, 1]), V = 10,  
##     family = gaussian(), SL.library = SL.library, method = "method.NNLS",  
##     verbose = TRUE) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  10 
## 
##               Algorithm    Ave      se     Min    Max
##           Super Learner 1.1171 0.14752 0.76960 1.6082
##             Discrete SL 1.1701 0.16993 0.87263 1.6658
##             SL.nnet_All 1.8283 0.22900 0.95866 4.9399
##              SL.glm_All 3.0761 0.28966 1.72337 4.9358
##     SL.randomForest_All 1.1701 0.16993 0.87263 1.6658
##  SL.glm.interaction_All 3.0761 0.28966 1.72337 4.9358
##              SL.gam_All 2.7052 0.25683 1.47956 4.4075
##         SL.polymars_All 1.4045 0.17864 0.41260 2.7557
##            SL.loess_All 1.9888 0.22194 0.93003 3.0497
##             SL.mean_All 4.7168 0.50421 2.53110 8.4355

plot(fitSL1.CV) + theme_bw()

fitSL2.CV <- CV.SuperLearner(Y=df_SL[,3],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
                            verbose = TRUE, method = "method.NNLS", family = gaussian())
summary(fitSL2.CV)

## 
## Call:  
## CV.SuperLearner(Y = df_SL[, 3], X = data.frame(df_SL[, 1]), V = 10,  
##     family = gaussian(), SL.library = SL.library, method = "method.NNLS",  
##     verbose = TRUE) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  10 
## 
##               Algorithm     Ave      se     Min     Max
##           Super Learner 0.91293 0.12065 0.29410  1.6889
##             Discrete SL 0.89193 0.11880 0.29410  1.6930
##             SL.nnet_All 1.82643 0.27695 0.66964  3.9083
##              SL.glm_All 4.34141 0.62676 1.96468  8.5711
##     SL.randomForest_All 1.20619 0.18533 0.51572  2.0806
##  SL.glm.interaction_All 4.34141 0.62676 1.96468  8.5711
##              SL.gam_All 1.36433 0.22718 0.47425  2.7353
##         SL.polymars_All 1.05640 0.13393 0.59806  1.9358
##            SL.loess_All 0.89193 0.11880 0.29410  1.6930
##             SL.mean_All 4.71100 0.81482 1.75957 11.3998

plot(fitSL2.CV) + theme_bw()

fitSL3.CV <- CV.SuperLearner(Y=df_SL[,4],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
                            verbose = TRUE, method = "method.NNLS", family = gaussian())
summary(fitSL3.CV)

## 
## Call:  
## CV.SuperLearner(Y = df_SL[, 4], X = data.frame(df_SL[, 1]), V = 10,  
##     family = gaussian(), SL.library = SL.library, method = "method.NNLS",  
##     verbose = TRUE) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  10 
## 
##               Algorithm    Ave      se     Min    Max
##           Super Learner 1.1809 0.16984 0.66819 2.1041
##             Discrete SL 1.2823 0.17759 0.76208 2.7181
##             SL.nnet_All 2.4697 0.34311 1.19566 4.3863
##              SL.glm_All 3.8472 0.46799 2.37147 4.8322
##     SL.randomForest_All 1.2445 0.18649 0.86253 1.7800
##  SL.glm.interaction_All 3.8472 0.46799 2.37147 4.8322
##              SL.gam_All 3.5040 0.42620 2.43734 4.3357
##         SL.polymars_All 1.2729 0.17740 0.76208 2.7181
##            SL.loess_All 2.6353 0.36964 1.02163 4.4569
##             SL.mean_All 4.3551 0.48772 2.64991 6.4080

plot(fitSL3.CV) + theme_bw()

fitSL4.CV <- CV.SuperLearner(Y=df_SL[,5],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
                            verbose = TRUE, method = "method.NNLS", family = gaussian())
 summary(fitSL4.CV)

## 
## Call:  
## CV.SuperLearner(Y = df_SL[, 5], X = data.frame(df_SL[, 1]), V = 10,  
##     family = gaussian(), SL.library = SL.library, method = "method.NNLS",  
##     verbose = TRUE) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  10 
## 
##               Algorithm    Ave      se     Min    Max
##           Super Learner 2.6909 0.42470 0.69539 5.3437
##             Discrete SL 2.7033 0.43178 0.69539 5.3437
##             SL.nnet_All 5.5342 0.75942 2.93337 9.2943
##              SL.glm_All 5.2863 0.69503 1.78854 8.6234
##     SL.randomForest_All 2.7033 0.43178 0.69539 5.3437
##  SL.glm.interaction_All 5.2863 0.69503 1.78854 8.6234
##              SL.gam_All 5.3511 0.70710 1.82323 8.6190
##         SL.polymars_All 5.5604 0.78063 2.05302 9.4540
##            SL.loess_All 5.3618 0.71918 1.97337 8.5717
##             SL.mean_All 5.1942 0.67691 1.77419 8.1053

plot(fitSL4.CV) + theme_bw()

We see in each case that he Superlearner has risk closest to the algorithm with the lowest risk.

Using Real dataset - Australian Athletes Data

Data Source: Cook and Weisberg 1994
Data name: ais: Australian Institute of Sport
Can be found in the locfit package under the name ais.
Outcome variable is LBM (lean body mass)

library("locfit")
data(ais)
ais <- ais 

summary(ais)

##      sex          sport         RCC             WCC        
##  female:100   Row    :37   Min.   :3.800   Min.   : 3.300  
##  male  :102   T_400m :29   1st Qu.:4.372   1st Qu.: 5.900  
##               B_Ball :25   Median :4.755   Median : 6.850  
##               Netball:23   Mean   :4.719   Mean   : 7.109  
##               Swim   :22   3rd Qu.:5.030   3rd Qu.: 8.275  
##               Field  :19   Max.   :6.720   Max.   :14.300  
##               (Other):47                                   
##        Hc              Hg             Ferr             BMI       
##  Min.   :35.90   Min.   :11.60   Min.   :  8.00   Min.   :16.75  
##  1st Qu.:40.60   1st Qu.:13.50   1st Qu.: 41.25   1st Qu.:21.08  
##  Median :43.50   Median :14.70   Median : 65.50   Median :22.72  
##  Mean   :43.09   Mean   :14.57   Mean   : 76.88   Mean   :22.96  
##  3rd Qu.:45.58   3rd Qu.:15.57   3rd Qu.: 97.00   3rd Qu.:24.46  
##  Max.   :59.70   Max.   :19.20   Max.   :234.00   Max.   :34.42  
##                                                                  
##       SSF              BFat             LBM               Ht       
##  Min.   : 28.00   Min.   : 5.630   Min.   : 34.36   Min.   :148.9  
##  1st Qu.: 43.85   1st Qu.: 8.545   1st Qu.: 54.67   1st Qu.:174.0  
##  Median : 58.60   Median :11.650   Median : 63.03   Median :179.7  
##  Mean   : 69.02   Mean   :13.507   Mean   : 64.87   Mean   :180.1  
##  3rd Qu.: 90.35   3rd Qu.:18.080   3rd Qu.: 74.75   3rd Qu.:186.2  
##  Max.   :200.80   Max.   :35.520   Max.   :106.00   Max.   :209.4  
##                                                                    
##        Wt        
##  Min.   : 37.80  
##  1st Qu.: 66.53  
##  Median : 74.40  
##  Mean   : 75.01  
##  3rd Qu.: 84.12  
##  Max.   :123.20  
##

# Reorder columns with first column being response
ais <- ais %>% 
  select(LBM, everything())

# Split data to use part for external CV of the Superlearner
set.seed(34)

sample <- sample.int(n = nrow(ais), size = floor(.50*nrow(ais)), replace = F)
train <- ais[sample, ]
test  <- ais[-sample, ]

Define a library of algorithms.
Fit the SuperLearner and obtain its coefficients.
Obtain the CV risk of the Superlearner so as to know how well it performed compared to the individual algorithms

# Specify library of algorithms
ais.SL.library <- c("SL.nnet", "SL.glm", "SL.randomForest", "SL.step.interaction",
                "SL.gam", "SL.polymars", "SL.loess", "SL.mean", 
                "SL.gbm","SL.polymars", "SL.ridge","SL.glmnet","SL.svm")

#Superlearner
ais.fit <- SuperLearner(Y=train[,1],X=(train[,4:13]), SL.library=ais.SL.library, 
                       family=gaussian(),method="method.NNLS")

ais.fit

## 
## Call:  
## SuperLearner(Y = train[, 1], X = (train[, 4:13]), family = gaussian(),  
##     SL.library = ais.SL.library, method = "method.NNLS") 
## 
## 
##                                 Risk         Coef
## SL.nnet_All             188.84062216 0.000000e+00
## SL.glm_All                0.61014534 0.000000e+00
## SL.randomForest_All      16.35805840 9.256753e-05
## SL.step.interaction_All   0.04147442 9.806138e-01
## SL.gam_All                0.58708097 0.000000e+00
## SL.polymars_All           0.27449112 0.000000e+00
## SL.loess_All            121.47244592 0.000000e+00
## SL.mean_All             188.84062199 0.000000e+00
## SL.gbm_All               18.94623339 0.000000e+00
## SL.polymars_All           0.27449112 0.000000e+00
## SL.ridge_All              0.77868860 1.859913e-02
## SL.glmnet_All             0.56394714 0.000000e+00
## SL.svm_All               16.86856570 6.944853e-04

# Obtain CV risk for CV and algorithms
ais.fit.CV <- CV.SuperLearner(Y=test[,1],X=(test[,4:13]), V=10, SL.library=ais.SL.library,
                             method = "method.NNLS", family = gaussian())

summary(ais.fit.CV)

## 
## Call:  
## CV.SuperLearner(Y = test[, 1], X = (test[, 4:13]), V = 10, family = gaussian(),  
##     SL.library = ais.SL.library, method = "method.NNLS") 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  10 
## 
##                Algorithm        Ave         se       Min       Max
##            Super Learner   0.061936  0.0092887  0.036054   0.11663
##              Discrete SL   0.056786  0.0100834  0.025064   0.09966
##              SL.nnet_All 159.517566 22.8458517 36.141781 376.38129
##               SL.glm_All   0.389047  0.0569923  0.113586   0.60976
##      SL.randomForest_All  22.141717  7.4248391  6.799904  91.93001
##  SL.step.interaction_All   0.056786  0.0100834  0.025064   0.09966
##               SL.gam_All   0.317371  0.0475677  0.086476   0.55648
##          SL.polymars_All   0.166853  0.0551810  0.045664   0.77941
##             SL.loess_All 116.150984 21.5466820 21.558099 217.09216
##              SL.mean_All 159.517566 22.8458517 36.141781 376.38129
##               SL.gbm_All  18.118818  7.1589877  3.771418  88.47116
##          SL.polymars_All   0.166853  0.0551810  0.045664   0.77941
##             SL.ridge_All   0.543278  0.0777229  0.205644   1.01760
##            SL.glmnet_All   0.376008  0.0631413  0.117992   0.67577
##               SL.svm_All  22.099259 13.3586991  0.673508 155.68404

plot(ais.fit.CV) + theme_bw()

SuperLearner : Using Simulation Study and Real Dataset to Access the Performance of the SuperLearner

Daniel A. Alhassan

2019-07-09

Install and load packages needed to run this notebook.

Simulation study

Define variables for Simulation

Fit the Superlearner

Evaluate the SuperLearner itself using Cross Validation

Using Real dataset - Australian Athletes Data