Install and load packages needed to run this notebook.
# You need the suggested packages to run this notebook
pkgs <- c("SuperLearner", "quadprog","plsgenomics","kernlab","MASS","ggplot2",
"nloptr","gam","gbm","nnet","polspline","nnls","BayesTree","glmnet",
"randomForest","class","e1071","stepPlr","arm","party","spls","LogicReg",
"SIS", "ipred","mlbench","rpart","caret","mda","earth", "cowplot",
"tidyverse", "locfit", "DT")
# see what packages are currently installed
installed_pacakges <- row.names(installed.packages())
# loop over the needed packages
for(p in pkgs){
# check if package is installed
already_installed <- p %in% installed_pacakges
# if not already installed, install it
if(!already_installed){
install.packages(p)
}
# and load package
library(p, character.only = TRUE)
}Simulation study
Define variables for Simulation
Create 4 simulated datasets to be used in training the model.
Peturb each response with some random noise.
Obtain plots of the four simulated data.
set.seed(123) # for reproducibility
n <- 100 # sample size
x <- runif(n, min = -4,max = 4) # single covariate
u <- rnorm(n) # random noise
# Response variables
y1 <- (-2*(if_else(x < -3, 1, 0)) + 2.55*(if_else(x > -2, 1, 0)) -
2*(if_else(x > 0, 1, 0)) + 4*(if_else(x > 2, 1, 0)) -
1*(if_else(x > 3, 1, 0)) + u )
y2 <- 6 + (0.4*x)-(0.36*x*x) + 0.005*(x^3) + u
y3 <- 2.83*sin((pi/2) * x) + u
y4 <- 4*sin(3*pi*x)*(if_else(x > 0, 1, 0)) + u
# Put variables in a dataframe
df <- data.frame(cbind(x, y1, y2, y3, y4))
# Display first 5 rows of dataframe
datatable(df,class = 'cell-border stripe', caption = 'Table 1: Simulated Data.')%>%
formatRound(c("x", "y1", "y2", "y3", "y4"), 2)## x y1 y2 y3
## Min. :-3.99500 Min. :-4.0532 Min. :-1.919 Min. :-4.4299
## 1st Qu.:-2.03624 1st Qu.:-0.1382 1st Qu.: 3.224 1st Qu.:-1.9180
## Median :-0.26903 Median : 1.7551 Median : 4.537 Median :-0.4467
## Mean :-0.01153 Mean : 1.6133 Mean : 4.090 Mean :-0.1476
## 3rd Qu.: 2.04377 3rd Qu.: 3.4698 3rd Qu.: 5.516 3rd Qu.: 1.6085
## Max. : 3.95416 Max. : 5.6468 Max. : 7.968 Max. : 4.0240
## y4
## Min. :-5.0855
## 1st Qu.:-1.4760
## Median :-0.1051
## Mean :-0.3917
## 3rd Qu.: 0.9332
## Max. : 3.6745
# Obtain plots of four simulation sets
plot1 <- ggplot(df, aes(x=x, y=y1)) +
geom_point() +
labs(title = "Simulation 1")
plot2 <- ggplot(df, aes(x=x, y=y2)) +
geom_point() +
labs(title = "Simulation 2")
plot3 <- ggplot(df, aes(x=x, y=y3)) +
geom_point() +
labs(title = "Simulation 3")
plot4 <- ggplot(df, aes(x=x, y=y4)) +
geom_point() +
labs(title = "Simulation 4")
plot_grid(plot1, plot2, plot3, plot4, labels = "AUTO")Fit the Superlearner
Define a library of algorithms to be used. This will depend on whether the task at hand is a (binary) classification or regression problem.
Use the defined library of algorithms in the call of Superlearner function for each simluated data.
Obtain plots of the four simulated data.
# Specify a library of algorithms
SL.library <- c("SL.nnet", "SL.glm", "SL.randomForest", "SL.glm.interaction",
"SL.gam", "SL.polymars", "SL.loess", "SL.mean")
# Run the SuperLearner function to obtain predicted values and
# CV Risk for each of the algorithms in the library defined above.
# Risk is defined by default as MSE
set.seed(1)
fit.SL1 <- SuperLearner(Y=df[,2],X=data.frame(df[,1]),SL.library=SL.library,
family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL1##
## Call:
## SuperLearner(Y = df[, 2], X = data.frame(df[, 1]), family = gaussian(),
## SL.library = SL.library, method = "method.NNLS", verbose = TRUE)
##
##
## Risk Coef
## SL.nnet_All 2.150153 0.003471064
## SL.glm_All 3.068761 0.000000000
## SL.randomForest_All 1.091711 0.741462952
## SL.glm.interaction_All 3.068761 0.000000000
## SL.gam_All 2.728809 0.021855334
## SL.polymars_All 1.397295 0.166526392
## SL.loess_All 2.044233 0.066684259
## SL.mean_All 5.154050 0.000000000
fit.SL2 <- SuperLearner(Y=df[,3],X=data.frame(df[,1]),SL.library=SL.library,
family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL2##
## Call:
## SuperLearner(Y = df[, 3], X = data.frame(df[, 1]), family = gaussian(),
## SL.library = SL.library, method = "method.NNLS", verbose = TRUE)
##
##
## Risk Coef
## SL.nnet_All 1.1464044 0.2201477
## SL.glm_All 3.6321501 0.0000000
## SL.randomForest_All 1.0865861 0.1986596
## SL.glm.interaction_All 3.6321501 0.0000000
## SL.gam_All 1.2898621 0.0000000
## SL.polymars_All 0.9725886 0.4419255
## SL.loess_All 1.0161857 0.1392672
## SL.mean_All 4.4187054 0.0000000
fit.SL3 <- SuperLearner(Y=df[,4],X=data.frame(df[,1]),SL.library=SL.library,
family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL3##
## Call:
## SuperLearner(Y = df[, 4], X = data.frame(df[, 1]), family = gaussian(),
## SL.library = SL.library, method = "method.NNLS", verbose = TRUE)
##
##
## Risk Coef
## SL.nnet_All 2.645687 0.00000000
## SL.glm_All 3.841096 0.00000000
## SL.randomForest_All 1.157085 0.39924907
## SL.glm.interaction_All 3.841096 0.02586760
## SL.gam_All 3.378688 0.00000000
## SL.polymars_All 1.141151 0.41216745
## SL.loess_All 2.450159 0.01753173
## SL.mean_All 4.570271 0.14518415
fit.SL4 <- SuperLearner(Y=df[,5],X=data.frame(df[,1]),SL.library=SL.library,
family=gaussian(),method="method.NNLS", verbose=TRUE)
fit.SL4##
## Call:
## SuperLearner(Y = df[, 5], X = data.frame(df[, 1]), family = gaussian(),
## SL.library = SL.library, method = "method.NNLS", verbose = TRUE)
##
##
## Risk Coef
## SL.nnet_All 4.370145 0
## SL.glm_All 3.978797 0
## SL.randomForest_All 2.774770 1
## SL.glm.interaction_All 3.978797 0
## SL.gam_All 4.039379 0
## SL.polymars_All 4.454660 0
## SL.loess_All 4.317657 0
## SL.mean_All 4.090017 0
Currently, we don’t have an estimate of risk of the SuperLearner.
We are only hopeful that the SuperLearner is doing a good job. That’s minimizing the risk (MSE) as compared to the individual algorithms in its library or improving the best single algorithm.
To this end we obtain(simulate) a different data set under the same conditions as before, to evaluate the performance of the SuperLearner ensemble itself through an “external” layer of cross-validation.
-We generate a separate set of sample that we don’t use to fit the SuperLearner, which allows it to be a good estimate of the SuperLearner’s performance on data it hasn’t seen.
# Generate n = 100 sample for SuperLearner Cross Validation. It takes too much time and memory to run this so I used a smaller sample compared to the text. van der Laan et al 2011 reports that it may take three times the time taken to fit the superlearner.
set.seed(504)
n_SL = 100
x_SL <- runif(n_SL, min = -4,max = 4) # single covariate
u_SL <- rnorm(n_SL) # random noise
y1_SL <- (-2*(if_else(x_SL < -3, 1, 0)) + 2.55*(if_else(x_SL > -2, 1, 0)) -
2*(if_else(x_SL > 0, 1, 0)) + 4*(if_else(x_SL > 2, 1, 0)) -
1*(if_else(x_SL > 3, 1, 0)) + u_SL )
y2_SL <- 6 + (0.4*x_SL)-(0.36*x_SL*x_SL) + 0.005*(x_SL^3) + u_SL
y3_SL <- 2.83*sin((pi/2) * x_SL) + u_SL
y4_SL <- 4*sin(3*pi*x_SL)*(if_else(x_SL > 0, 1, 0)) + u_SL
df_SL <- data.frame(cbind(x_SL,y1_SL, y2_SL, y3_SL, y4_SL))
# Data
datatable(df_SL,class = 'cell-border stripe', caption = 'Table 2: Simulated Data for SuperLearner CV.')%>%
formatRound(c("x_SL", "y1_SL", "y2_SL", "y3_SL", "y4_SL"), 2)Evaluate the SuperLearner itself using Cross Validation
Theory says we do well eventually, but did we do well in these data?
The risk of the super learner can be estimated using cross-validation.
We will use a 10 fold CV though the text suggested 20.
Also, we will obtain plots for each of the four simulations.
# Run the cross-validated super learner to obtain its CV risk
set.seed(1)
fitSL1.CV <- CV.SuperLearner(Y=df_SL[,2],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
verbose = TRUE, method = "method.NNLS", family = gaussian())
summary(fitSL1.CV)##
## Call:
## CV.SuperLearner(Y = df_SL[, 2], X = data.frame(df_SL[, 1]), V = 10,
## family = gaussian(), SL.library = SL.library, method = "method.NNLS",
## verbose = TRUE)
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 1.1171 0.14752 0.76960 1.6082
## Discrete SL 1.1701 0.16993 0.87263 1.6658
## SL.nnet_All 1.8283 0.22900 0.95866 4.9399
## SL.glm_All 3.0761 0.28966 1.72337 4.9358
## SL.randomForest_All 1.1701 0.16993 0.87263 1.6658
## SL.glm.interaction_All 3.0761 0.28966 1.72337 4.9358
## SL.gam_All 2.7052 0.25683 1.47956 4.4075
## SL.polymars_All 1.4045 0.17864 0.41260 2.7557
## SL.loess_All 1.9888 0.22194 0.93003 3.0497
## SL.mean_All 4.7168 0.50421 2.53110 8.4355
fitSL2.CV <- CV.SuperLearner(Y=df_SL[,3],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
verbose = TRUE, method = "method.NNLS", family = gaussian())
summary(fitSL2.CV)##
## Call:
## CV.SuperLearner(Y = df_SL[, 3], X = data.frame(df_SL[, 1]), V = 10,
## family = gaussian(), SL.library = SL.library, method = "method.NNLS",
## verbose = TRUE)
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 0.91293 0.12065 0.29410 1.6889
## Discrete SL 0.89193 0.11880 0.29410 1.6930
## SL.nnet_All 1.82643 0.27695 0.66964 3.9083
## SL.glm_All 4.34141 0.62676 1.96468 8.5711
## SL.randomForest_All 1.20619 0.18533 0.51572 2.0806
## SL.glm.interaction_All 4.34141 0.62676 1.96468 8.5711
## SL.gam_All 1.36433 0.22718 0.47425 2.7353
## SL.polymars_All 1.05640 0.13393 0.59806 1.9358
## SL.loess_All 0.89193 0.11880 0.29410 1.6930
## SL.mean_All 4.71100 0.81482 1.75957 11.3998
fitSL3.CV <- CV.SuperLearner(Y=df_SL[,4],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
verbose = TRUE, method = "method.NNLS", family = gaussian())
summary(fitSL3.CV)##
## Call:
## CV.SuperLearner(Y = df_SL[, 4], X = data.frame(df_SL[, 1]), V = 10,
## family = gaussian(), SL.library = SL.library, method = "method.NNLS",
## verbose = TRUE)
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 1.1809 0.16984 0.66819 2.1041
## Discrete SL 1.2823 0.17759 0.76208 2.7181
## SL.nnet_All 2.4697 0.34311 1.19566 4.3863
## SL.glm_All 3.8472 0.46799 2.37147 4.8322
## SL.randomForest_All 1.2445 0.18649 0.86253 1.7800
## SL.glm.interaction_All 3.8472 0.46799 2.37147 4.8322
## SL.gam_All 3.5040 0.42620 2.43734 4.3357
## SL.polymars_All 1.2729 0.17740 0.76208 2.7181
## SL.loess_All 2.6353 0.36964 1.02163 4.4569
## SL.mean_All 4.3551 0.48772 2.64991 6.4080
fitSL4.CV <- CV.SuperLearner(Y=df_SL[,5],X=data.frame(df_SL[,1]), V=10, SL.library=SL.library,
verbose = TRUE, method = "method.NNLS", family = gaussian())
summary(fitSL4.CV)##
## Call:
## CV.SuperLearner(Y = df_SL[, 5], X = data.frame(df_SL[, 1]), V = 10,
## family = gaussian(), SL.library = SL.library, method = "method.NNLS",
## verbose = TRUE)
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 2.6909 0.42470 0.69539 5.3437
## Discrete SL 2.7033 0.43178 0.69539 5.3437
## SL.nnet_All 5.5342 0.75942 2.93337 9.2943
## SL.glm_All 5.2863 0.69503 1.78854 8.6234
## SL.randomForest_All 2.7033 0.43178 0.69539 5.3437
## SL.glm.interaction_All 5.2863 0.69503 1.78854 8.6234
## SL.gam_All 5.3511 0.70710 1.82323 8.6190
## SL.polymars_All 5.5604 0.78063 2.05302 9.4540
## SL.loess_All 5.3618 0.71918 1.97337 8.5717
## SL.mean_All 5.1942 0.67691 1.77419 8.1053
- We see in each case that he Superlearner has risk closest to the algorithm with the lowest risk.
Using Real dataset - Australian Athletes Data
Data Source: Cook and Weisberg 1994
Data name: ais: Australian Institute of Sport
Can be found in the
locfitpackage under the nameais.Outcome variable is LBM (lean body mass)
## sex sport RCC WCC
## female:100 Row :37 Min. :3.800 Min. : 3.300
## male :102 T_400m :29 1st Qu.:4.372 1st Qu.: 5.900
## B_Ball :25 Median :4.755 Median : 6.850
## Netball:23 Mean :4.719 Mean : 7.109
## Swim :22 3rd Qu.:5.030 3rd Qu.: 8.275
## Field :19 Max. :6.720 Max. :14.300
## (Other):47
## Hc Hg Ferr BMI
## Min. :35.90 Min. :11.60 Min. : 8.00 Min. :16.75
## 1st Qu.:40.60 1st Qu.:13.50 1st Qu.: 41.25 1st Qu.:21.08
## Median :43.50 Median :14.70 Median : 65.50 Median :22.72
## Mean :43.09 Mean :14.57 Mean : 76.88 Mean :22.96
## 3rd Qu.:45.58 3rd Qu.:15.57 3rd Qu.: 97.00 3rd Qu.:24.46
## Max. :59.70 Max. :19.20 Max. :234.00 Max. :34.42
##
## SSF BFat LBM Ht
## Min. : 28.00 Min. : 5.630 Min. : 34.36 Min. :148.9
## 1st Qu.: 43.85 1st Qu.: 8.545 1st Qu.: 54.67 1st Qu.:174.0
## Median : 58.60 Median :11.650 Median : 63.03 Median :179.7
## Mean : 69.02 Mean :13.507 Mean : 64.87 Mean :180.1
## 3rd Qu.: 90.35 3rd Qu.:18.080 3rd Qu.: 74.75 3rd Qu.:186.2
## Max. :200.80 Max. :35.520 Max. :106.00 Max. :209.4
##
## Wt
## Min. : 37.80
## 1st Qu.: 66.53
## Median : 74.40
## Mean : 75.01
## 3rd Qu.: 84.12
## Max. :123.20
##
# Reorder columns with first column being response
ais <- ais %>%
select(LBM, everything())
# Split data to use part for external CV of the Superlearner
set.seed(34)
sample <- sample.int(n = nrow(ais), size = floor(.50*nrow(ais)), replace = F)
train <- ais[sample, ]
test <- ais[-sample, ]Define a library of algorithms.
Fit the SuperLearner and obtain its coefficients.
Obtain the CV risk of the Superlearner so as to know how well it performed compared to the individual algorithms
# Specify library of algorithms
ais.SL.library <- c("SL.nnet", "SL.glm", "SL.randomForest", "SL.step.interaction",
"SL.gam", "SL.polymars", "SL.loess", "SL.mean",
"SL.gbm","SL.polymars", "SL.ridge","SL.glmnet","SL.svm")
#Superlearner
ais.fit <- SuperLearner(Y=train[,1],X=(train[,4:13]), SL.library=ais.SL.library,
family=gaussian(),method="method.NNLS")
ais.fit##
## Call:
## SuperLearner(Y = train[, 1], X = (train[, 4:13]), family = gaussian(),
## SL.library = ais.SL.library, method = "method.NNLS")
##
##
## Risk Coef
## SL.nnet_All 188.84062216 0.000000e+00
## SL.glm_All 0.61014534 0.000000e+00
## SL.randomForest_All 16.35805840 9.256753e-05
## SL.step.interaction_All 0.04147442 9.806138e-01
## SL.gam_All 0.58708097 0.000000e+00
## SL.polymars_All 0.27449112 0.000000e+00
## SL.loess_All 121.47244592 0.000000e+00
## SL.mean_All 188.84062199 0.000000e+00
## SL.gbm_All 18.94623339 0.000000e+00
## SL.polymars_All 0.27449112 0.000000e+00
## SL.ridge_All 0.77868860 1.859913e-02
## SL.glmnet_All 0.56394714 0.000000e+00
## SL.svm_All 16.86856570 6.944853e-04
# Obtain CV risk for CV and algorithms
ais.fit.CV <- CV.SuperLearner(Y=test[,1],X=(test[,4:13]), V=10, SL.library=ais.SL.library,
method = "method.NNLS", family = gaussian())
summary(ais.fit.CV)##
## Call:
## CV.SuperLearner(Y = test[, 1], X = (test[, 4:13]), V = 10, family = gaussian(),
## SL.library = ais.SL.library, method = "method.NNLS")
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 10
##
## Algorithm Ave se Min Max
## Super Learner 0.061936 0.0092887 0.036054 0.11663
## Discrete SL 0.056786 0.0100834 0.025064 0.09966
## SL.nnet_All 159.517566 22.8458517 36.141781 376.38129
## SL.glm_All 0.389047 0.0569923 0.113586 0.60976
## SL.randomForest_All 22.141717 7.4248391 6.799904 91.93001
## SL.step.interaction_All 0.056786 0.0100834 0.025064 0.09966
## SL.gam_All 0.317371 0.0475677 0.086476 0.55648
## SL.polymars_All 0.166853 0.0551810 0.045664 0.77941
## SL.loess_All 116.150984 21.5466820 21.558099 217.09216
## SL.mean_All 159.517566 22.8458517 36.141781 376.38129
## SL.gbm_All 18.118818 7.1589877 3.771418 88.47116
## SL.polymars_All 0.166853 0.0551810 0.045664 0.77941
## SL.ridge_All 0.543278 0.0777229 0.205644 1.01760
## SL.glmnet_All 0.376008 0.0631413 0.117992 0.67577
## SL.svm_All 22.099259 13.3586991 0.673508 155.68404