SuperLearner

For details, see the super-learning tutorial by Ashley I. Naimi (e.g., Naimi and Balzer, "Stacked Generalization: An Introduction to Super Learning").

library(nnls) #non-negative least squares 
library(earth) #Multivariate Adaptive Regression Splines
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
library(gam) #A generalized additive model
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-2
library(rmutil) #A toolkit of functions for nonlinear regression and repeated measurements; used here for rlaplace()
## 
## Attaching package: 'rmutil'
## The following object is masked from 'package:foreach':
## 
##     times
## The following object is masked from 'package:stats':
## 
##     nobs
## The following objects are masked from 'package:base':
## 
##     as.data.frame, units
library(data.table) #for rbindlist()

Simulate data

# set the seed for reproducibility
set.seed(123)
# generate the observed data: a piecewise nonlinear mean plus Laplace noise
n = 1000
x = runif(n, 0, 8)
y = 5 + 4 * sqrt(9 * x) * as.numeric(x < 2) +
    as.numeric(x >= 2) * (abs(x - 6)^2) +
    rlaplace(n)
D <- data.frame(x, y)  # observed data
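As an optional check (not part of the original workflow), a quick base-R scatterplot shows the piecewise shape of the simulated relationship:

# optional: visualize the simulated data
plot(D$x, D$y, pch = 20, cex = 0.5, xlab = "x", ylab = "y",
    main = "Simulated piecewise relationship with Laplace noise")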

Specify the number of folds for V-fold cross-validation

folds = 5

# split() assigns the observation indices to folds systematically
# (rows 1, 6, 11, ... go to fold 1, and so on), not at random
index <- split(1:n, 1:folds)

# lapply() builds the list of fold-specific data sets
splt <- lapply(1:folds, function(ind) D[index[[ind]], ])
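If randomized folds are preferred, one alternative (a sketch only, not used in the rest of the example) is to permute the row indices before splitting:

# alternative fold assignment (sketch, not used below): shuffle the rows first
index_random <- split(sample(1:n), 1:folds)
sapply(index_random, length)  # each fold still receives n/folds = 200 rows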

# fit each candidate learner on the training data (all folds except fold ii)
m1 <- lapply(1:folds, function(ii) gam(y ~ s(x, 5), family = "gaussian",
    data = rbindlist(splt[-ii])))

m2 <- lapply(1:folds, function(ii) earth(y ~ x, data = rbindlist(splt[-ii]),
    degree = 2, penalty = 3, nk = 21, pmethod = "backward",
    nfold = 0, ncross = 1, minspan = 0, endspan = 0))
# m1[[1]]  # inspect the GAM fitted without fold 1
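The candidate library is not limited to two learners. As an illustration only (not carried through the rest of the example), a third candidate such as a polynomial linear model could be fit on the same training folds:

# hypothetical third candidate learner (sketch): a degree-4 polynomial regression
m3 <- lapply(1:folds, function(ii) lm(y ~ poly(x, 4),
    data = rbindlist(splt[-ii])))
# its held-out predictions would be added alongside p1 and p2 below
# and would enter the nnls step as one more column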

Predict the outcomes in the ii-th validation set (each fold serves as the validation set in turn; the held-out predictions are stored as p1 and p2)

p1 <- lapply(1:folds, function(ii) predict(m1[[ii]],
    newdata = rbindlist(splt[ii]), type = "response"))

p2 <- lapply(1:folds, function(ii) predict(m2[[ii]],
    newdata = rbindlist(splt[ii]), type = "response"))
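A quick sanity check (optional) is that each vector of held-out predictions matches the size of its validation fold:

sapply(p1, length)  # 200 predictions per fold (= n/folds)
sapply(splt, nrow)  # 200 rows per fold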

Add the predictions to the fold-specific data sets in 'splt'

# append the two prediction columns to each fold-specific data set
for (i in 1:folds) {
    splt[[i]] <- cbind(splt[[i]], p1[[i]], p2[[i]])
}

# the result is still a list of 5 fold-specific data sets
head(splt[[1]])  # first fold (the odd column names come from the cbind above)
##           x         y   p1[[i]]         y
## 1  2.300620 18.082553 18.315169 19.529790
## 6  0.364452 12.199149 12.705405 11.722377
## 11 7.654667  7.659093  7.570780  7.866627
## 16 7.198600  7.229326  6.527277  6.541223
## 21 7.116315  6.165418  6.355935  6.302089
## 26 5.668244  4.659275  4.993494  4.934016
head(splt[[2]])  # second fold
##           x         y   p1[[i]]         y
## 2  6.306441  5.301833  5.098293  4.984760
## 7  4.224844  8.943891  8.192367  8.251329
## 12 3.626673 10.249762 11.132650 10.179693
## 17 1.968702 21.964325 19.215480 19.657218
## 22 5.542427  6.089999  5.038031  4.891809
## 27 4.352528  7.891969  7.677124  7.839704

Calculate the cross-validated risk (mean squared error) for each learner in each validation set

# column 2 is the observed y, column 3 the GAM prediction, column 4 the earth prediction
risk1 <- lapply(1:folds, function(ii) mean((splt[[ii]][, 2] - splt[[ii]][, 3])^2))

risk2 <- lapply(1:folds, function(ii) mean((splt[[ii]][, 2] - splt[[ii]][, 4])^2))

# risk1 and risk2 each hold 5 fold-specific risks;
# average the estimated risks across the 5 folds for each learner
a <- rbind(cbind("gam", mean(do.call(rbind, risk1), na.rm = T)),
    cbind("earth", mean(do.call(rbind, risk2), na.rm = T)))
a
##      [,1]    [,2]              
## [1,] "gam"   "2.4469015494432" 
## [2,] "earth" "2.30911178845963"

Estimate SL weights using nnls regression

# lapply() applies a function to each element of a list (one call per element); do.call() passes the whole list as the arguments of a single function call.

# combine the fold-specific data sets by row and drop the x column,
# keeping the observed y and the two cross-validated predictions
X <- data.frame(do.call(rbind, splt))[, -1]

names(X) <- c("y", "gam", "earth")
head(X)
##            y       gam     earth
## 1  18.082553 18.315169 19.529790
## 6  12.199149 12.705405 11.722377
## 11  7.659093  7.570780  7.866627
## 16  7.229326  6.527277  6.541223
## 21  6.165418  6.355935  6.302089
## 26  4.659275  4.993494  4.934016
# non-negative least squares of y on the two cross-validated predictions
SL.r <- nnls(cbind(X[, 2], X[, 3]), X[, 1])$x

alpha <- as.matrix(SL.r/sum(SL.r))  # normalize the weights to sum to 1

round(alpha, 3)
##       [,1]
## [1,] 0.338
## [2,] 0.662
SL.r
## [1] 0.3388756 0.6639334
sum(SL.r)
## [1] 1.002809
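As a check (not in the original code), the cross-validated risk of the weighted combination can be computed from the held-out predictions already stored in X; it is typically competitive with, or better than, the best single learner:

# CV risk of the nnls-weighted combination of the two learners
sl_cv_pred <- as.matrix(X[, c("gam", "earth")]) %*% alpha
mean((X$y - sl_cv_pred)^2)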

Generate predictions by refitting each learner on the full data and applying the weights

## fit all algorithms to the original (full) data
m1 <- gam(y ~ s(x, 5), family = "gaussian", data = D)
m2 <- earth(y ~ x, data = D, degree = 2, penalty = 3, nk = 21,
    pmethod = "backward", nfold = 0, ncross = 1, minspan = 0, endspan = 0)
## predict from each fit using all the data
p1 <- predict(m1, newdata = D, type = "response")
p2 <- predict(m2, newdata = D, type = "response")
predictions <- cbind(p1, p2)
## take a weighted combination of the predictions,
## using the nnls coefficients as weights
y_pred <- predictions %*% alpha  # super learner prediction combining the two learners
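For comparison, the same ensemble can be fit with the SuperLearner package, which automates the fold splitting, cross-validated prediction, and nnls weighting done by hand above. This is a sketch assuming the SuperLearner package is installed; its default SL.gam and SL.earth wrapper settings differ from the hand-coded fits, so the weights will be similar but not identical.

library(SuperLearner)

sl_fit <- SuperLearner(Y = D$y, X = D["x"], family = gaussian(),
    SL.library = c("SL.gam", "SL.earth"),
    method = "method.NNLS", cvControl = list(V = folds))
sl_fit$coef  # normalized nnls weights, analogous to alpha above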