In detail please see the reference by Ashley I Naimi.
library(nnls) #non-negative least squares
library(earth) #Multivariate Adaptive Regression Splines
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
library(gam) #A generalized additive model
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-2
library(rmutil) #A toolkit of functions for nonlinear regression and repeated measurements
##
## Attaching package: 'rmutil'
## The following object is masked from 'package:foreach':
##
## times
## The following object is masked from 'package:stats':
##
## nobs
## The following objects are masked from 'package:base':
##
## as.data.frame, units
library(data.table)
Simulate data
# set the seed for reproducibility
set.seed(123)
# generate the observed data
n = 1000
x = runif(n, 0, 8)
y = 5 + 4 * sqrt(9 * x) * as.numeric(x <
2) + as.numeric(x >= 2) * (abs(x - 6)^(2)) +
rlaplace(n)
D <- data.frame(x, y) # observed data
Specify the number of folds for v fold cross validation
# lapply, Apply a Function over a List or Vector
folds=5
index <- split(1:1000, 1:folds) #random or not
splt <- lapply(1:folds, function(ind) D[index[[ind]],
])
# build model by split data for cross validation
m1 <- lapply(1:folds, function(ii) gam(y ~
s(x, 5), family = "gaussian", data = rbindlist(splt[-ii])))
m2 <- lapply(1:folds, function(ii) earth(y ~
x, data = rbindlist(splt[-ii]), degree = 2,
penalty = 3, nk = 21, pmethod = "backward",
nfold = 0, ncross = 1, minspan = 0, endspan = 0))
# m1[[1]]
Predict the outcomes in the ii th validation set (each subject be used as validation test in turn, named p1)
p1 <- lapply(1:folds, function(ii) predict(m1[[ii]],
newdata = rbindlist(splt[ii]), type = "response"))
p2 <- lapply(1:folds, function(ii) predict(m2[[ii]],
newdata = rbindlist(splt[ii]), type = "response"))
Predictions grouped by ‘splt’ in the whole data set
# add the predictions to grouped dataset by 'splt'
for (i in 1:folds) {
splt[[i]] <- cbind(splt[[i]], p1[[i]],
p2[[i]])
}
# all be saved in list
head(splt[[1]]) #only one list
## x y p1[[i]] y
## 1 2.300620 18.082553 18.315169 19.529790
## 6 0.364452 12.199149 12.705405 11.722377
## 11 7.654667 7.659093 7.570780 7.866627
## 16 7.198600 7.229326 6.527277 6.541223
## 21 7.116315 6.165418 6.355935 6.302089
## 26 5.668244 4.659275 4.993494 4.934016
head(splt[[2]]) #only one list
## x y p1[[i]] y
## 2 6.306441 5.301833 5.098293 4.984760
## 7 4.224844 8.943891 8.192367 8.251329
## 12 3.626673 10.249762 11.132650 10.179693
## 17 1.968702 21.964325 19.215480 19.657218
## 22 5.542427 6.089999 5.038031 4.891809
## 27 4.352528 7.891969 7.677124 7.839704
Calculate cv risk/mean Squared Error in each ii validation set
risk1 <- lapply(1:folds, function(ii) mean((splt[[ii]][,
2] - splt[[ii]][, 3])^2))
risk2 <- lapply(1:folds, function(ii) mean((splt[[ii]][,
2] - splt[[ii]][, 4])^2))
# length(risk1) is 5 length
# average the estimated risks
## across the 5 folds
a <- rbind(cbind("gam", mean(do.call(rbind,
risk1), na.rm = T)), cbind("earth", mean(do.call(rbind,
risk2), na.rm = T)))
a
## [,1] [,2]
## [1,] "gam" "2.4469015494432"
## [2,] "earth" "2.30911178845963"
Estimate SL weights using nnls regression
# lapply() applies a given function for each element in a list, so there will be several function calls. do. call() applies a given function to the list as a whole, so there is only one function call.
# combine the datasets by row for regression
X <- data.frame(do.call(rbind, splt))[, -1]
names(X) <- c("y", "gam", "earth")
head(X)
## y gam earth
## 1 18.082553 18.315169 19.529790
## 6 12.199149 12.705405 11.722377
## 11 7.659093 7.570780 7.866627
## 16 7.229326 6.527277 6.541223
## 21 6.165418 6.355935 6.302089
## 26 4.659275 4.993494 4.934016
# using nnls regression to calculate the coefficient
SL.r <- nnls(cbind(X[, 2], X[, 3]), X[, 1])$x
alpha <- as.matrix(SL.r/sum(SL.r))
round(alpha, 3)
## [,1]
## [1,] 0.338
## [2,] 0.662
SL.r
## [1] 0.3388756 0.6639334
sum(SL.r)
## [1] 1.002809
Generate predictions using original data with weights
## fit all algorithms to original data
m1 <- gam(y ~ s(x, 5), family = "gaussian",
data = D)
m2 <- earth(y ~ x, data = D, degree = 2,
penalty = 3, nk = 21, pmethod = "backward",
nfold = 0, ncross = 1, minspan = 0, endspan = 0)
## predict from each fit using all data
p1 <- predict(m1, newdata = D, type = "response")
p2 <- predict(m2, newdata = D, type = "response")
predictions <- cbind(p1, p2)
## for the observed data take a
## weighted combination of predictions
## using nnls coeficients as weights
y_pred <- predictions %*% alpha
# prediction by combining two learners