For details, see the reference by Ashley I. Naimi.
Scale variables
library(sl3)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# z-score a numeric vector: center at the mean, scale by the SD (ignoring NAs)
scale_ <- function(x) {
    (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
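As a quick, illustrative check (not part of the original analysis), the helper should return a vector with mean 0 and standard deviation 1, passing NAs through:
# illustrative check of scale_: mean ~0 and SD 1 after scaling, NAs preserved
x <- c(1, 2, 3, NA, 5)
mean(scale_(x), na.rm = TRUE)  # ~0 (up to floating-point error)
sd(scale_(x), na.rm = TRUE)    # 1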
# dichotomize the outcome: wt_delta = 1 if weight change exceeds the median
nhefs <- read.csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\nhefs_an.csv") %>%
    mutate(
        wt_delta = as.numeric(wt82_71 > median(wt82_71)),
        age = scale_(age),
        sbp = scale_(sbp),
        dbp = scale_(dbp),
        price71 = scale_(price71),
        tax71 = scale_(tax71)
    ) %>%
    select(-wt82_71)
head(nhefs)
## seqn qsmk sex age income sbp dbp price71 tax71
## 1 233 0 0 -0.11221830 19 2.5010972 1.74466329 0.1970824 0.2038726
## 2 235 0 0 -0.61387187 18 -0.2836708 0.22420965 0.9223406 1.4371534
## 3 244 0 1 1.05830670 15 -0.7120966 -0.25093211 -2.5334931 -2.3830370
## 4 245 0 0 2.06161385 15 1.0551600 0.03415294 -2.8136078 -2.5068235
## 5 252 0 0 -0.27943615 18 -0.5514369 -0.06087541 0.9223406 1.4371534
## 6 257 0 1 -0.02860937 11 0.6802874 0.50929471 0.3143397 0.4502996
## race wt_delta
## 1 1 0
## 2 0 1
## 3 1 1
## 4 1 1
## 5 0 1
## 6 1 1
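Since wt_delta is defined by a median split, the two classes should be close to balanced; a quick check (illustrative, base R only):
# sanity check: the median split should yield a roughly balanced outcome
table(nhefs$wt_delta)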
Create the prediction task
# Begin modeling nhefs with the super learner
# (i.e., use the covariates to predict the outcome)
task <- make_sl3_Task(
    data = nhefs,
    outcome = "wt_delta",
    covariates = c("qsmk", "age", "sbp", "dbp", "price71",
        "tax71", "sex", "income", "race"),
    folds = 5
)
# qsmk enters as a covariate here; cross-validation uses 5 folds
# let's look at the task
task
## An sl3 Task with 1394 obs and these nodes:
## $covariates
## [1] "qsmk" "age" "sbp" "dbp" "price71" "tax71" "sex"
## [8] "income" "race"
##
## $outcome
## [1] "wt_delta"
##
## $id
## NULL
##
## $weights
## NULL
##
## $offset
## NULL
##
## $time
## NULL
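To peek at what the task stores, sl3 tasks expose accessors such as task$X and task$Y (a minimal sketch; these accessor names follow current sl3 and may vary by version):
# inspect the task's contents (accessor names assumed from current sl3)
head(task$X)        # covariate data
head(task$Y)        # outcome vector
length(task$folds)  # number of cross-validation folds (5)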
Construct a library of learners
# default learners, with no tuning parameters changed
lrn_glm <- Lrnr_glm$new()
lrn_mean <- Lrnr_mean$new()
# create ridge and lasso regression
lrn_ridge <- Lrnr_glmnet$new(alpha = 0)  # ridge: shrinks coefficients, keeps all variables
lrn_lasso <- Lrnr_glmnet$new(alpha = 1)  # lasso: can zero out coefficients (variable selection)
# create xgboost and ranger:
lrn_ranger <- Lrnr_ranger$new()
lrn_xgb <- Lrnr_xgboost$new()
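To see what other candidate learners are available, sl3 provides sl3_list_learners(); the call below is a sketch, and the results depend on which modeling packages are installed:
# list candidate learners that support a binary outcome
sl3_list_learners("binomial")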
Combine learners
stack <- Stack$new(lrn_glm, lrn_mean, lrn_ridge,
                   lrn_lasso, lrn_ranger, lrn_xgb)
stack
## [1] "Lrnr_glm_TRUE"
## [2] "Lrnr_mean"
## [3] "Lrnr_glmnet_NULL_deviance_10_0_100_TRUE_FALSE"
## [4] "Lrnr_glmnet_NULL_deviance_10_1_100_TRUE_FALSE"
## [5] "Lrnr_ranger_500_TRUE_none_1"
## [6] "Lrnr_xgboost_20_1"
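A stack is itself a learner: training it fits every candidate on the same task, and its predictions contain one column per candidate. A minimal sketch, assuming sl3's usual Stack behavior:
# a stack can be trained directly; predictions have one column per learner
stack_fit <- stack$train(task)
head(stack_fit$predict(task))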
lrn_ridge
## [1] "Lrnr_glmnet_NULL_deviance_10_0_100_TRUE_FALSE"
lrn_lasso
## [1] "Lrnr_glmnet_NULL_deviance_10_1_100_TRUE_FALSE"
Create the super learner
# metalearner: non-negative least squares; convex = TRUE rescales the weights
# to be non-negative and sum to 1, so the ensemble is a convex combination
sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(convex = TRUE))
Run the super learner
set.seed(123)
sl_fit <- sl$train(task = task)
sl_fit$fit_object$cv_meta_fit
## [1] "Lrnr_nnls_TRUE"
## lrnrs weights
## 1: Lrnr_glm_TRUE 0.0000000
## 2: Lrnr_mean 0.0000000
## 3: Lrnr_glmnet_NULL_deviance_10_0_100_TRUE_FALSE 0.7328216
## 4: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE_FALSE 0.1247784
## 5: Lrnr_ranger_500_TRUE_none_1 0.0000000
## 6: Lrnr_xgboost_20_1 0.1424000
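The weights alone do not show how well each candidate did. The fit's cv_risk() method reports each learner's cross-validated risk; a sketch using sl3's loss_squared_error (for a binary outcome, loss_loglik_binomial is another natural choice):
# cross-validated risk of each candidate learner (lower is better)
sl_fit$cv_risk(eval_fun = loss_squared_error)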
Define a super learner over a grid of tuned ranger learners
grid_params <- list(
    num.trees = c(250, 500, 1000, 2000),
    mtry = c(2, 4, 6),
    min.node.size = c(50, 100)
)
grid <- expand.grid(grid_params, KEEP.OUT.ATTRS = FALSE)
lrnr_ranger <- vector("list", length = nrow(grid))
for (i in 1:nrow(grid)) {
    lrnr_ranger[[i]] <- make_learner(Lrnr_ranger,
        num.trees = grid[i, ]$num.trees,
        mtry = grid[i, ]$mtry,
        min.node.size = grid[i, ]$min.node.size)
}
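The grid crosses 4 values of num.trees with 3 values of mtry and 2 of min.node.size, so the loop builds 24 ranger learners; a quick check:
# 4 x 3 x 2 = 24 candidate ranger configurations
nrow(grid)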
# create the learner stack from the new lrnr_ranger list
stack <- Stack$new(lrnr_ranger)
stack
## [1] "Lrnr_ranger_250_TRUE_none_1_2_50" "Lrnr_ranger_500_TRUE_none_1_2_50"
## [3] "Lrnr_ranger_1000_TRUE_none_1_2_50" "Lrnr_ranger_2000_TRUE_none_1_2_50"
## [5] "Lrnr_ranger_250_TRUE_none_1_4_50" "Lrnr_ranger_500_TRUE_none_1_4_50"
## [7] "Lrnr_ranger_1000_TRUE_none_1_4_50" "Lrnr_ranger_2000_TRUE_none_1_4_50"
## [9] "Lrnr_ranger_250_TRUE_none_1_6_50" "Lrnr_ranger_500_TRUE_none_1_6_50"
## [11] "Lrnr_ranger_1000_TRUE_none_1_6_50" "Lrnr_ranger_2000_TRUE_none_1_6_50"
## [13] "Lrnr_ranger_250_TRUE_none_1_2_100" "Lrnr_ranger_500_TRUE_none_1_2_100"
## [15] "Lrnr_ranger_1000_TRUE_none_1_2_100" "Lrnr_ranger_2000_TRUE_none_1_2_100"
## [17] "Lrnr_ranger_250_TRUE_none_1_4_100" "Lrnr_ranger_500_TRUE_none_1_4_100"
## [19] "Lrnr_ranger_1000_TRUE_none_1_4_100" "Lrnr_ranger_2000_TRUE_none_1_4_100"
## [21] "Lrnr_ranger_250_TRUE_none_1_6_100" "Lrnr_ranger_500_TRUE_none_1_6_100"
## [23] "Lrnr_ranger_1000_TRUE_none_1_6_100" "Lrnr_ranger_2000_TRUE_none_1_6_100"
Run the defined super learner
# define the super learner with the new stack and the same NNLS metalearner
sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(convex = TRUE))
# fit the super learner and print the metalearner weights
set.seed(123)
sl_fit <- sl$train(task = task)
sl_fit$fit_object$cv_meta_fit
## [1] "Lrnr_nnls_TRUE"
## lrnrs weights
## 1: Lrnr_ranger_250_TRUE_none_1_2_50 0.0000000
## 2: Lrnr_ranger_500_TRUE_none_1_2_50 0.0000000
## 3: Lrnr_ranger_1000_TRUE_none_1_2_50 0.0000000
## 4: Lrnr_ranger_2000_TRUE_none_1_2_50 0.0000000
## 5: Lrnr_ranger_250_TRUE_none_1_4_50 0.0000000
## 6: Lrnr_ranger_500_TRUE_none_1_4_50 0.0000000
## 7: Lrnr_ranger_1000_TRUE_none_1_4_50 0.0000000
## 8: Lrnr_ranger_2000_TRUE_none_1_4_50 0.0000000
## 9: Lrnr_ranger_250_TRUE_none_1_6_50 0.0000000
## 10: Lrnr_ranger_500_TRUE_none_1_6_50 0.0000000
## 11: Lrnr_ranger_1000_TRUE_none_1_6_50 0.0000000
## 12: Lrnr_ranger_2000_TRUE_none_1_6_50 0.0000000
## 13: Lrnr_ranger_250_TRUE_none_1_2_100 0.6350147
## 14: Lrnr_ranger_500_TRUE_none_1_2_100 0.0000000
## 15: Lrnr_ranger_1000_TRUE_none_1_2_100 0.0000000
## 16: Lrnr_ranger_2000_TRUE_none_1_2_100 0.0000000
## 17: Lrnr_ranger_250_TRUE_none_1_4_100 0.0000000
## 18: Lrnr_ranger_500_TRUE_none_1_4_100 0.3649853
## 19: Lrnr_ranger_1000_TRUE_none_1_4_100 0.0000000
## 20: Lrnr_ranger_2000_TRUE_none_1_4_100 0.0000000
## 21: Lrnr_ranger_250_TRUE_none_1_6_100 0.0000000
## 22: Lrnr_ranger_500_TRUE_none_1_6_100 0.0000000
## 23: Lrnr_ranger_1000_TRUE_none_1_6_100 0.0000000
## 24: Lrnr_ranger_2000_TRUE_none_1_6_100 0.0000000
## lrnrs weights
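With the super learner fitted, predictions come from the fit's predict method (a minimal sketch; calling predict() on the training task returns in-sample predicted probabilities of wt_delta = 1):
# predicted probabilities of wt_delta = 1 on the training data
p_hat <- sl_fit$predict(task)
head(p_hat)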