For a detailed treatment, see the reference by Ashley I. Naimi.
library(AIPW)
library(tmle3)
library(sl3)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Scale the variables
# helper: standardize a variable (center and scale)
scale_ <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
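A quick illustrative check: a standardized vector has mean 0 and standard deviation 1.
scale_(c(1, 2, 3))  # -1, 0, 1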
nhefs <- read.csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\nhefs_an.csv") %>%
  mutate(wt_delta = as.numeric(wt82_71 > median(wt82_71)),
         age = scale_(age),
         sbp = scale_(sbp),
         dbp = scale_(dbp),
         price71 = scale_(price71),
         tax71 = scale_(tax71)) %>%
  select(-wt82_71)
head(nhefs)
## seqn qsmk sex age income sbp dbp price71 tax71
## 1 233 0 0 -0.11221830 19 2.5010972 1.74466329 0.1970824 0.2038726
## 2 235 0 0 -0.61387187 18 -0.2836708 0.22420965 0.9223406 1.4371534
## 3 244 0 1 1.05830670 15 -0.7120966 -0.25093211 -2.5334931 -2.3830370
## 4 245 0 0 2.06161385 15 1.0551600 0.03415294 -2.8136078 -2.5068235
## 5 252 0 0 -0.27943615 18 -0.5514369 -0.06087541 0.9223406 1.4371534
## 6 257 0 1 -0.02860937 11 0.6802874 0.50929471 0.3143397 0.4502996
## race wt_delta
## 1 1 0
## 2 0 1
## 3 1 1
## 4 1 1
## 5 0 1
## 6 1 1
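Because wt_delta dichotomizes wt82_71 at its median, roughly half the sample is coded 1. A quick illustrative check:
mean(nhefs$wt_delta)  # ~0.5 by construction of the median split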
Choose and configure the learners
# baseline learners: marginal mean and main-terms GLM
lrnr_mean <- make_learner(Lrnr_mean)
lrnr_glm <- make_learner(Lrnr_glm)
# ranger learner: a 4 x 3 x 2 tuning grid (24 candidate learners)
grid_params <- list(num.trees = c(250, 500, 1000, 2000),
                    mtry = c(2, 4, 6),
                    min.node.size = c(50, 100))
grid <- expand.grid(grid_params, KEEP.OUT.ATTRS = FALSE)
lrnr_ranger <- vector("list", length = nrow(grid))
for (i in 1:nrow(grid)) {
  lrnr_ranger[[i]] <- make_learner(Lrnr_ranger,
                                   num.trees = grid[i, ]$num.trees,
                                   mtry = grid[i, ]$mtry,
                                   min.node.size = grid[i, ]$min.node.size)
}
# NOTE: the tuned grid above is shown for illustration; the next line
# replaces it with a single default ranger learner (delete it to use the grid)
lrnr_ranger <- make_learner(Lrnr_ranger)
# glmnet learner: tune the elastic-net mixing parameter alpha
grid_params <- seq(0, 1, by = 0.25)
lrnr_glmnet <- vector("list", length = length(grid_params))
for (i in 1:length(grid_params)) {
  lrnr_glmnet[[i]] <- make_learner(Lrnr_glmnet, alpha = grid_params[i])
}
# NOTE: again replaced with a single default glmnet learner
lrnr_glmnet <- make_learner(Lrnr_glmnet)
# xgboost learner: a 4 x 3 x 3 tuning grid (36 candidate learners)
grid_params <- list(max_depth = c(2, 4, 6, 8),
                    eta = c(0.01, 0.1, 0.2),
                    nrounds = c(50, 100, 500))
grid <- expand.grid(grid_params, KEEP.OUT.ATTRS = FALSE)
lrnr_xgboost <- vector("list", length = nrow(grid))
for (i in 1:nrow(grid)) {
  lrnr_xgboost[[i]] <- make_learner(Lrnr_xgboost,
                                    max_depth = grid[i, ]$max_depth,
                                    eta = grid[i, ]$eta,
                                    nrounds = grid[i, ]$nrounds)  # nrounds was missing from the original loop
}
# NOTE: again replaced with a single default xgboost learner
lrnr_xgboost <- make_learner(Lrnr_xgboost)
# earth (MARS) learner: tune the maximum interaction degree
grid_params <- c(2, 3, 4, 5, 6)
lrnr_earth <- vector("list", length = length(grid_params))
for (i in 1:length(grid_params)) {
  lrnr_earth[[i]] <- make_learner(Lrnr_earth, degree = grid_params[i])
}
# NOTE: again replaced with a single default earth learner
lrnr_earth <- make_learner(Lrnr_earth)
Set up a stacked learner library
stacklearner <- Stack$new(lrnr_mean, lrnr_glm, lrnr_ranger,
                          lrnr_glmnet, lrnr_xgboost, lrnr_earth)
metalearner <- Lrnr_nnls$new(convex = TRUE)
sl.lib <- Lrnr_sl$new(learners = stacklearner,
                      metalearner = metalearner)
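Note that if you keep the tuning grids above (so that each lrnr_* object is a list of learners), the lists must be flattened before stacking. A minimal sketch, mirroring the pattern used in the tmle3 section below:
stacklearner <- Stack$new(unlist(list(lrnr_mean, lrnr_glm, lrnr_ranger,
                                      lrnr_glmnet, lrnr_xgboost, lrnr_earth),
                                 recursive = TRUE))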
Set the model inputs
outcome <- nhefs$wt_delta
exposure <- nhefs$qsmk
covariates <- nhefs[, c("age", "sbp", "dbp", "price71", "tax71",
                        "sex", "income", "race")]
# fit AIPW with 10-fold cross-fitting, truncating the propensity score at 0.025
set.seed(123)
AIPW_SL <- AIPW$new(Y = outcome, A = exposure, W = covariates,
                    Q.SL.library = sl.lib, g.SL.library = sl.lib,
                    k_split = 10, verbose = FALSE)$
  fit()$
  summary(g.bound = 0.025)$
  plot.p_score()
# output the ATE and related contrasts
print(AIPW_SL$result, digits = 2)
## Estimate SE 95% LCL 95% UCL N
## Risk of exposure 0.61 0.027 0.557 0.66 340
## Risk of control 0.47 0.015 0.437 0.50 1054
## Risk Difference 0.14 0.031 0.083 0.20 1394
## Risk Ratio 1.31 0.054 1.175 1.45 1394
## Odds Ratio 1.78 0.126 1.392 2.28 1394
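As a sanity check, the Wald limits in the table follow from estimate ± 1.96 × SE; for the risk difference:
0.14 + c(-1, 1) * qnorm(0.975) * 0.031  # ~0.08 and 0.20; small differences from the table are rounding in the displayed estimate and SE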
Stack the learners for tmle3
sl_ <- make_learner(Stack, unlist(list(lrnr_mean, lrnr_glm, lrnr_ranger,
                                       lrnr_glmnet, lrnr_xgboost, lrnr_earth),
                                  recursive = TRUE))
# define SL_Y and SL_A; one specification serves both because the
# outcome and exposure libraries are identical
Q_learner <- Lrnr_sl$new(learners = sl_,
                         metalearner = Lrnr_nnls$new(convex = TRUE))
g_learner <- Lrnr_sl$new(learners = sl_,
                         metalearner = Lrnr_nnls$new(convex = TRUE))
learner_list <- list(Y = Q_learner, A = g_learner)
Set up the tmle3 model
# prepare the inputs for tmle3: the ATE spec and the node list
ate_spec <- tmle_ATE(treatment_level = 1, control_level = 0)
nodes_ <- list(W = c("age", "sbp", "dbp", "price71", "tax71",
                     "sex", "income", "race"),
               A = "qsmk", Y = "wt_delta")
# run tmle3
set.seed(123)
tmle_fit_ <- tmle3(ate_spec, nhefs, nodes_, learner_list)
print(tmle_fit_)
## A tmle3_Fit that took 1 step(s)
## type param init_est tmle_est se lower upper
## 1: ATE ATE[Y_{A=1}-Y_{A=0}] 0.129564 0.1448363 0.03030263 0.0854442 0.2042283
## psi_transformed lower_transformed upper_transformed
## 1: 0.1448363 0.0854442 0.2042283
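The same Wald arithmetic reproduces the tmle3 interval exactly; the fitted object also exposes the same table programmatically (assuming the tmle3_Fit $summary field; check your tmle3 version).
0.1448363 + c(-1, 1) * qnorm(0.975) * 0.03030263  # 0.0854, 0.2042
tmle_fit_$summary  # the estimates as a data.table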
Diagnose the model (propensity scores and stabilized weights)
tmle_task <- ate_spec$make_tmle_task(nhefs, nodes_)
initial_likelihood <- ate_spec$make_initial_likelihood(tmle_task, learner_list)
## save the propensity score for diagnostics
# likelihood of the observed treatment, P(A = a | W)
propensity_score <- initial_likelihood$get_likelihoods(tmle_task)$A
# convert to P(A = 1 | W) for everyone: the treated keep P(A = 1 | W),
# the untreated get 1 - P(A = 0 | W) = P(A = 1 | W)
propensity_score <- propensity_score * nhefs$qsmk +
  (1 - propensity_score) * (1 - nhefs$qsmk)
plap_ <- tibble(exposure = nhefs$qsmk, pscore = propensity_score)
# stabilized inverse-probability weights
plap_ <- plap_ %>%
  mutate(sw = exposure * (mean(exposure) / pscore) +
           (1 - exposure) * ((1 - mean(exposure)) / (1 - pscore)))
# distribution of the stabilized weights
summary(plap_$sw)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4408 0.9049 0.9662 0.9581 1.0268 1.6379
summary(propensity_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09695 0.20111 0.24080 0.24417 0.28600 0.55330
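The propensity scores stay well inside (0, 1) (range 0.10 to 0.55), so positivity looks unproblematic, and stabilized weights should average close to 1, as they do here. Two quick illustrative checks:
mean(plap_$sw)  # ~1 for correctly stabilized weights
tapply(plap_$pscore, plap_$exposure, summary)  # PS distribution by exposure group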
Plot the final propensity score
# propensity score overlap plot by exposure group
ggplot(plap_) +
  geom_density(aes(pscore, color = factor(exposure))) +
  scale_color_manual(values = c("blue", "orange")) +  # was scale_fill_manual, which does not affect the color aesthetic
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))
Output the super learner weights for the propensity score (g) and outcome (Q) models
# super learner coefficients for the propensity score (g) model
g_fit <- tmle_fit_$likelihood$factor_list[["A"]]$learner
g_fit$fit_object$full_fit$learner_fits$Lrnr_nnls_TRUE
## [1] "Lrnr_nnls_TRUE"
## lrnrs weights
## 1: Lrnr_mean 0.1852163
## 2: Lrnr_glm_TRUE 0.5660202
## 3: Lrnr_ranger_500_TRUE_none_1 0.1379925
## 4: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE_FALSE 0.0000000
## 5: Lrnr_xgboost_20_1 0.0000000
## 6: Lrnr_earth_2_3_backward_0_1_0_0 0.1107711
# super learner coefficients for the outcome (Q) model
Q_fit <- tmle_fit_$likelihood$factor_list[["Y"]]$learner
Q_fit$fit_object$full_fit$learner_fits$Lrnr_nnls_TRUE
## [1] "Lrnr_nnls_TRUE"
## lrnrs weights
## 1: Lrnr_mean 0.07770549
## 2: Lrnr_glm_TRUE 0.47785626
## 3: Lrnr_ranger_500_TRUE_none_1 0.00000000
## 4: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE_FALSE 0.00000000
## 5: Lrnr_xgboost_20_1 0.01487531
## 6: Lrnr_earth_2_3_backward_0_1_0_0 0.42956295
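Because the metalearner was fit with convex = TRUE, each weight vector is a convex combination: non-negative and summing to 1, so learners with weight 0 contribute nothing to the final prediction. A quick illustrative check on the g-model weights printed above:
sum(c(0.1852163, 0.5660202, 0.1379925, 0, 0, 0.1107711))  # ~1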