In a randomized control trial (RCT) we assume that.
\[ \mathrm{E}[Y^0|D=1] = \mathrm{E}[Y^0|D=0] \] and \[ \mathrm{E}[Y^1|D=1] = \mathrm{E}[Y^1|D=0] \]
Since \[ \begin{align} \mathrm{E}[\delta] =& \{ \pi \mathrm{E}[Y^1|D=1] + (1-\pi) \mathrm{E}[Y^1|D=0] \} \\ -& \{ \pi \mathrm{E}[Y^0|D=1] + (1-\pi) \mathrm{E}[Y^0|D=0] \}=ATE \end{align} \]
then
\[ \mathrm{E}[Y^1|D=1] - \mathrm{E}[Y^0|D=0]=ATE \]
# load packages
pacman::p_load(tidyverse, here, sandwich, lmtest, boot, ranger,
ggplot2, broom, SuperLearner, tmle, AIPW,
ranger, xgboost, e1071, nnet, glmnet, remotes)
library(tlverse)
library(tmle3)
library(sl3)
library(data.table)
Data preparation
nhefs <- read_csv("C:\\Users\\hed2\\Downloads\\mlci_shortcourse-main\\mlci_shortcourse-main\\data\\nhefs.csv") %>%
mutate(wt_delta = as.numeric(wt82_71>median(wt82_71)))
## Rows: 1394 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): seqn, qsmk, sex, age, income, sbp, dbp, price71, tax71, race, wt82_71
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
formulaVars <- "qsmk + sex + age + income + sbp + dbp + price71 + tax71 + race"
modelForm0 <- as.formula(paste0("wt_delta ~", formulaVars))
modelForm0
## wt_delta ~ qsmk + sex + age + income + sbp + dbp + price71 +
## tax71 + race
#' This model can be used to quantify a conditionally adjusted
#' odds ratio with correct standard error
modelOR <- glm(modelForm0,data=nhefs,family = binomial("logit"))
# summary(modelOR)
exp(tidy(modelOR)[2,2])
## # A tibble: 1 × 1
## estimate
## <dbl>
## 1 1.81
# modeling account for any "potential" exposure-covariates interactions
formulaVars <- "sex + age + income + sbp + dbp + price71 + tax71 + race"
modelForm <- as.formula(paste0("wt_delta ~", formulaVars))
modelForm
## wt_delta ~ sex + age + income + sbp + dbp + price71 + tax71 +
## race
#' Regress the outcome against the confounders among the unexposed and exposed sub datasets
model0 <- glm(modelForm,data=subset(nhefs,qsmk==0),family=binomial("logit"))
model1 <- glm(modelForm,data=subset(nhefs,qsmk==1),family=binomial("logit"))
##' Generate predictions for everyone in the sample using the models
mu1 <- predict(model1,newdata=nhefs,type="response")
mu0 <- predict(model0,newdata=nhefs,type="response")
#' Marginally adjusted odds ratio
marg_stand_OR <- (mean(mu1)/mean(1-mu1))/(mean(mu0)/mean(1-mu0))
#' Marginally adjusted risk ratio
marg_stand_RR <- mean(mu1)/mean(mu0)
#' Marginally adjusted risk difference
marg_stand_RD <- mean(mu1)-mean(mu0)
#' Using the bootstrap to obtain confidence intervals
bootfunc <- function(data,index){
boot_dat <- data[index,]
model0 <- glm(modelForm,data=subset(boot_dat,qsmk==0),family=binomial("logit"))
model1 <- glm(modelForm,data=subset(boot_dat,qsmk==1),family=binomial("logit"))
mu1 <- predict(model1,newdata=boot_dat,type="response")
mu0 <- predict(model0,newdata=boot_dat,type="response")
marg_stand_OR_ <- (mean(mu1)/mean(1-mu1))/(mean(mu0)/mean(1-mu0))
marg_stand_RR_ <- mean(mu1)/mean(mu0)
marg_stand_RD_ <- mean(mu1)-mean(mu0)
res <- c(marg_stand_RD_,marg_stand_RR_,marg_stand_OR_)
return(res)
}
#' Run the boot function. Set a seed to obtain reproducibility
set.seed(123)
boot_res <- boot(nhefs,bootfunc,R=2000)
boot_RD <- boot.ci(boot_res,index=1)
## Warning in boot.ci(boot_res, index = 1): bootstrap variances needed for
## studentized intervals
boot_RR <- boot.ci(boot_res,index=2)
## Warning in boot.ci(boot_res, index = 2): bootstrap variances needed for
## studentized intervals
boot_OR <- boot.ci(boot_res,index=3)
## Warning in boot.ci(boot_res, index = 3): bootstrap variances needed for
## studentized intervals
marg_stand_OR
## [1] 1.757907
# marg_stand_RD
# create the propensity score in the dataset
nhefs$propensity_score <- glm(qsmk ~ sex + age + income + sbp + dbp + price71 + tax71 + race, data = nhefs, family = binomial("logit"))$fitted.values
# stabilized inverse probability weights
nhefs$sw <- (mean(nhefs$qsmk)/nhefs$propensity_score)*nhefs$qsmk +
((1-mean(nhefs$qsmk))/(1-nhefs$propensity_score))*(1-nhefs$qsmk)
# summary(nhefs$sw)
# F----------------------------------------------------------------------------
model_RD_weighted <- glm(wt_delta ~ qsmk, data = nhefs, weights=sw, family = quasibinomial("identity"))
# summary(model_RD_weighted)$coefficients
exp(summary(model_RD_weighted)$coefficients[2])
## [1] 1.147979
# t test----------------------------------------------------------------------------
library(lmtest)
library(sandwich)
# coeftest(model_RD_weighted, vcov. = vcovHC)
# using the sl3 function create the super learner, that contains a simple
# average estimator and a glm estimator.
lrnr_glm <- make_learner(Lrnr_glm)
lrnr_mean <- make_learner(Lrnr_mean)
sl <- Lrnr_sl$new(learners = list(lrnr_glm,lrnr_mean))
learner_list <- list(Y = sl, A = sl)
# PREPARE THE THINGS WE WANT TO FEED IN TO TMLE3
# ate_spec defines the exposure levels we want
# to compare
ate_spec <- tmle_ATE(treatment_level = 1, control_level = 0)
# nodes define the variables we want to adjsut for, and the exposure and the outcome
nodes <- list(W = c("sex", "age", "income", "sbp", "dbp", "price71", "tax71", "race"),
A = "qsmk",
Y = "wt_delta")
# Run the tmle3 function using the pieces defined above
tmle_fit <- tmle3(ate_spec, nhefs, nodes, learner_list)
exp(tmle_fit$summary$psi_transformed)
## [1] 1.145531
# tmle_fit$summary$lower_transformed
#
# tmle_fit$summary$upper_transformed
# using the SuperLearner function with simple average estimator and a glm estimator.
sl <- c("SL.mean","SL.glm")
outcome <- nhefs$wt_delta
exposure <- nhefs$qsmk
covariates <- nhefs[,c("sex", "age", "income", "sbp", "dbp", "price71", "tax71", "race")]
AIPW_SL <- AIPW$new(Y = outcome,
A = exposure,
W = covariates,
Q.SL.library = sl,
g.SL.library = sl,
k_split = 3,
verbose=FALSE)$
fit()$
summary(g.bound = 0.025)$
plot.p_score()$
plot.ip_weights()
exp(AIPW_SL$result[3,1])
## [1] 1.139096
# random Forrest
#' Marginal Standardization with Random Forest
model0 <- ranger(modelForm0, num.trees=500, mtry=3, min.node.size = 50, data=(nhefs))
nhefs$mu0 <- predict(model0, data=nhefs, type="response")$pred
marg_stand_OR_reg <- mean(nhefs$mu0[nhefs$qsmk==1])/mean(1-nhefs$mu0[nhefs$qsmk==1])/( mean(nhefs$mu0[nhefs$qsmk==0])/mean(1-nhefs$mu0[nhefs$qsmk==0]))
marg_stand_OR_reg
## [1] 1.488545
Non parameter- potential outcome
# random Forrest
#' Marginal Standardization with Random Forest
# model0 <- ranger(modelForm, num.trees=500, mtry=3, min.node.size = 50, data=subset(nhefs, qsmk==0))
# model1 <- ranger(modelForm, num.trees=500, mtry=3, min.node.size = 50, data=subset(nhefs, qsmk==1))
#
# mu1 <- predict(model1, data=nhefs, type="response")$pred
# mu0 <- predict(model0, data=nhefs, type="response")$pred
#
# marg_stand_RDrf <- mean(mu1) - mean(mu0)
# bootfunc <- function(data,index){
# boot_dat <- data[index,]
# model0 <- ranger(modelForm, num.trees=500, mtry=5, data=subset(boot_dat, qsmk==0))
# model1 <- ranger(modelForm, num.trees=500, mtry=5, data=subset(boot_dat, qsmk==1))
# mu1 <- predict(model1,data=boot_dat,type="response")$pred
# mu0 <- predict(model0,data=boot_dat,type="response")$pred
#
# marg_stand_RD_ <- mean(mu1)-mean(mu0)
# return(marg_stand_RD_)
# }
#
# #' Run the boot function.
# set.seed(123)
# boot_res <- boot(nhefs,bootfunc,R=200)
# boot_RDrf <- boot.ci(boot_res)
# boot_RDrf$bca
# marg_stand_RDrf