Introduction

For details, see the reference by Ashley I. Naimi.

Install packages from a remote GitHub repository

# https://www.youtube.com/watch?v=c14aqVC-Szo

# usethis::create_github_token()
# generate a personal access token

# usethis::edit_r_environ()
# add the token to .Renviron, for example:
# GITHUB_TOKEN = ghp_6...
# save and restart RStudio

# options(download.file.method = "wininet")
# devtools::install_github("tlverse/tlverse")
library(tlverse)
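After restarting, one quick way to confirm that R can see the token before calling install_github() is to check the environment variable defined above; this check is only a convenience and not part of the tlverse instructions.

# returns 0 if GITHUB_TOKEN was not picked up from .Renviron
nchar(Sys.getenv("GITHUB_TOKEN"))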

Datasets

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
nhefs <- read.csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\nhefs_an.csv") %>%
  select(seqn, qsmk, sex, age, income,
         sbp, dbp, price71, tax71, race, wt82_71) %>%
  na.omit()
names(nhefs)
##  [1] "seqn"    "qsmk"    "sex"     "age"     "income"  "sbp"     "dbp"    
##  [8] "price71" "tax71"   "race"    "wt82_71"
nhefs <- nhefs %>%
  mutate(wt_delta = as.numeric(wt82_71 > median(wt82_71)))
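Because wt_delta flags observations above the median weight change, roughly half of the sample should be coded 1 by construction; a quick sanity check (output not shown):

# wt_delta should be close to a 50/50 split
mean(nhefs$wt_delta)
table(nhefs$wt_delta)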

Parametric regression

Marginally adjusted parametric regression, like g-computation, works on the outcome side: fit an outcome regression, then standardize (average) its predictions over the observed confounder distribution.

# modelForm holds the regression formula
formulaVars <- "qsmk + sex + age + income + sbp + dbp + price71 + tax71 + race"
modelForm <- as.formula(paste0("wt_delta ~",
                               formulaVars))
modelForm
## wt_delta ~ qsmk + sex + age + income + sbp + dbp + price71 + 
##     tax71 + race
#' Regress the outcome against the exposure and confounders (no interaction terms)
ms_model <- glm(modelForm, data = nhefs, family = binomial("logit"))

##' Generate predictions for everyone in the sample to obtain
##' unexposed (mu0) and exposed (mu1) predicted risks.
mu1 <- predict(ms_model, newdata = transform(nhefs, qsmk = 1),
               type = "response")

mu0 <- predict(ms_model, newdata = transform(nhefs, qsmk = 0),
               type = "response")

#' Marginally adjusted odds ratio
marg_stand_OR <- (mean(mu1)/mean(1 - mu1))/(mean(mu0)/mean(1 - mu0))
#' Marginally adjusted risk ratio
marg_stand_RR <- mean(mu1)/mean(mu0)
#' Marginally adjusted risk difference
marg_stand_RD <- mean(mu1) - mean(mu0)
#' The same contrasts expressed in the opposite direction (unexposed vs. exposed)
1/marg_stand_OR
## [1] 0.5726679
1/marg_stand_RR
## [1] 0.7720054
-1*marg_stand_RD
## [1] -0.1377615
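Model-based standard errors from ms_model do not apply directly to these marginal contrasts; a nonparametric bootstrap is one common way to get standard errors and confidence intervals. A minimal sketch, where the number of resamples and the seed are arbitrary choices (output not shown):

set.seed(456)
boot_rd <- replicate(500, {
  # resample rows with replacement, refit the outcome model, re-standardize
  boot_dat <- nhefs[sample(nrow(nhefs), replace = TRUE), ]
  fit <- glm(modelForm, data = boot_dat, family = binomial("logit"))
  m1 <- predict(fit, newdata = transform(boot_dat, qsmk = 1), type = "response")
  m0 <- predict(fit, newdata = transform(boot_dat, qsmk = 0), type = "response")
  mean(m1) - mean(m0)
})
sd(boot_rd)                         # bootstrap standard error of the risk difference
quantile(boot_rd, c(0.025, 0.975))  # percentile 95% confidence interval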

Random forest (nonparametric): the outcome is modeled separately within each level of the exposure, and the resulting predictions are standardized over the confounders.

library(ranger)
#' Marginal Standardization with Random Forest
model0 <- ranger(modelForm, num.trees = 500, mtry = 3, min.node.size = 50,
                 data = subset(nhefs, qsmk == 0))
model1 <- ranger(modelForm, num.trees = 500, mtry = 3, min.node.size = 50,
                 data = subset(nhefs, qsmk == 1))

mu1 <- predict(model1, data = nhefs, type = "response")$predictions
mu0 <- predict(model0, data = nhefs, type = "response")$predictions
marg_stand_RDrf <- mean(mu1) - mean(mu0)

marg_stand_RDrf
## [1] 0.1334112
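To get a sense of which covariates each forest leans on, ranger can also return variable importance; the refit below is purely illustrative and reuses the same (arbitrary) tuning values (output not shown):

# refit the exposed-group forest with impurity-based variable importance
model1_vi <- ranger(modelForm, num.trees = 500, mtry = 3, min.node.size = 50,
                    importance = "impurity", data = subset(nhefs, qsmk == 1))
sort(importance(model1_vi), decreasing = TRUE)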

Inverse probability weighting works on the exposure side: model the exposure with a propensity score, then weight the outcome comparison by the inverse of that score.

# create the propensity score in the dataset
nhefs$propensity_score <- glm(qsmk ~ sex + age + income + sbp + dbp +
                                price71 + tax71 + race,
                              data = nhefs, family = binomial("logit"))$fitted.values

# stabilized inverse probability weights
nhefs$sw <- (mean(nhefs$qsmk)/nhefs$propensity_score) * nhefs$qsmk +
  ((1 - mean(nhefs$qsmk))/(1 - nhefs$propensity_score)) * (1 - nhefs$qsmk)
summary(nhefs$sw)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5090  0.9110  0.9814  1.0001  1.0638  3.0970
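Beyond the overall summary, it can be useful to confirm that the stabilized weights average close to 1 within each exposure group and that no single weight is extreme; a quick check (output not shown):

# stabilized weights should have mean near 1 in each exposure group
tapply(nhefs$sw, nhefs$qsmk, summary)
# count any unusually large weights
sum(nhefs$sw > 10)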
model_RD_weighted <- glm(wt_delta ~ qsmk, data = nhefs, weights = sw,
                         family = quasibinomial("identity"))
summary(model_RD_weighted)$coefficients
##              Estimate Std. Error   t value      Pr(>|t|)
## (Intercept) 0.4668671 0.01537912 30.357194 9.184292e-156
## qsmk        0.1380031 0.03066295  4.500646  7.339547e-06

Estimate the standard errors with a robust (sandwich) variance

# model-based standard errors ignore the estimated weights, so use a sandwich estimator
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
coeftest(model_RD_weighted, vcov. = vcovHC)
## 
## z test of coefficients:
## 
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept) 0.466867   0.015446 30.2249 < 2.2e-16 ***
## qsmk        0.138003   0.032042  4.3069 1.655e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
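The matching robust confidence intervals can be obtained with coefci() from the same lmtest package (output not shown):

# robust 95% confidence intervals using the same sandwich variance
coefci(model_RD_weighted, vcov. = vcovHC)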

The curse of dimensionality and machine learning (nonparametric estimation)

The design matrix via model.matrix

nhefs_test <- nhefs %>%
  select(wt82_71, qsmk, age, income) %>%
  mutate(age_cat = cut(age, 4, labels = F))

test <- lm(wt82_71 ~ qsmk + income + as.factor(age_cat), data = nhefs_test)
summary(test)
## 
## Call:
## lm(formula = wt82_71 ~ qsmk + income + as.factor(age_cat), data = nhefs_test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.006  -3.860  -0.115   4.041  45.838 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.89197    1.47149   1.965   0.0496 *  
## qsmk                 3.05565    0.47623   6.416 1.91e-10 ***
## income               0.02525    0.07866   0.321   0.7483    
## as.factor(age_cat)2 -0.69619    0.49358  -1.410   0.1586    
## as.factor(age_cat)3 -2.72074    0.53807  -5.057 4.84e-07 ***
## as.factor(age_cat)4 -7.12473    0.80956  -8.801  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.575 on 1388 degrees of freedom
## Multiple R-squared:  0.0842, Adjusted R-squared:  0.0809 
## F-statistic: 25.52 on 5 and 1388 DF,  p-value: < 2.2e-16
# the design matrix times the coefficient vector reproduces predict()
aa <- model.matrix(test) %*% test$coefficients
bb <- predict(test)

head(aa - bb)
##   [,1]
## 1    0
## 2    0
## 3    0
## 4    0
## 5    0
## 6    0
set.seed(123)
n <- 250
c1 <- round(rnorm(n), 2)
c2 <- round(rnorm(n), 2)
c3 <- round(rnorm(n), 2)
# coercing a continuous variable to a factor creates one level per unique value
c4 <- factor(round(rnorm(n), 2))
x <- factor(rbinom(n, 1, 0.5))
dat_ <- data.frame(x, c1, c2, c3, c4)

# each factor level (minus one) becomes its own dummy column in the design matrix
mod_mat <- model.matrix(~., data = dat_)
dim(mod_mat)
## [1] 250 181
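For comparison, leaving c4 as a numeric variable instead of a factor keeps the design matrix to a handful of columns (a sketch; output not shown):

# same data, but with c4 treated as continuous
dat_num <- transform(dat_, c4 = as.numeric(as.character(c4)))
dim(model.matrix(~., data = dat_num))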

Sample sizes required by (slower-converging) machine learning methods are exponentially larger than those required by (fast-converging) parametric methods to maintain the same degree of accuracy. This is the cost of making weaker assumptions: a misspecified parametric model still converges very quickly, but to the wrong answer.

Doubly robust methods can mitigate or resolve the problems that the curse of dimensionality creates for machine learning estimators. However, misspecification arising from an incomplete confounder adjustment set, or from incorrectly adjusting for a mediator, cannot be fixed by doubly robust machine learning methods.
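One way to make the doubly robust idea concrete is the augmented IPW (AIPW) estimator, which combines outcome-model predictions with the propensity score already computed above and remains consistent if either model is correctly specified. This is a generic textbook sketch (the mu1_dr/mu0_dr names are only illustrative), not the tlverse/TMLE implementation:

# outcome-model predictions under each exposure, re-using the parametric model
mu1_dr <- predict(ms_model, newdata = transform(nhefs, qsmk = 1), type = "response")
mu0_dr <- predict(ms_model, newdata = transform(nhefs, qsmk = 0), type = "response")

ps <- nhefs$propensity_score
a  <- nhefs$qsmk
y  <- nhefs$wt_delta

# AIPW risk difference: consistent if either the outcome model or the
# propensity-score model is correct
aipw1 <- mean(a * (y - mu1_dr) / ps + mu1_dr)
aipw0 <- mean((1 - a) * (y - mu0_dr) / (1 - ps) + mu0_dr)
aipw1 - aipw0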