Doubly robust estimators

Reference

Step 1 Transformation of continuous outcome variable. Step 2 Predict from initial outcome modelling: G-computation, y1-y0=average total effect (using regression to calculate the predictions). Step 3 Predict from propensity score model, x=covariates (using logistic regression to calculate ps for exposure). Step 4 Estimate clever covariate H (calculate the weight for exposure). Step 5 Estimate fluctuation parameter ϵ (calculate the weight for outcome). Step 6 Update the initial outcome model prediction based on targeted adjustment of the initial predictions using the PS model (adjust the outcome using the above weight). Step 7 Find treatment effect estimate , average total effect. Step 8 Transform back the treatment effect estimate in the original outcome scale. Step 9 Confidence interval estimation based on closed form formula.

# load the dataset
ObsData <- read.csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\obsdata.csv", header = TRUE)
# add column for outcome Y: length of stay 
# Y = date of discharge - study admission date
# Y = date of death - study admission date if date of discharge not available
ObsData$Y <- ObsData$dschdte - ObsData$sadmdte
ObsData$Y[is.na(ObsData$Y)] <- ObsData$dthdte[is.na(ObsData$Y)] - 
  ObsData$sadmdte[is.na(ObsData$Y)]
# remove outcomes we are not examining in this example
ObsData <- dplyr::select(ObsData, 
                         !c(dthdte, lstctdte, dschdte, death, t3d30, dth30, surv2md1))
# remove unnecessary and problematic variables 
ObsData <- dplyr::select(ObsData, 
                         !c(sadmdte, ptid, X, adld3p, urin1, cat2))

# convert all categorical variables to factors 
factors <- c("cat1", "ca", "cardiohx", "chfhx", "dementhx", "psychhx", 
             "chrpulhx", "renalhx", "liverhx", "gibledhx", "malighx", 
             "immunhx", "transhx", "amihx", "sex", "dnr1", "ninsclas", 
             "resp", "card", "neuro", "gastr", "renal", "meta", "hema", 
             "seps", "trauma", "ortho", "race", "income")
ObsData[factors] <- lapply(ObsData[factors], as.factor)
# convert our treatment A (RHC vs. No RHC) to a binary variable
ObsData$A <- ifelse(ObsData$swang1 == "RHC", 1, 0)
ObsData <- dplyr::select(ObsData, !swang1)
# Categorize the variables to match with the original paper
ObsData$age <- cut(ObsData$age,breaks=c(-Inf, 50, 60, 70, 80, Inf),right=FALSE)
ObsData$race <- factor(ObsData$race, levels=c("white","black","other"))
ObsData$sex <- as.factor(ObsData$sex)
ObsData$sex <- relevel(ObsData$sex, ref = "Male")
ObsData$cat1 <- as.factor(ObsData$cat1)
levels(ObsData$cat1) <- c("ARF","CHF","Other","Other","Other",
                          "Other","Other","MOSF","MOSF")
ObsData$ca <- as.factor(ObsData$ca)
levels(ObsData$ca) <- c("Metastatic","None","Localized (Yes)")
ObsData$ca <- factor(ObsData$ca, levels=c("None",
                                          "Localized (Yes)","Metastatic"))
# Rename variables
names(ObsData) <- c("Disease.category", "Cancer", "Cardiovascular", 
                    "Congestive.HF", "Dementia", "Psychiatric", "Pulmonary", 
                    "Renal", "Hepatic", "GI.Bleed", "Tumor", 
                    "Immunosupperssion", "Transfer.hx", "MI", "age", "sex", 
                    "edu", "DASIndex", "APACHE.score", "Glasgow.Coma.Score", 
                    "blood.pressure", "WBC", "Heart.rate", "Respiratory.rate", 
                    "Temperature", "PaO2vs.FIO2", "Albumin", "Hematocrit", 
                    "Bilirubin", "Creatinine", "Sodium", "Potassium", "PaCo2", 
                    "PH", "Weight", "DNR.status", "Medical.insurance", 
                    "Respiratory.Diag", "Cardiovascular.Diag", 
                    "Neurological.Diag", "Gastrointestinal.Diag", "Renal.Diag",
                    "Metabolic.Diag", "Hematologic.Diag", "Sepsis.Diag", 
                    "Trauma.Diag", "Orthopedic.Diag", "race", "income", 
                    "Y", "A")

step 1 Transform continuous outcome to be within the range [0,1]

min.Y <- min(ObsData$Y)
max.Y <- max(ObsData$Y)
ObsData$Y.bounded <- (ObsData$Y-min.Y)/(max.Y-min.Y)

Step 2: Initial G-comp estimate

We construct our outcome model, and make our initial predictions using Superlearner.

library(SuperLearner)
## Loading required package: nnls
## Loading required package: gam
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-2
## Super Learner
## Version: 2.0-28
## Package created on 2021-05-04
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
set.seed(123)
ObsData.noY <- dplyr::select(ObsData, !c(Y,Y.bounded))
Y.fit.sl <- SuperLearner(Y=ObsData$Y.bounded, 
                       X=ObsData.noY, 
                       cvControl = list(V = 3),
                       
                       SL.library=c("SL.glm", 
                                    "SL.glmnet", 
                                    "SL.xgboost"),
                       method="method.CC_nloglik", 
                       family="gaussian")
## Loading required package: nloptr
## Loading required namespace: xgboost
ObsData$init.Pred <- predict(Y.fit.sl, newdata = ObsData.noY, 
                           type = "response")$pred

summary(ObsData$init.Pred)
##        V1         
##  Min.   :0.00100  
##  1st Qu.:0.03723  
##  Median :0.04948  
##  Mean   :0.04877  
##  3rd Qu.:0.06067  
##  Max.   :0.13659

Get predictions under both treatments:

A=1

ObsData.noY$A <- 1
ObsData$Pred.Y1 <- predict(Y.fit.sl, newdata = ObsData.noY, 
                           type = "response")$pred
summary(ObsData$Pred.Y1)
##        V1         
##  Min.   :0.00100  
##  1st Qu.:0.04240  
##  Median :0.05429  
##  Mean   :0.05322  
##  3rd Qu.:0.06446  
##  Max.   :0.13659

A=0

ObsData.noY$A <- 0
ObsData$Pred.Y0 <- predict(Y.fit.sl, newdata = ObsData.noY, 
                           type = "response")$pred
summary(ObsData$Pred.Y0)
##        V1         
##  Min.   :0.00100  
##  1st Qu.:0.03524  
##  Median :0.04708  
##  Mean   :0.04609  
##  3rd Qu.:0.05734  
##  Max.   :0.12652

Get initial treatment effect estimate

ObsData$Pred.TE <- ObsData$Pred.Y1 - ObsData$Pred.Y0   

summary(ObsData$Pred.TE) 
##        V1           
##  Min.   :-0.010333  
##  1st Qu.: 0.006682  
##  Median : 0.007071  
##  Mean   : 0.007134  
##  3rd Qu.: 0.007541  
##  Max.   : 0.021592

Step 3: PS model

calculate ps

At this point, we have our initial estimate and now want to perform our targeted improvement.

library(SuperLearner)
set.seed(124)
ObsData.noYA <- dplyr::select(ObsData, !c(Y,Y.bounded,
                                          A,init.Pred,
                                          Pred.Y1,Pred.Y0,
                                          Pred.TE))   #delete Pred.TE
PS.fit.SL <- SuperLearner(Y=ObsData$A, 
                          
                       X=ObsData.noYA, 
                       cvControl = list(V = 3),
                       
                       SL.library=c("SL.glm", 
                                    "SL.glmnet", 
                                    "SL.xgboost"),
                       method="method.CC_nloglik",
                       family="binomial")  

all.pred <- predict(PS.fit.SL, type = "response")
ObsData$PS.SL <- all.pred$pred 
summary(ObsData$PS.SL)
##        V1          
##  Min.   :0.002806  
##  1st Qu.:0.133409  
##  Median :0.329181  
##  Mean   :0.375143  
##  3rd Qu.:0.596584  
##  Max.   :0.983057
tapply(ObsData$PS.SL, ObsData$A, summary)
## $`0`
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.002806 0.079637 0.177020 0.219962 0.327519 0.866585 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03745 0.49045 0.65384 0.62745 0.78263 0.98306

Plot ps and treatment

plot(density(ObsData$PS.SL[ObsData$A==0]), 
     col = "red", main = "")
lines(density(ObsData$PS.SL[ObsData$A==1]), 
      col = "blue", lty = 2)
legend("topright", c("No RHC","RHC"), 
       col = c("red", "blue"), lty=1:2) 

independently estimate

Step 4: Estimate H

We have estimated g(a=1|l) by ps model then can estimate g(a=0|l) by 1 - PS.SL.

# inverse probability
ObsData$H.A1L <- (ObsData$A) / ObsData$PS.SL 
ObsData$H.A0L <- (1-ObsData$A) / (1- ObsData$PS.SL)

# estimate H
ObsData$H.AL <- ObsData$H.A1L - ObsData$H.A0L

summary(ObsData$H.AL)
##        V1         
##  Min.   :-7.4954  
##  1st Qu.:-1.2922  
##  Median :-1.0659  
##  Mean   :-0.1378  
##  3rd Qu.: 1.3662  
##  Max.   :26.7017
tapply(ObsData$H.AL, ObsData$A, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -7.495  -1.487  -1.215  -1.377  -1.087  -1.003 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.017   1.278   1.529   1.878   2.039  26.702

\[ H(A_i, L_i) = \frac{I(A_i=1)}{g(A_i=1|L_i)} - \frac{I(A_i=0)}{g(A_i=0|L_i)} \]

Step 5: Estimate ϵ

H clever covariate is to calculate the e epsilon (H coefficient). It is estimated through MLE, using a model with an offset based on the initial estimate (initial g compute predictions), and H clever covariates as independent variables.

Fluctuation parameter ϵ , representing how large of an adjustment we will make to the initial estimate.

eps_mod <- glm(Y.bounded ~ -1 + H.A1L + H.A0L +  #Using two H covariates 
                 
                 offset(qlogis(init.Pred)),  #initial g compute 
               family = "binomial",
               data = ObsData)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# -1 + H.A1L + H.A0L , without intercept 
# qlogis(0.2) ==-1.39, inverse logistic function 
# plogis(-1.39) ==0.2  
# Y.bounded is standardized Y

# length(offset(qlogis(ObsData$init.Pred)))= 5735, residuals

epsilon <- coef(eps_mod)  
epsilon 
##      H.A1L      H.A0L 
## 0.01568808 0.02070037
epsilon["H.A1L"]
##      H.A1L 
## 0.01568808

Note that, if init.Pred includes negative values, NaNs would be produced after applying qlogis(). y.bounded is from 0 to 1.

Step 6: Update

We can use epsilon[“H.A1L”] and epsilon[“H.A0L”] to update

ObsData$Pred.Y1.update <- plogis(qlogis(ObsData$Pred.Y1) +  
                                   epsilon["H.A1L"]*ObsData$H.A1L) #adjusted by ps weighting 

ObsData$Pred.Y0.update <- plogis(qlogis(ObsData$Pred.Y0) + 
                                   epsilon["H.A0L"]*ObsData$H.A0L)

summary(ObsData$Pred.Y1.update)
##        V1          
##  Min.   :0.001031  
##  1st Qu.:0.042745  
##  Median :0.054882  
##  Mean   :0.053810  
##  3rd Qu.:0.065188  
##  Max.   :0.139189
summary(ObsData$Pred.Y0.update)  
##        V1         
##  Min.   :0.00100  
##  1st Qu.:0.03603  
##  Median :0.04779  
##  Mean   :0.04686  
##  3rd Qu.:0.05809  
##  Max.   :0.12652

Step 7: Effect estimate

Now that the updated predictions of our outcome models are calculated, we can calculate the ATE.

ATE.TMLE.bounded.vector <- ObsData$Pred.Y1.update -  
                           ObsData$Pred.Y0.update
ATE.TMLE.bounded <- mean(ATE.TMLE.bounded.vector, 
                         na.rm = TRUE) 
ATE.TMLE.bounded 
## [1] 0.006953925
# close to the g formula result
# Mean   : 0.00713

Step 8: Rescale effect estimate

We make sure to transform back to our original scale.

ATE.TMLE <- (max.Y-min.Y)*ATE.TMLE.bounded   
ATE.TMLE 
## [1] 2.725938

Step 9: Confidence interval estimation

Since the machine learning algorithms were used only in intermediary steps, rather than estimating our parameter of interest directly, 95% confidence intervals can be calculated directly.

ci.estimate <- function(data = ObsData, H.AL.components = 1){
  min.Y <- min(data$Y)
  max.Y <- max(data$Y)
  # transform predicted outcomes back to original scale
  if (H.AL.components == 2){
    data$Pred.Y1.update.rescaled <- 
      (max.Y- min.Y)*data$Pred.Y1.update + min.Y
    data$Pred.Y0.update.rescaled <- 
      (max.Y- min.Y)*data$Pred.Y0.update + min.Y
  } 
  if (H.AL.components == 1) {
    data$Pred.Y1.update.rescaled <- 
      (max.Y- min.Y)*data$Pred.Y1.update1 + min.Y
    data$Pred.Y0.update.rescaled <- 
      (max.Y- min.Y)*data$Pred.Y0.update1 + min.Y
  }
  EY1_TMLE1 <- mean(data$Pred.Y1.update.rescaled, 
                    na.rm = TRUE)
  EY0_TMLE1 <- mean(data$Pred.Y0.update.rescaled, 
                    na.rm = TRUE)
  # ATE efficient influence curve
  D1 <- data$A/data$PS.SL*
    (data$Y - data$Pred.Y1.update.rescaled) + 
    data$Pred.Y1.update.rescaled - EY1_TMLE1
  D0 <- (1 - data$A)/(1 - data$PS.SL)*
    (data$Y - data$Pred.Y0.update.rescaled) + 
    data$Pred.Y0.update.rescaled - EY0_TMLE1
  EIC <- D1 - D0
  # ATE variance
  n <- nrow(data)
  varHat.IC <- var(EIC, na.rm = TRUE)/n
  # ATE 95% CI
  if (H.AL.components == 2) {
    ATE.TMLE.CI <- c(ATE.TMLE - 1.96*sqrt(varHat.IC), 
                   ATE.TMLE + 1.96*sqrt(varHat.IC))
  }
  if (H.AL.components == 1) {
    ATE.TMLE.CI <- c(ATE.TMLE1 - 1.96*sqrt(varHat.IC), 
                   ATE.TMLE1 + 1.96*sqrt(varHat.IC))
  }
  return(ATE.TMLE.CI) 
}

effect confidence interval

CI2 <- ci.estimate(data = ObsData, H.AL.components = 2) 
CI2
## [1] 1.585187 3.866689