Step 1: Transform the continuous outcome variable to the [0, 1] scale.
Step 2: Predict from the initial outcome model (G-computation): the mean of Y1 - Y0 is the average treatment effect, with predictions obtained from the regression.
Step 3: Predict from the propensity score model, with the covariates as predictors (logistic regression of the exposure gives the propensity scores).
Step 4: Estimate the clever covariate H (the weight for the exposure).
Step 5: Estimate the fluctuation parameter ϵ (the weight for the outcome).
Step 6: Update the initial outcome predictions by a targeted adjustment based on the propensity score model (adjust the outcome using the above weight).
Step 7: Obtain the treatment effect estimate (average treatment effect).
Step 8: Transform the treatment effect estimate back to the original outcome scale.
Step 9: Estimate the confidence interval using a closed-form formula.
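In symbols, the bounding transformation of Step 1 and the target parameter estimated in Step 7 are:
\[ Y^{*} = \frac{Y - \min(Y)}{\max(Y) - \min(Y)}, \qquad \psi = E\bigl[E(Y \mid A=1, L) - E(Y \mid A=0, L)\bigr] \]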
# load the dataset
ObsData <- read.csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\obsdata.csv", header = TRUE)
# add column for outcome Y: length of stay
# Y = date of discharge - study admission date
# Y = date of death - study admission date if date of discharge not available
ObsData$Y <- ObsData$dschdte - ObsData$sadmdte
ObsData$Y[is.na(ObsData$Y)] <- ObsData$dthdte[is.na(ObsData$Y)] -
ObsData$sadmdte[is.na(ObsData$Y)]
# remove outcomes we are not examining in this example
ObsData <- dplyr::select(ObsData,
!c(dthdte, lstctdte, dschdte, death, t3d30, dth30, surv2md1))
# remove unnecessary and problematic variables
ObsData <- dplyr::select(ObsData,
!c(sadmdte, ptid, X, adld3p, urin1, cat2))
# convert all categorical variables to factors
factors <- c("cat1", "ca", "cardiohx", "chfhx", "dementhx", "psychhx",
"chrpulhx", "renalhx", "liverhx", "gibledhx", "malighx",
"immunhx", "transhx", "amihx", "sex", "dnr1", "ninsclas",
"resp", "card", "neuro", "gastr", "renal", "meta", "hema",
"seps", "trauma", "ortho", "race", "income")
ObsData[factors] <- lapply(ObsData[factors], as.factor)
# convert our treatment A (RHC vs. No RHC) to a binary variable
ObsData$A <- ifelse(ObsData$swang1 == "RHC", 1, 0)
ObsData <- dplyr::select(ObsData, !swang1)
# Categorize the variables to match with the original paper
ObsData$age <- cut(ObsData$age, breaks = c(-Inf, 50, 60, 70, 80, Inf), right = FALSE)
ObsData$race <- factor(ObsData$race, levels=c("white","black","other"))
ObsData$sex <- as.factor(ObsData$sex)
ObsData$sex <- relevel(ObsData$sex, ref = "Male")
ObsData$cat1 <- as.factor(ObsData$cat1)
levels(ObsData$cat1) <- c("ARF","CHF","Other","Other","Other",
"Other","Other","MOSF","MOSF")
ObsData$ca <- as.factor(ObsData$ca)
levels(ObsData$ca) <- c("Metastatic","None","Localized (Yes)")
ObsData$ca <- factor(ObsData$ca, levels=c("None",
"Localized (Yes)","Metastatic"))
# Rename variables
names(ObsData) <- c("Disease.category", "Cancer", "Cardiovascular",
"Congestive.HF", "Dementia", "Psychiatric", "Pulmonary",
"Renal", "Hepatic", "GI.Bleed", "Tumor",
"Immunosupperssion", "Transfer.hx", "MI", "age", "sex",
"edu", "DASIndex", "APACHE.score", "Glasgow.Coma.Score",
"blood.pressure", "WBC", "Heart.rate", "Respiratory.rate",
"Temperature", "PaO2vs.FIO2", "Albumin", "Hematocrit",
"Bilirubin", "Creatinine", "Sodium", "Potassium", "PaCo2",
"PH", "Weight", "DNR.status", "Medical.insurance",
"Respiratory.Diag", "Cardiovascular.Diag",
"Neurological.Diag", "Gastrointestinal.Diag", "Renal.Diag",
"Metabolic.Diag", "Hematologic.Diag", "Sepsis.Diag",
"Trauma.Diag", "Orthopedic.Diag", "race", "income",
"Y", "A")
# Step 1: bound the continuous outcome to the [0, 1] scale
min.Y <- min(ObsData$Y)
max.Y <- max(ObsData$Y)
ObsData$Y.bounded <- (ObsData$Y - min.Y)/(max.Y - min.Y)
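As a quick sanity check (not part of the original workflow), the bounding transformation is invertible, which is what the back-transformation in Step 8 relies on:
# back-transforming Y.bounded should exactly recover Y
stopifnot(all(abs(ObsData$Y.bounded * (max.Y - min.Y) + min.Y - ObsData$Y) < 1e-8))
range(ObsData$Y.bounded) # 0 and 1 by construction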
We construct our outcome model and make our initial predictions using SuperLearner (Step 2).
library(SuperLearner)
## Loading required package: nnls
## Loading required package: gam
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-2
## Super Learner
## Version: 2.0-28
## Package created on 2021-05-04
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
set.seed(123)
ObsData.noY <- dplyr::select(ObsData, !c(Y,Y.bounded))
Y.fit.sl <- SuperLearner(Y=ObsData$Y.bounded,
X=ObsData.noY,
cvControl = list(V = 3),
SL.library=c("SL.glm",
"SL.glmnet",
"SL.xgboost"),
method="method.CC_nloglik",
family="gaussian")
## Loading required package: nloptr
## Loading required namespace: xgboost
ObsData$init.Pred <- predict(Y.fit.sl, newdata = ObsData.noY,
type = "response")$pred
summary(ObsData$init.Pred)
## V1
## Min. :0.00100
## 1st Qu.:0.03723
## Median :0.04948
## Mean :0.04877
## 3rd Qu.:0.06067
## Max. :0.13659
Get predictions under both treatments:
Under A = 1:
ObsData.noY$A <- 1
ObsData$Pred.Y1 <- predict(Y.fit.sl, newdata = ObsData.noY,
type = "response")$pred
summary(ObsData$Pred.Y1)
## V1
## Min. :0.00100
## 1st Qu.:0.04240
## Median :0.05429
## Mean :0.05322
## 3rd Qu.:0.06446
## Max. :0.13659
Under A = 0:
ObsData.noY$A <- 0
ObsData$Pred.Y0 <- predict(Y.fit.sl, newdata = ObsData.noY,
type = "response")$pred
summary(ObsData$Pred.Y0)
## V1
## Min. :0.00100
## 1st Qu.:0.03524
## Median :0.04708
## Mean :0.04609
## 3rd Qu.:0.05734
## Max. :0.12652
Get the initial (G-computation) treatment effect estimate:
ObsData$Pred.TE <- ObsData$Pred.Y1 - ObsData$Pred.Y0
summary(ObsData$Pred.TE)
## V1
## Min. :-0.010333
## 1st Qu.: 0.006682
## Median : 0.007071
## Mean : 0.007134
## 3rd Qu.: 0.007541
## Max. : 0.021592
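For reference (this line is not part of the original code), the initial G-computation estimate can be expressed on the original outcome scale in the same way Step 8 does later:
# initial G-computation ATE on the original length-of-stay scale
(max.Y - min.Y) * mean(ObsData$Pred.TE)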
At this point, we have our initial estimate and now want to perform the targeted improvement. We begin by calculating the propensity scores (Step 3).
library(SuperLearner)
set.seed(124)
ObsData.noYA <- dplyr::select(ObsData, !c(Y,Y.bounded,
A,init.Pred,
Pred.Y1,Pred.Y0,
Pred.TE)) # also drop the prediction columns created above
PS.fit.SL <- SuperLearner(Y=ObsData$A,
X=ObsData.noYA,
cvControl = list(V = 3),
SL.library=c("SL.glm",
"SL.glmnet",
"SL.xgboost"),
method="method.CC_nloglik",
family="binomial")
all.pred <- predict(PS.fit.SL, type = "response")
ObsData$PS.SL <- all.pred$pred
summary(ObsData$PS.SL)
## V1
## Min. :0.002806
## 1st Qu.:0.133409
## Median :0.329181
## Mean :0.375143
## 3rd Qu.:0.596584
## Max. :0.983057
tapply(ObsData$PS.SL, ObsData$A, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002806 0.079637 0.177020 0.219962 0.327519 0.866585
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03745 0.49045 0.65384 0.62745 0.78263 0.98306
Plot the propensity scores by treatment group:
plot(density(ObsData$PS.SL[ObsData$A==0]),
col = "red", main = "")
lines(density(ObsData$PS.SL[ObsData$A==1]),
col = "blue", lty = 2)
legend("topright", c("No RHC","RHC"),
col = c("red", "blue"), lty=1:2)
We have estimated g(A=1|L) with the propensity score model; g(A=0|L) is then estimated as 1 - PS.SL.
# Step 4: inverse probability components of the clever covariate
ObsData$H.A1L <- (ObsData$A) / ObsData$PS.SL
ObsData$H.A0L <- (1 - ObsData$A) / (1 - ObsData$PS.SL)
# estimate H as the difference of the two components
ObsData$H.AL <- ObsData$H.A1L - ObsData$H.A0L
summary(ObsData$H.AL)
## V1
## Min. :-7.4954
## 1st Qu.:-1.2922
## Median :-1.0659
## Mean :-0.1378
## 3rd Qu.: 1.3662
## Max. :26.7017
tapply(ObsData$H.AL, ObsData$A, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.495 -1.487 -1.215 -1.377 -1.087 -1.003
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.017 1.278 1.529 1.878 2.039 26.702
\[ H(A_i, L_i) = \frac{I(A_i=1)}{g(A_i=1|L_i)} - \frac{I(A_i=0)}{g(A_i=0|L_i)} \]
The clever covariate H is used to estimate the fluctuation parameter ϵ (the coefficient on H). ϵ is estimated by maximum likelihood, using a logistic model whose offset is the logit of the initial outcome predictions (the initial G-computation predictions) and whose only independent variables are the clever covariates H. The fluctuation parameter ϵ represents how large an adjustment we make to the initial estimate (Step 5).
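In symbols, the fluctuation model fitted by the glm() call below is
\[ \operatorname{logit}\, E(Y^{*} \mid A, L) = \operatorname{logit}\, \bar{Q}^{0}(A, L) + \epsilon_1 \frac{I(A=1)}{g(A=1|L)} + \epsilon_0 \frac{I(A=0)}{g(A=0|L)} \]
where the offset term is the logit of the initial SuperLearner predictions (init.Pred), held fixed so that only ϵ1 and ϵ0 are estimated.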
eps_mod <- glm(Y.bounded ~ -1 + H.A1L + H.A0L + #Using two H covariates
offset(qlogis(init.Pred)), #initial g compute
family = "binomial",
data = ObsData)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# -1 removes the intercept; H.A1L and H.A0L are the two clever covariates
# qlogis() is the logit function: qlogis(0.2) == -1.39
# plogis() is its inverse: plogis(-1.39) == 0.2
# Y.bounded is the outcome rescaled to [0, 1]
# the offset supplies one fixed value per observation (n = 5735),
# so only the two epsilon coefficients are estimated
epsilon <- coef(eps_mod)
epsilon
## H.A1L H.A0L
## 0.01568808 0.02070037
epsilon["H.A1L"]
## H.A1L
## 0.01568808
Note that if init.Pred included values outside [0, 1] (for example, negative values), qlogis() would produce NaNs; this is why Y.bounded was scaled to lie between 0 and 1.
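A quick illustration of that behaviour:
qlogis(c(-0.1, 0.5, 1.2)) # returns NaN 0 NaN, with a "NaNs produced" warning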
We can now use epsilon["H.A1L"] and epsilon["H.A0L"] to update the initial predictions (Step 6):
ObsData$Pred.Y1.update <- plogis(qlogis(ObsData$Pred.Y1) +
epsilon["H.A1L"]*ObsData$H.A1L) #adjusted by ps weighting
ObsData$Pred.Y0.update <- plogis(qlogis(ObsData$Pred.Y0) +
epsilon["H.A0L"]*ObsData$H.A0L)
summary(ObsData$Pred.Y1.update)
## V1
## Min. :0.001031
## 1st Qu.:0.042745
## Median :0.054882
## Mean :0.053810
## 3rd Qu.:0.065188
## Max. :0.139189
summary(ObsData$Pred.Y0.update)
## V1
## Min. :0.00100
## 1st Qu.:0.03603
## Median :0.04779
## Mean :0.04686
## 3rd Qu.:0.05809
## Max. :0.12652
Now that the updated outcome predictions are calculated, we can compute the ATE (Step 7).
ATE.TMLE.bounded.vector <- ObsData$Pred.Y1.update -
ObsData$Pred.Y0.update
ATE.TMLE.bounded <- mean(ATE.TMLE.bounded.vector,
na.rm = TRUE)
ATE.TMLE.bounded
## [1] 0.006953925
# close to the initial G-computation result (mean Pred.TE = 0.00713)
We transform the estimate back to the original outcome scale (Step 8).
ATE.TMLE <- (max.Y-min.Y)*ATE.TMLE.bounded
ATE.TMLE
## [1] 2.725938
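That is, on the original scale, the TMLE estimate suggests that RHC is associated with roughly a 2.7-day longer length of stay.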
Since the machine learning algorithms were used only in intermediary steps, rather than to estimate the parameter of interest directly, a 95% confidence interval can be computed from a closed-form formula (Step 9).
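The interval is based on the efficient influence curve (EIC) of the ATE, which the function below computes as D1 - D0:
\[ \hat{D}(O_i) = \frac{I(A_i=1)}{\hat{g}(A_i=1|L_i)}\bigl(Y_i - \bar{Q}^{*}(1, L_i)\bigr) + \bar{Q}^{*}(1, L_i) - \hat{E}(Y^{1}) - \frac{I(A_i=0)}{\hat{g}(A_i=0|L_i)}\bigl(Y_i - \bar{Q}^{*}(0, L_i)\bigr) - \bar{Q}^{*}(0, L_i) + \hat{E}(Y^{0}) \]
The variance of the ATE estimate is the sample variance of the EIC divided by n, and the 95% CI is the estimate plus or minus 1.96 standard errors.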
ci.estimate <- function(data = ObsData, H.AL.components = 2){
min.Y <- min(data$Y)
max.Y <- max(data$Y)
# transform predicted outcomes back to original scale
if (H.AL.components == 2){
data$Pred.Y1.update.rescaled <-
(max.Y- min.Y)*data$Pred.Y1.update + min.Y
data$Pred.Y0.update.rescaled <-
(max.Y- min.Y)*data$Pred.Y0.update + min.Y
}
if (H.AL.components == 1) { # requires Pred.Y1.update1 etc. from the single clever covariate version
data$Pred.Y1.update.rescaled <-
(max.Y- min.Y)*data$Pred.Y1.update1 + min.Y
data$Pred.Y0.update.rescaled <-
(max.Y- min.Y)*data$Pred.Y0.update1 + min.Y
}
EY1_TMLE1 <- mean(data$Pred.Y1.update.rescaled,
na.rm = TRUE)
EY0_TMLE1 <- mean(data$Pred.Y0.update.rescaled,
na.rm = TRUE)
# ATE efficient influence curve
D1 <- data$A/data$PS.SL*
(data$Y - data$Pred.Y1.update.rescaled) +
data$Pred.Y1.update.rescaled - EY1_TMLE1
D0 <- (1 - data$A)/(1 - data$PS.SL)*
(data$Y - data$Pred.Y0.update.rescaled) +
data$Pred.Y0.update.rescaled - EY0_TMLE1
EIC <- D1 - D0
# ATE variance
n <- nrow(data)
varHat.IC <- var(EIC, na.rm = TRUE)/n
# ATE 95% CI
if (H.AL.components == 2) {
ATE.TMLE.CI <- c(ATE.TMLE - 1.96*sqrt(varHat.IC),
ATE.TMLE + 1.96*sqrt(varHat.IC))
}
if (H.AL.components == 1) {
ATE.TMLE.CI <- c(ATE.TMLE1 - 1.96*sqrt(varHat.IC),
ATE.TMLE1 + 1.96*sqrt(varHat.IC))
}
return(ATE.TMLE.CI)
}
Treatment effect 95% confidence interval:
CI2 <- ci.estimate(data = ObsData, H.AL.components = 2)
CI2
## [1] 1.585187 3.866689
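Putting the point estimate and interval together (a convenience line, not in the original code):
round(c(ATE = ATE.TMLE, lower.95 = CI2[1], upper.95 = CI2[2]), 3)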