Propensity Score Calibration

Preprocess

Code
pckgs <- c("knitr", "tidyverse", "ggalt", "cobalt", "ggsci",
    "modEvA", "naniar", "DT", "Hmisc", "hrbrthemes", "summarytools", 
     "survival", "causalCmprsk") # "kableExtra",

ix=NULL
for (package in pckgs)
  if (!requireNamespace(package, quietly = TRUE)) ix <- c(ix, package)
if (length(ix)!=0)
{
pr <- "WARNING: Packages: "
for (i in ix) 
  pr <- paste(pr, i, sep="")
pr <- paste(pr, "\nhave to be installed in order to successfully compile the rest of the vignette. Please install these packages and compile again.", sep="")
cat(pr)
knitr::knit_exit()
}


for (package in pckgs)
  library(package, character.only=TRUE) 

opts_chunk$set(results = 'asis',      # Can also be set at the chunk-level
                 comment = NA,
                 prompt  = FALSE,
                 cache   = FALSE)
summarytools::st_options(plain.ascii = FALSE,        # Always use this option in Rmd documents
          style        = "rmarkdown",  # Always use this option in Rmd documents
          footnote     = NA,           # Makes html-rendered results more concise
          subtitle.emphasis = FALSE)   # Improves layout with some rmardown themes


Hmisc::getHdata(rhc) # loading data using the Hmisc package
rhc_raw <- rhc


miss.report <- rhc_raw %>% miss_var_summary()
kable(miss.report[1:10,]) # %>% kable_styling(bootstrap_options = "striped", full_width = F)
variable n_miss pct_miss
cat2 4535 79.0758500
adld3p 4296 74.9084568
urin1 3028 52.7986051
dthdte 2013 35.1002616
dschdte 1 0.0174368
cat1 0 0.0000000
ca 0 0.0000000
sadmdte 0 0.0000000
lstctdte 0 0.0000000
death 0 0.0000000
Code
rhc_cleaning1 <- rhc_raw %>%
  mutate(RHC = as.numeric(swang1 == "RHC"), 
         trt=swang1,
         E = ifelse(is.na(dthdte), 1, ifelse(dschdte==dthdte, 2, 1)), 
         Time = dschdte - sadmdte, #=time-to-"Discharge" (either alive or due to death in a hospital)
         T.death = ifelse(is.na(dthdte), lstctdte - sadmdte, dthdte - sadmdte), #=min(Time to death before or after discharge, Time to a last follow-up visit)
         D = ifelse(death =="No", 0, 1), # death indicator
         sex_Male = as.numeric(sex == "Male"),
         race_black = as.numeric(race == "black"),
         race_other = as.numeric(race == "other"),
         income_11_25K = as.numeric(income == "$11-$25k"),
         income_25_50K = as.numeric(income == "$25-$50k"),
         income_50K = as.numeric(income == "> $50k"),
         ninsclas_Private_Medicare = as.numeric(ninsclas == "Private & Medicare"),
         ninsclas_Medicare = as.numeric(ninsclas == "Medicare"),
         ninsclas_Medicare_Medicaid = as.numeric(ninsclas == "Medicare & Medicaid"),
         ninsclas_Medicaid = as.numeric(ninsclas == "Medicaid"),
         ninsclas_No_Insurance = as.numeric(ninsclas == "No insurance"),
# combine cat1 with cat2, i.e the primary disease category with the secondary disease category:         
         cat_CHF = as.numeric(cat1 == "CHF" | (!is.na(cat2))&(cat2 == "CHF") ),
         cat_Cirrhosis = as.numeric(cat1 == "Cirrhosis" | (!is.na(cat2))&(cat2 == "Cirrhosis")),
         cat_Colon_Cancer = as.numeric(cat1 == "Colon Cancer" | (!is.na(cat2))&(cat2 == "Colon Cancer")),
         cat_Coma = as.numeric(cat1 == "Coma" | (!is.na(cat2))&(cat2 == "Coma")),
         cat_COPD = as.numeric(cat1 == "COPD" | (!is.na(cat2))&(cat2 == "COPD")),
         cat_Lung_Cancer = as.numeric(cat1 == "Lung Cancer" | (!is.na(cat2))&(cat2 == "Lung Cancer")),
         cat_MOSF_Malignancy = as.numeric(cat1 == "MOSF w/Malignancy" | (!is.na(cat2))&(cat2 == "MOSF w/Malignancy")),
         cat_MOSF_Sepsis = as.numeric(cat1 == "MOSF w/Sepsis" | (!is.na(cat2))&(cat2 == "MOSF w/Sepsis")),
         dnr1_Yes = as.numeric(dnr1 == "Yes"),
         card_Yes = as.numeric(card == "Yes"),
         gastr_Yes = as.numeric(gastr == "Yes"),
         hema_Yes = as.numeric(hema == "Yes"),
         meta_Yes = as.numeric(meta == "Yes"),
         neuro_Yes = as.numeric(neuro == "Yes"),
         ortho_Yes = as.numeric(ortho == "Yes"),
         renal_Yes = as.numeric(renal == "Yes"),
         resp_Yes = as.numeric(resp == "Yes"),
         seps_Yes = as.numeric(seps == "Yes"),
         trauma_Yes = as.numeric(trauma == "Yes"),
         ca_Yes = as.numeric(ca == "Yes"),
         ca_Metastatic = as.numeric(ca == "Metastatic")
  )

# variables selection and data reordering:
rhc_full <- rhc_cleaning1 %>% 
  select(ptid, RHC, trt, Time, T.death, E, D, sex_Male, 
         age, edu, race_black, race_other, income_11_25K, income_25_50K, income_50K,
         ninsclas_Private_Medicare, ninsclas_Medicare, ninsclas_Medicare_Medicaid,
         ninsclas_Medicaid, ninsclas_No_Insurance, 
         cat_CHF, cat_Cirrhosis, cat_Colon_Cancer, cat_Coma, cat_COPD, cat_Lung_Cancer, 
         cat_MOSF_Malignancy, cat_MOSF_Sepsis,
         # diagnoses:
         dnr1_Yes, card_Yes, gastr_Yes, hema_Yes, meta_Yes, neuro_Yes, ortho_Yes, renal_Yes, 
         resp_Yes, seps_Yes, trauma_Yes,
         ca_Yes, ca_Metastatic,
         # lab tests:
         wtkilo1, hrt1, meanbp1, resp1, temp1,
         aps1, das2d3pc, scoma1, 
         surv2md1, alb1, bili1, crea1, hema1, paco21, 
         pafi1, ph1, pot1, sod1, wblc1,
         # all variables with "hx" are preexisting conditions: 
         amihx, cardiohx, chfhx, chrpulhx, dementhx, 
         gibledhx, immunhx, liverhx, malighx, psychhx, 
         renalhx, transhx, 
         death, sadmdte, dschdte, dthdte, lstctdte)

# omit 1 obs with missing discharge date for the length-of-stay analysis:
rhc <- rhc_full[!is.na(rhc_raw$dschdte),]
rhc_full$T.death.30 <- pmin(30, rhc_full$T.death)
rhc_full$D.30 <- rhc_full$D*ifelse(rhc_full$T.death <=30, 1, 0)


E <- as.factor(rhc$E)
levels(E) <- c("discharge", "in-hospital death")
t <- addmargins(table(E, useNA="no"))
kable(t) # %>% kable_styling(bootstrap_options = "striped", full_width = F)
E Freq
discharge 3704
in-hospital death 2030
Sum 5734
Code
D <- as.factor(rhc_full$D)
levels(D) <- c("censored", "died")
t <- addmargins(table(D, useNA="no"))
kable(t) # %>% kable_styling(bootstrap_options = "striped", full_width = F)
D Freq
censored 2013
died 3722
Sum 5735
Code
D.30 <- as.factor(rhc_full$D.30)
levels(D.30) <- c("censored", "died")
t <- addmargins(table(D.30, useNA="no"))
kable(t) #%>% kable_styling(bootstrap_options = "striped", full_width = F)
D.30 Freq
censored 3817
died 1918
Sum 5735
Code
t <- addmargins(table(rhc_full$trt, useNA="no"))
kable(t) #%>% kable_styling(bootstrap_options = "striped", full_width = F)
Var1 Freq
No RHC 3551
RHC 2184
Sum 5735
Code
covs.names <- c("age", "sex_Male", "edu", "race_black", "race_other",
                "income_11_25K", "income_25_50K", "income_50K", 
                "ninsclas_Private_Medicare", "ninsclas_Medicare", "ninsclas_Medicare_Medicaid",
                "ninsclas_Medicaid", "ninsclas_No_Insurance",
                "cat_CHF", "cat_Cirrhosis", "cat_Colon_Cancer", "cat_Coma", "cat_COPD", "cat_Lung_Cancer", 
                "cat_MOSF_Malignancy", "cat_MOSF_Sepsis", 
                "dnr1_Yes", "wtkilo1", "hrt1", "meanbp1", 
                "resp1", "temp1", 
                "card_Yes", "gastr_Yes", "hema_Yes", "meta_Yes", "neuro_Yes", "ortho_Yes", 
                "renal_Yes", "resp_Yes", "seps_Yes", "trauma_Yes", 
                "ca_Yes", "ca_Metastatic",
                "amihx", "cardiohx", "chfhx", "chrpulhx",
                "dementhx", "gibledhx", "immunhx", "liverhx", 
                "malighx", "psychhx", "renalhx", "transhx",
                "aps1", "das2d3pc", "scoma1", "surv2md1",
                "alb1", "bili1", "crea1", "hema1", "paco21", "pafi1", 
                "ph1", "pot1", "sod1", "wblc1")

Testing goodness-of-fit of a PS model

We use get.weights function in order to fit a PS model and estimate weights. The input argument wtype="stab.ATE" requests calculation of stabilized ATE weights (@sw_hernan):

Code
form.txt <- paste0("RHC", " ~ ", paste0(covs.names, collapse = "+"))
trt.formula <- as.formula(form.txt)
ps.stab.ATE <- get.weights(formula=trt.formula, data=rhc, wtype = "stab.ATE") 

The estimated propensity scores are saved in the ps field of the output list ps.stab.ATE, and the summary.glm field returns the summary of the fitted logistic regression. The requested weights are saved in the w field.

We can verify the goodness-of-fit (GOF) of the treatment assignment model both graphically and using the formal test of @HL, where we check whether the model-based predicted probabilities of treatment assignment are similar to the corresponding empirical probabilities:

Code
gof <- HLfit(obs=rhc$RHC, pred=ps.stab.ATE$ps, n.bins=50, bin.method = "quantiles", plot.bin.size=FALSE)

Visual test of goodness-of-fit of a PS model and Hosmer-Lemeshow test.

Ideally, all the points should fall on the diagonal line. In our example, as the Hosmer and Lemeshow’s test shows, the logistic regression model with all the main effects does not fit the data well (\(pvalue<0.001\)), although visually the estimates fluctuate around the reference line.

For comparison, if we omit all the covariates that are insignificant at the 0.05 level from the original model, the new model perfectly fits the data:

Code
ind <- ps.stab.ATE$summary.glm$coefficients[,4]<0.05
form.txt <- paste0("RHC", " ~ ", paste0(covs.names[ind], collapse = "+"))
trt.formula.1 <- as.formula(form.txt)
ps.stab.ATE.2 <- get.weights(formula=trt.formula.1, data=rhc, wtype = "stab.ATE")
gof <- HLfit(obs=rhc$RHC, pred=ps.stab.ATE.2$ps, n.bins=50, bin.method = "quantiles", plot.bin.size=FALSE)

Checking goodness-of-fit of a PS model with selection of significant covariates.

However, we notice that:

  1. Although the omission of all insignificant covariates resulted in a better fit, using this selective model for weighting can leave the excluded covariates imbalanced, and moreover this might even distort the initially existing balance in them, as we can see further on Figure @ref(fig:figcompare) which corresponds to the PS model with only significant covariates. The GOF criterion optimizes the loss function which does not necessarily goes along with balancing of important confounders.

  2. Another potential “danger” of keeping only strong predictors of the treatment assingment in the PS model is an issue of possible bias amplification - see, for example, @Ding.

In summary, a good PS model cannot be built based only on the covariates-“treatment assignment” relationship. It should incorporate extra knowledge on the covariates-outcome relationship and other aspects of the data generating processes.

In what follows, we will proceed with the PS model that includes all the available covariates.

Calibration by rtichoke

Code
library(rtichoke)

create_calibration_curve(
  probs = list("Full Model" = ps.stab.ATE$ps, "Reduced Model" = ps.stab.ATE.2$ps),
  reals = list(rhc$RHC),
  type = "discrete",
  size = 500
)
Code
library(rtichoke)

create_calibration_curve(
  probs = list("Full Model" = ps.stab.ATE$ps, "Reduced Model" = ps.stab.ATE.2$ps),
  reals = list(rhc$RHC),
  type = "smooth",
  size = 500
)