Propensity Score Calibration

Preprocess

Code

pckgs <- c("knitr", "tidyverse", "ggalt", "cobalt", "ggsci",
    "modEvA", "naniar", "DT", "Hmisc", "hrbrthemes", "summarytools", 
     "survival", "causalCmprsk") # "kableExtra",

ix=NULL
for (package in pckgs)
  if (!requireNamespace(package, quietly = TRUE)) ix <- c(ix, package)
if (length(ix)!=0)
{
pr <- "WARNING: Packages: "
for (i in ix) 
  pr <- paste(pr, i, sep="")
pr <- paste(pr, "\nhave to be installed in order to successfully compile the rest of the vignette. Please install these packages and compile again.", sep="")
cat(pr)
knitr::knit_exit()
}


for (package in pckgs)
  library(package, character.only=TRUE) 

opts_chunk$set(results = 'asis',      # Can also be set at the chunk-level
                 comment = NA,
                 prompt  = FALSE,
                 cache   = FALSE)
summarytools::st_options(plain.ascii = FALSE,        # Always use this option in Rmd documents
          style        = "rmarkdown",  # Always use this option in Rmd documents
          footnote     = NA,           # Makes html-rendered results more concise
          subtitle.emphasis = FALSE)   # Improves layout with some rmardown themes


Hmisc::getHdata(rhc) # loading data using the Hmisc package
rhc_raw <- rhc


miss.report <- rhc_raw %>% miss_var_summary()
kable(miss.report[1:10,]) # %>% kable_styling(bootstrap_options = "striped", full_width = F)

variable	n_miss	pct_miss
cat2	4535	79.0758500
adld3p	4296	74.9084568
urin1	3028	52.7986051
dthdte	2013	35.1002616
dschdte	1	0.0174368
cat1	0	0.0000000
ca	0	0.0000000
sadmdte	0	0.0000000
lstctdte	0	0.0000000
death	0	0.0000000

Code

rhc_cleaning1 <- rhc_raw %>%
  mutate(RHC = as.numeric(swang1 == "RHC"), 
         trt=swang1,
         E = ifelse(is.na(dthdte), 1, ifelse(dschdte==dthdte, 2, 1)), 
         Time = dschdte - sadmdte, #=time-to-"Discharge" (either alive or due to death in a hospital)
         T.death = ifelse(is.na(dthdte), lstctdte - sadmdte, dthdte - sadmdte), #=min(Time to death before or after discharge, Time to a last follow-up visit)
         D = ifelse(death =="No", 0, 1), # death indicator
         sex_Male = as.numeric(sex == "Male"),
         race_black = as.numeric(race == "black"),
         race_other = as.numeric(race == "other"),
         income_11_25K = as.numeric(income == "$11-$25k"),
         income_25_50K = as.numeric(income == "$25-$50k"),
         income_50K = as.numeric(income == "> $50k"),
         ninsclas_Private_Medicare = as.numeric(ninsclas == "Private & Medicare"),
         ninsclas_Medicare = as.numeric(ninsclas == "Medicare"),
         ninsclas_Medicare_Medicaid = as.numeric(ninsclas == "Medicare & Medicaid"),
         ninsclas_Medicaid = as.numeric(ninsclas == "Medicaid"),
         ninsclas_No_Insurance = as.numeric(ninsclas == "No insurance"),
# combine cat1 with cat2, i.e the primary disease category with the secondary disease category:         
         cat_CHF = as.numeric(cat1 == "CHF" | (!is.na(cat2))&(cat2 == "CHF") ),
         cat_Cirrhosis = as.numeric(cat1 == "Cirrhosis" | (!is.na(cat2))&(cat2 == "Cirrhosis")),
         cat_Colon_Cancer = as.numeric(cat1 == "Colon Cancer" | (!is.na(cat2))&(cat2 == "Colon Cancer")),
         cat_Coma = as.numeric(cat1 == "Coma" | (!is.na(cat2))&(cat2 == "Coma")),
         cat_COPD = as.numeric(cat1 == "COPD" | (!is.na(cat2))&(cat2 == "COPD")),
         cat_Lung_Cancer = as.numeric(cat1 == "Lung Cancer" | (!is.na(cat2))&(cat2 == "Lung Cancer")),
         cat_MOSF_Malignancy = as.numeric(cat1 == "MOSF w/Malignancy" | (!is.na(cat2))&(cat2 == "MOSF w/Malignancy")),
         cat_MOSF_Sepsis = as.numeric(cat1 == "MOSF w/Sepsis" | (!is.na(cat2))&(cat2 == "MOSF w/Sepsis")),
         dnr1_Yes = as.numeric(dnr1 == "Yes"),
         card_Yes = as.numeric(card == "Yes"),
         gastr_Yes = as.numeric(gastr == "Yes"),
         hema_Yes = as.numeric(hema == "Yes"),
         meta_Yes = as.numeric(meta == "Yes"),
         neuro_Yes = as.numeric(neuro == "Yes"),
         ortho_Yes = as.numeric(ortho == "Yes"),
         renal_Yes = as.numeric(renal == "Yes"),
         resp_Yes = as.numeric(resp == "Yes"),
         seps_Yes = as.numeric(seps == "Yes"),
         trauma_Yes = as.numeric(trauma == "Yes"),
         ca_Yes = as.numeric(ca == "Yes"),
         ca_Metastatic = as.numeric(ca == "Metastatic")
  )

# variables selection and data reordering:
rhc_full <- rhc_cleaning1 %>% 
  select(ptid, RHC, trt, Time, T.death, E, D, sex_Male, 
         age, edu, race_black, race_other, income_11_25K, income_25_50K, income_50K,
         ninsclas_Private_Medicare, ninsclas_Medicare, ninsclas_Medicare_Medicaid,
         ninsclas_Medicaid, ninsclas_No_Insurance, 
         cat_CHF, cat_Cirrhosis, cat_Colon_Cancer, cat_Coma, cat_COPD, cat_Lung_Cancer, 
         cat_MOSF_Malignancy, cat_MOSF_Sepsis,
         # diagnoses:
         dnr1_Yes, card_Yes, gastr_Yes, hema_Yes, meta_Yes, neuro_Yes, ortho_Yes, renal_Yes, 
         resp_Yes, seps_Yes, trauma_Yes,
         ca_Yes, ca_Metastatic,
         # lab tests:
         wtkilo1, hrt1, meanbp1, resp1, temp1,
         aps1, das2d3pc, scoma1, 
         surv2md1, alb1, bili1, crea1, hema1, paco21, 
         pafi1, ph1, pot1, sod1, wblc1,
         # all variables with "hx" are preexisting conditions: 
         amihx, cardiohx, chfhx, chrpulhx, dementhx, 
         gibledhx, immunhx, liverhx, malighx, psychhx, 
         renalhx, transhx, 
         death, sadmdte, dschdte, dthdte, lstctdte)

# omit 1 obs with missing discharge date for the length-of-stay analysis:
rhc <- rhc_full[!is.na(rhc_raw$dschdte),]
rhc_full$T.death.30 <- pmin(30, rhc_full$T.death)
rhc_full$D.30 <- rhc_full$D*ifelse(rhc_full$T.death <=30, 1, 0)


E <- as.factor(rhc$E)
levels(E) <- c("discharge", "in-hospital death")
t <- addmargins(table(E, useNA="no"))
kable(t) # %>% kable_styling(bootstrap_options = "striped", full_width = F)

E	Freq
discharge	3704
in-hospital death	2030
Sum	5734

Code

D <- as.factor(rhc_full$D)
levels(D) <- c("censored", "died")
t <- addmargins(table(D, useNA="no"))
kable(t) # %>% kable_styling(bootstrap_options = "striped", full_width = F)

D	Freq
censored	2013
died	3722
Sum	5735

Code

D.30 <- as.factor(rhc_full$D.30)
levels(D.30) <- c("censored", "died")
t <- addmargins(table(D.30, useNA="no"))
kable(t) #%>% kable_styling(bootstrap_options = "striped", full_width = F)

D.30	Freq
censored	3817
died	1918
Sum	5735

Code

t <- addmargins(table(rhc_full$trt, useNA="no"))
kable(t) #%>% kable_styling(bootstrap_options = "striped", full_width = F)

Var1	Freq
No RHC	3551
RHC	2184
Sum	5735

Code

covs.names <- c("age", "sex_Male", "edu", "race_black", "race_other",
                "income_11_25K", "income_25_50K", "income_50K", 
                "ninsclas_Private_Medicare", "ninsclas_Medicare", "ninsclas_Medicare_Medicaid",
                "ninsclas_Medicaid", "ninsclas_No_Insurance",
                "cat_CHF", "cat_Cirrhosis", "cat_Colon_Cancer", "cat_Coma", "cat_COPD", "cat_Lung_Cancer", 
                "cat_MOSF_Malignancy", "cat_MOSF_Sepsis", 
                "dnr1_Yes", "wtkilo1", "hrt1", "meanbp1", 
                "resp1", "temp1", 
                "card_Yes", "gastr_Yes", "hema_Yes", "meta_Yes", "neuro_Yes", "ortho_Yes", 
                "renal_Yes", "resp_Yes", "seps_Yes", "trauma_Yes", 
                "ca_Yes", "ca_Metastatic",
                "amihx", "cardiohx", "chfhx", "chrpulhx",
                "dementhx", "gibledhx", "immunhx", "liverhx", 
                "malighx", "psychhx", "renalhx", "transhx",
                "aps1", "das2d3pc", "scoma1", "surv2md1",
                "alb1", "bili1", "crea1", "hema1", "paco21", "pafi1", 
                "ph1", "pot1", "sod1", "wblc1")

Testing goodness-of-fit of a PS model

We use get.weights function in order to fit a PS model and estimate weights. The input argument wtype="stab.ATE" requests calculation of stabilized ATE weights (@sw_hernan):

Code

form.txt <- paste0("RHC", " ~ ", paste0(covs.names, collapse = "+"))
trt.formula <- as.formula(form.txt)
ps.stab.ATE <- get.weights(formula=trt.formula, data=rhc, wtype = "stab.ATE")

The estimated propensity scores are saved in the ps field of the output list ps.stab.ATE, and the summary.glm field returns the summary of the fitted logistic regression. The requested weights are saved in the w field.

We can verify the goodness-of-fit (GOF) of the treatment assignment model both graphically and using the formal test of @HL, where we check whether the model-based predicted probabilities of treatment assignment are similar to the corresponding empirical probabilities:

Code

gof <- HLfit(obs=rhc$RHC, pred=ps.stab.ATE$ps, n.bins=50, bin.method = "quantiles", plot.bin.size=FALSE)

Visual test of goodness-of-fit of a PS model and Hosmer-Lemeshow test.

Ideally, all the points should fall on the diagonal line. In our example, as the Hosmer and Lemeshow’s test shows, the logistic regression model with all the main effects does not fit the data well (\(pvalue<0.001\)), although visually the estimates fluctuate around the reference line.

For comparison, if we omit all the covariates that are insignificant at the 0.05 level from the original model, the new model perfectly fits the data:

Code

ind <- ps.stab.ATE$summary.glm$coefficients[,4]<0.05
form.txt <- paste0("RHC", " ~ ", paste0(covs.names[ind], collapse = "+"))
trt.formula.1 <- as.formula(form.txt)
ps.stab.ATE.2 <- get.weights(formula=trt.formula.1, data=rhc, wtype = "stab.ATE")
gof <- HLfit(obs=rhc$RHC, pred=ps.stab.ATE.2$ps, n.bins=50, bin.method = "quantiles", plot.bin.size=FALSE)

Checking goodness-of-fit of a PS model with selection of significant covariates.

However, we notice that:

Although the omission of all insignificant covariates resulted in a better fit, using this selective model for weighting can leave the excluded covariates imbalanced, and moreover this might even distort the initially existing balance in them, as we can see further on Figure @ref(fig:figcompare) which corresponds to the PS model with only significant covariates. The GOF criterion optimizes the loss function which does not necessarily goes along with balancing of important confounders.
Another potential “danger” of keeping only strong predictors of the treatment assingment in the PS model is an issue of possible bias amplification - see, for example, @Ding.

In summary, a good PS model cannot be built based only on the covariates-“treatment assignment” relationship. It should incorporate extra knowledge on the covariates-outcome relationship and other aspects of the data generating processes.

In what follows, we will proceed with the PS model that includes all the available covariates.

Calibration by rtichoke

Code

library(rtichoke)

create_calibration_curve(
  probs = list("Full Model" = ps.stab.ATE$ps, "Reduced Model" = ps.stab.ATE.2$ps),
  reals = list(rhc$RHC),
  type = "discrete",
  size = 500
)

Code

library(rtichoke)

create_calibration_curve(
  probs = list("Full Model" = ps.stab.ATE$ps, "Reduced Model" = ps.stab.ATE.2$ps),
  reals = list(rhc$RHC),
  type = "smooth",
  size = 500
)