library(data.table)
library(knitr)
library(kableExtra)
library(grf)
library(ggplot2)
raw_patients = fread('~/Downloads/CGM/RT-CGM Randomized Clinical Trial/DataTables/tblAPtSummary.csv')
raw_hba1c = fread('~/Downloads/CGM/RT-CGM Randomized Clinical Trial/DataTables/tblALabHbA1c.csv')

Heterogeneous treatment effects in CGM RCT

Treatment group was given CGM for six months, Control group was not. Then both groups had CGM for the following 6 months. Primary outcome: Change in HbA1c from study enrollment to end of phase 1 (six months later).

Study design: https://pubmed.ncbi.nlm.nih.gov/18828243/

More info: https://clinicaltrials.gov/ct2/show/NCT00406133

Paper with results: https://pubmed.ncbi.nlm.nih.gov/18779236/

Data: http://publicfiles.jaeb.org/jdrfapp/dataset/RT_CGM_Randomized_Clinical_Trial.zip

Patients in the control and treatment (CGM) groups

raw_patients[,.(n = .N), by = 'TxGroup'] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
TxGroup n
Control 219
RT-CGM 232

Calculate outcome: a1c change from beginning to end of phase 1

deltas_phase_1 = merge(
  raw_hba1c[Visit == "Randomization", .(PtID, a1c_baseline = LabA1cResult)],
  raw_hba1c[Visit == "Phase 1: 26 week", .(PtID, a1c_end_phase1 = LabA1cResult)],
  by = "PtID"
)
deltas_phase_1[, delta := a1c_end_phase1 - a1c_baseline]
deltas_phase_1[, summary(delta)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -2.700  -0.500  -0.100  -0.131   0.200   2.500       1
# Join with patient groups to estimate treatment effect
deltas_phase_1_t = merge(deltas_phase_1, raw_patients[,.(PtID, treated = ifelse(TxGroup == "RT-CGM", 1,0))], by = 'PtID')
deltas_phase_1_t[, .(avg_outcome = mean(delta, na.rm=T), .N), by = 'treated']  %>% kable() %>%  kable_styling(c("striped", "hover", "condensed"))
treated avg_outcome N
1 -0.2393013 229
0 -0.0145540 214

T test

t.test(deltas_phase_1_t[treated==1, delta], deltas_phase_1_t[treated==0, delta])
## 
##  Welch Two Sample t-test
## 
## data:  deltas_phase_1_t[treated == 1, delta] and deltas_phase_1_t[treated == 0, delta]
## t = -3.7944, df = 433.24, p-value = 0.000169
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3411625 -0.1083322
## sample estimates:
##   mean of x   mean of y 
## -0.23930131 -0.01455399

Estimate Conditional Average Treatment Effects (CATEs)

Patient Features:

  • pump: 1 if the patient has an insuling pump, 0 if they are injecting themselves.
  • female: 1 if patient is female
  • family_caregiver: 1 if the patient’s primary caregiver is a familymember, 0 if the patient is their own primary caregiver
  • highly_educated_caregiver: 1 if the primary caregiver has a Bachelors degree or higher.
# Encode features
raw_patients[, pump := ifelse(InsulinModality == "Pump", 1, 0)]
raw_patients[, female := ifelse(Gender == "F", 1, 0)]
raw_patients[, family_caregiver := ifelse(EduCareGvrP == "Subject", 0, 1)]
raw_patients[, highly_educated_caregiver := ifelse(EduCareGvrPEdu %in% c("Bachelors", "Masters", "Professional"), 1, 0)]
# Join patient features and outcomes
hte_feats_outcomes_phase1 = merge(
  deltas_phase_1[!is.na(delta)], 
  raw_patients[,.(
    PtID, treated = ifelse(TxGroup == "RT-CGM", 1,0),
    pump, female, family_caregiver, highly_educated_caregiver)], 
  by = 'PtID')

hte_feats_outcomes_phase1[, .(
  n = .N,
  mean_outcome = mean(delta),
  sd_outcome = sd(delta),
  pump = mean(pump),
  female = mean(female),
  family_caregiver = mean(family_caregiver),
  highly_educated_caregiver = mean(highly_educated_caregiver)
  ), by = 'treated'] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
treated n mean_outcome sd_outcome pump female family_caregiver highly_educated_caregiver
1 229 -0.2393013 0.6829576 0.8253275 0.5327511 0.5196507 0.7248908
0 213 -0.0145540 0.5598386 0.8028169 0.5680751 0.5352113 0.7511737
X = hte_feats_outcomes_phase1[, .(pump, female, family_caregiver, highly_educated_caregiver)]
Y = hte_feats_outcomes_phase1[, delta]
W = hte_feats_outcomes_phase1[, treated]

get_all_the_cates = function(X, W, Y) {
  
  # S-learner (OLS)
  dt = copy(X)
  dt[, treated := W]
  dt[, delta := Y]
  sf = lm(delta ~ treated + female + pump + family_caregiver + highly_educated_caregiver, data=dt)
  pred.sf.0 = predict(sf, dt[, .(treated=0, female, pump, family_caregiver, highly_educated_caregiver)])
  pred.sf.1 = predict(sf, dt[, .(treated=1, female, pump, family_caregiver, highly_educated_caregiver)])
  sl.cates = pred.sf.1 - pred.sf.0
  
  # T-learner (OLS)
  tf0 = lm(delta ~ female + pump + family_caregiver + highly_educated_caregiver, data=dt[treated==0])
  tf1 = lm(delta ~ female + pump + family_caregiver + highly_educated_caregiver, data=dt[treated==1])
  tf.preds.0 = predict(tf0, dt)
  tf.preds.1 = predict(tf1, dt)
  tl.cates = tf.preds.1 - tf.preds.0
  
  # Causal Forest
  cf = causal_forest(X, Y, W, tune.parameters = "all")
  cf.cates = predict(cf)$predictions
  
  # Return combined results
  return(list(
    dt = rbind(
    data.table(method = "S-learner", cate = sl.cates),
    data.table(method = "T-learner", cate = tl.cates),
    data.table(method = "Causal Forest", cate = cf.cates)),
    cf = cf))
}

cates = get_all_the_cates(X, W, Y)

Summary statistics of CATEs by method

cates$dt[, .(min = min(cate), mean = mean(cate), sd = sd(cate), max = max(cate)), by = 'method']  %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
method min mean sd max
S-learner -0.2227113 -0.2227113 0.0000000 -0.2227113
T-learner -0.6831823 -0.2234573 0.1990053 0.0523166
Causal Forest -0.3324368 -0.2221452 0.0702671 -0.1127619
Ordered predictions by CATE from T-learner
hte_feats_outcomes_phase1[, cate_cf := cates$cf$predictions]
hte_feats_outcomes_phase1[, cate_tl := cates$dt[method == "T-learner", cate]]

hte_feats_outcomes_phase1[, .(cate = mean(cate_tl)), by = .(pump, female, family_caregiver, highly_educated_caregiver)][order(cate)] %>%
    kable(digits=2) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
pump female family_caregiver highly_educated_caregiver cate
0 0 0 1 -0.68
0 0 0 0 -0.67
0 1 0 1 -0.53
0 1 0 0 -0.52
1 0 0 1 -0.43
1 0 0 0 -0.42
0 0 1 1 -0.36
0 0 1 0 -0.35
1 1 0 1 -0.28
1 1 0 0 -0.27
0 1 1 1 -0.21
0 1 1 0 -0.20
1 0 1 1 -0.11
1 0 1 0 -0.10
1 1 1 1 0.04
1 1 1 0 0.05

It’s interesting that the highest CATE is for those that inject insulin and are their own caregiver. Lowest CATEs are for those with a pump and a less educated family caregiver.

ATEs by CATE quartile from T-learner (Causal Forest performed poorly so omitting results)
hte_feats_outcomes_phase1[, tl_cate_qrt := findInterval(
  cate_tl, 
  hte_feats_outcomes_phase1[, quantile(cate_tl, seq(0,1,length.out=5), na.rm=T)],
  all.inside = T)]

rbindlist(lapply(hte_feats_outcomes_phase1[,unique(tl_cate_qrt)], function(x) {
  subset_dt = hte_feats_outcomes_phase1[tl_cate_qrt == x]
  m = lm(delta ~ treated, data = subset_dt)
  return(data.table(
    tl_cate_qrt = x,
    mean_cate = round(subset_dt[, mean(cate_tl, na.rm=TRUE)],4),
    n = subset_dt[, .N],
    p_treated = subset_dt[, mean(treated)],
    control_mean = round(subset_dt[, mean(delta, na.rm=TRUE)],4),
    cate = round(summary(m)$coefficients["treated","Estimate"],4),
    se = round(summary(m)$coefficients["treated","Std. Error"],4),
    t = round(summary(m)$coefficients["treated","t value"],4),
    p = round(summary(m)$coefficients["treated","Pr(>|t|)"],4)
  ))
}))[order(tl_cate_qrt)] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
tl_cate_qrt mean_cate n p_treated control_mean cate se t p
1 -0.4896 106 0.5471698 -0.0915 -0.5251 0.0935 -5.6165 0.0000
2 -0.2991 103 0.5339806 -0.1553 -0.2909 0.0957 -3.0413 0.0030
3 -0.1639 112 0.4732143 -0.0902 -0.1189 0.1181 -1.0074 0.3159
4 0.0190 121 0.5206612 -0.1826 -0.0031 0.1468 -0.0211 0.9832
Two-way split ITTs for each variable
itts_by_var = function(dt, var) {
  rbindlist(lapply(dt[,unique(get(var))], function(x) {
    subset_dt = dt[get(var) == x]
    m = lm(delta ~ treated, data = subset_dt)
    return(data.table(
      var = var,
      value = x,
      n = subset_dt[, .N],
      control_mean = round(subset_dt[, mean(delta, na.rm=TRUE)],4),
      cate = round(summary(m)$coefficients["treated","Estimate"],4),
      se = round(summary(m)$coefficients["treated","Std. Error"],4),
      t = round(summary(m)$coefficients["treated","t value"],4),
      p = round(summary(m)$coefficients["treated","Pr(>|t|)"],4)
    ))
  }))
}


rbindlist(lapply(colnames(X), function(x) itts_by_var(hte_feats_outcomes_phase1, x))) %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
var value n control_mean cate se t p
pump 1 360 -0.1531 -0.1768 0.0656 -2.6938 0.0074
pump 0 82 -0.0341 -0.4214 0.1409 -2.9916 0.0037
female 1 243 -0.1399 -0.1635 0.0868 -1.8834 0.0609
female 0 199 -0.1201 -0.3022 0.0797 -3.7918 0.0002
family_caregiver 0 209 -0.1301 -0.3874 0.0665 -5.8225 0.0000
family_caregiver 1 233 -0.1318 -0.0794 0.0954 -0.8318 0.4064
highly_educated_caregiver 1 326 -0.1267 -0.2242 0.0669 -3.3500 0.0009
highly_educated_caregiver 0 116 -0.1431 -0.2253 0.1289 -1.7480 0.0832
Try single causal tree
#remotes::install_github('susanathey/causalTree')
library(causalTree)
set.seed(1814)
dt = copy(hte_feats_outcomes_phase1)
eval_ind = sample.int(nrow(dt), round(0.5*nrow(dt)))

train_dt = dt[-eval_ind,]
eval_dt = dt[eval_ind,]

fmla_ct = "delta ~ female + pump + family_caregiver + highly_educated_caregiver"
ct_unpruned <- honest.causalTree(
  formula=fmla_ct,                # Define the model
  data=train_dt,                  # Subset used to create tree structure
  est_data=eval_dt,               # Which data set to use to estimate effects
  treatment=train_dt$treated,     # Splitting sample treatment variable
  est_treatment=eval_dt$treated,  # Estimation sample treatment variable
  split.Rule="CT",                # Define the splitting option
  cv.option="TOT",                # Cross validation options
  cp=0,                           # Complexity parameter
  split.Honest=TRUE,              # Use honesty when splitting
  cv.Honest=TRUE,                 # Use honesty when performing cross-validation
  minsize=10,                     # Min. number of treatment and control cases in each leaf
  HonestSampleSize=nrow(eval_dt)) # Num obs used in estimation after building the tree
## [1] 2
## [1] "CT"
# Table of cross-validated values by tuning parameter.
ct_cptable <- as.data.frame(ct_unpruned$cptable)

# Obtain optimal complexity parameter to prune tree.
selected_cp <- which.min(ct_cptable$xerror)
optim_cp_ct <- ct_cptable[selected_cp, "CP"]

# Prune the tree at optimal complexity parameter.
ct_pruned <- prune(tree=ct_unpruned, cp=optim_cp_ct)
library(rpart.plot)
rpart.plot(
  x=ct_pruned,        # Pruned tree
  type=3,             # Draw separate split labels for the left and right directions
  fallen=TRUE,        # Position the leaf nodes at the bottom of the graph
  leaf.round=1,       # Rounding of the corners of the leaf node boxes
  extra=100,          # Display the percentage of observations in the node
  box.palette="RdBu") # Palette for coloring the node