library(data.table)
library(knitr)
library(kableExtra)
library(grf)
library(ggplot2)
raw_patients = fread('~/Downloads/CGM/RT-CGM Randomized Clinical Trial/DataTables/tblAPtSummary.csv')
raw_hba1c = fread('~/Downloads/CGM/RT-CGM Randomized Clinical Trial/DataTables/tblALabHbA1c.csv')
Treatment group was given CGM for six months, Control group was not. Then both groups had CGM for the following 6 months. Primary outcome: Change in HbA1c from study enrollment to end of phase 1 (six months later).
Study design: https://pubmed.ncbi.nlm.nih.gov/18828243/
More info: https://clinicaltrials.gov/ct2/show/NCT00406133
Paper with results: https://pubmed.ncbi.nlm.nih.gov/18779236/
Data: http://publicfiles.jaeb.org/jdrfapp/dataset/RT_CGM_Randomized_Clinical_Trial.zip
Patients in the control and treatment (CGM) groups
raw_patients[,.(n = .N), by = 'TxGroup'] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
| TxGroup | n |
|---|---|
| Control | 219 |
| RT-CGM | 232 |
Calculate outcome: a1c change from beginning to end of phase 1
deltas_phase_1 = merge(
raw_hba1c[Visit == "Randomization", .(PtID, a1c_baseline = LabA1cResult)],
raw_hba1c[Visit == "Phase 1: 26 week", .(PtID, a1c_end_phase1 = LabA1cResult)],
by = "PtID"
)
deltas_phase_1[, delta := a1c_end_phase1 - a1c_baseline]
deltas_phase_1[, summary(delta)]
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -2.700 -0.500 -0.100 -0.131 0.200 2.500 1
# Join with patient groups to estimate treatment effect
deltas_phase_1_t = merge(deltas_phase_1, raw_patients[,.(PtID, treated = ifelse(TxGroup == "RT-CGM", 1,0))], by = 'PtID')
deltas_phase_1_t[, .(avg_outcome = mean(delta, na.rm=T), .N), by = 'treated'] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
| treated | avg_outcome | N |
|---|---|---|
| 1 | -0.2393013 | 229 |
| 0 | -0.0145540 | 214 |
T test
t.test(deltas_phase_1_t[treated==1, delta], deltas_phase_1_t[treated==0, delta])
##
## Welch Two Sample t-test
##
## data: deltas_phase_1_t[treated == 1, delta] and deltas_phase_1_t[treated == 0, delta]
## t = -3.7944, df = 433.24, p-value = 0.000169
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3411625 -0.1083322
## sample estimates:
## mean of x mean of y
## -0.23930131 -0.01455399
Patient Features:
pump: 1 if the patient has an insuling pump, 0 if they are injecting themselves.female: 1 if patient is femalefamily_caregiver: 1 if the patient’s primary caregiver is a familymember, 0 if the patient is their own primary caregiverhighly_educated_caregiver: 1 if the primary caregiver has a Bachelors degree or higher.# Encode features
raw_patients[, pump := ifelse(InsulinModality == "Pump", 1, 0)]
raw_patients[, female := ifelse(Gender == "F", 1, 0)]
raw_patients[, family_caregiver := ifelse(EduCareGvrP == "Subject", 0, 1)]
raw_patients[, highly_educated_caregiver := ifelse(EduCareGvrPEdu %in% c("Bachelors", "Masters", "Professional"), 1, 0)]
# Join patient features and outcomes
hte_feats_outcomes_phase1 = merge(
deltas_phase_1[!is.na(delta)],
raw_patients[,.(
PtID, treated = ifelse(TxGroup == "RT-CGM", 1,0),
pump, female, family_caregiver, highly_educated_caregiver)],
by = 'PtID')
hte_feats_outcomes_phase1[, .(
n = .N,
mean_outcome = mean(delta),
sd_outcome = sd(delta),
pump = mean(pump),
female = mean(female),
family_caregiver = mean(family_caregiver),
highly_educated_caregiver = mean(highly_educated_caregiver)
), by = 'treated'] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
| treated | n | mean_outcome | sd_outcome | pump | female | family_caregiver | highly_educated_caregiver |
|---|---|---|---|---|---|---|---|
| 1 | 229 | -0.2393013 | 0.6829576 | 0.8253275 | 0.5327511 | 0.5196507 | 0.7248908 |
| 0 | 213 | -0.0145540 | 0.5598386 | 0.8028169 | 0.5680751 | 0.5352113 | 0.7511737 |
X = hte_feats_outcomes_phase1[, .(pump, female, family_caregiver, highly_educated_caregiver)]
Y = hte_feats_outcomes_phase1[, delta]
W = hte_feats_outcomes_phase1[, treated]
get_all_the_cates = function(X, W, Y) {
# S-learner (OLS)
dt = copy(X)
dt[, treated := W]
dt[, delta := Y]
sf = lm(delta ~ treated + female + pump + family_caregiver + highly_educated_caregiver, data=dt)
pred.sf.0 = predict(sf, dt[, .(treated=0, female, pump, family_caregiver, highly_educated_caregiver)])
pred.sf.1 = predict(sf, dt[, .(treated=1, female, pump, family_caregiver, highly_educated_caregiver)])
sl.cates = pred.sf.1 - pred.sf.0
# T-learner (OLS)
tf0 = lm(delta ~ female + pump + family_caregiver + highly_educated_caregiver, data=dt[treated==0])
tf1 = lm(delta ~ female + pump + family_caregiver + highly_educated_caregiver, data=dt[treated==1])
tf.preds.0 = predict(tf0, dt)
tf.preds.1 = predict(tf1, dt)
tl.cates = tf.preds.1 - tf.preds.0
# Causal Forest
cf = causal_forest(X, Y, W, tune.parameters = "all")
cf.cates = predict(cf)$predictions
# Return combined results
return(list(
dt = rbind(
data.table(method = "S-learner", cate = sl.cates),
data.table(method = "T-learner", cate = tl.cates),
data.table(method = "Causal Forest", cate = cf.cates)),
cf = cf))
}
cates = get_all_the_cates(X, W, Y)
Summary statistics of CATEs by method
cates$dt[, .(min = min(cate), mean = mean(cate), sd = sd(cate), max = max(cate)), by = 'method'] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
| method | min | mean | sd | max |
|---|---|---|---|---|
| S-learner | -0.2227113 | -0.2227113 | 0.0000000 | -0.2227113 |
| T-learner | -0.6831823 | -0.2234573 | 0.1990053 | 0.0523166 |
| Causal Forest | -0.3324368 | -0.2221452 | 0.0702671 | -0.1127619 |
hte_feats_outcomes_phase1[, cate_cf := cates$cf$predictions]
hte_feats_outcomes_phase1[, cate_tl := cates$dt[method == "T-learner", cate]]
hte_feats_outcomes_phase1[, .(cate = mean(cate_tl)), by = .(pump, female, family_caregiver, highly_educated_caregiver)][order(cate)] %>%
kable(digits=2) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| pump | female | family_caregiver | highly_educated_caregiver | cate |
|---|---|---|---|---|
| 0 | 0 | 0 | 1 | -0.68 |
| 0 | 0 | 0 | 0 | -0.67 |
| 0 | 1 | 0 | 1 | -0.53 |
| 0 | 1 | 0 | 0 | -0.52 |
| 1 | 0 | 0 | 1 | -0.43 |
| 1 | 0 | 0 | 0 | -0.42 |
| 0 | 0 | 1 | 1 | -0.36 |
| 0 | 0 | 1 | 0 | -0.35 |
| 1 | 1 | 0 | 1 | -0.28 |
| 1 | 1 | 0 | 0 | -0.27 |
| 0 | 1 | 1 | 1 | -0.21 |
| 0 | 1 | 1 | 0 | -0.20 |
| 1 | 0 | 1 | 1 | -0.11 |
| 1 | 0 | 1 | 0 | -0.10 |
| 1 | 1 | 1 | 1 | 0.04 |
| 1 | 1 | 1 | 0 | 0.05 |
It’s interesting that the highest CATE is for those that inject insulin and are their own caregiver. Lowest CATEs are for those with a pump and a less educated family caregiver.
hte_feats_outcomes_phase1[, tl_cate_qrt := findInterval(
cate_tl,
hte_feats_outcomes_phase1[, quantile(cate_tl, seq(0,1,length.out=5), na.rm=T)],
all.inside = T)]
rbindlist(lapply(hte_feats_outcomes_phase1[,unique(tl_cate_qrt)], function(x) {
subset_dt = hte_feats_outcomes_phase1[tl_cate_qrt == x]
m = lm(delta ~ treated, data = subset_dt)
return(data.table(
tl_cate_qrt = x,
mean_cate = round(subset_dt[, mean(cate_tl, na.rm=TRUE)],4),
n = subset_dt[, .N],
p_treated = subset_dt[, mean(treated)],
control_mean = round(subset_dt[, mean(delta, na.rm=TRUE)],4),
cate = round(summary(m)$coefficients["treated","Estimate"],4),
se = round(summary(m)$coefficients["treated","Std. Error"],4),
t = round(summary(m)$coefficients["treated","t value"],4),
p = round(summary(m)$coefficients["treated","Pr(>|t|)"],4)
))
}))[order(tl_cate_qrt)] %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
| tl_cate_qrt | mean_cate | n | p_treated | control_mean | cate | se | t | p |
|---|---|---|---|---|---|---|---|---|
| 1 | -0.4896 | 106 | 0.5471698 | -0.0915 | -0.5251 | 0.0935 | -5.6165 | 0.0000 |
| 2 | -0.2991 | 103 | 0.5339806 | -0.1553 | -0.2909 | 0.0957 | -3.0413 | 0.0030 |
| 3 | -0.1639 | 112 | 0.4732143 | -0.0902 | -0.1189 | 0.1181 | -1.0074 | 0.3159 |
| 4 | 0.0190 | 121 | 0.5206612 | -0.1826 | -0.0031 | 0.1468 | -0.0211 | 0.9832 |
itts_by_var = function(dt, var) {
rbindlist(lapply(dt[,unique(get(var))], function(x) {
subset_dt = dt[get(var) == x]
m = lm(delta ~ treated, data = subset_dt)
return(data.table(
var = var,
value = x,
n = subset_dt[, .N],
control_mean = round(subset_dt[, mean(delta, na.rm=TRUE)],4),
cate = round(summary(m)$coefficients["treated","Estimate"],4),
se = round(summary(m)$coefficients["treated","Std. Error"],4),
t = round(summary(m)$coefficients["treated","t value"],4),
p = round(summary(m)$coefficients["treated","Pr(>|t|)"],4)
))
}))
}
rbindlist(lapply(colnames(X), function(x) itts_by_var(hte_feats_outcomes_phase1, x))) %>% kable() %>% kable_styling(c("striped", "hover", "condensed"))
| var | value | n | control_mean | cate | se | t | p |
|---|---|---|---|---|---|---|---|
| pump | 1 | 360 | -0.1531 | -0.1768 | 0.0656 | -2.6938 | 0.0074 |
| pump | 0 | 82 | -0.0341 | -0.4214 | 0.1409 | -2.9916 | 0.0037 |
| female | 1 | 243 | -0.1399 | -0.1635 | 0.0868 | -1.8834 | 0.0609 |
| female | 0 | 199 | -0.1201 | -0.3022 | 0.0797 | -3.7918 | 0.0002 |
| family_caregiver | 0 | 209 | -0.1301 | -0.3874 | 0.0665 | -5.8225 | 0.0000 |
| family_caregiver | 1 | 233 | -0.1318 | -0.0794 | 0.0954 | -0.8318 | 0.4064 |
| highly_educated_caregiver | 1 | 326 | -0.1267 | -0.2242 | 0.0669 | -3.3500 | 0.0009 |
| highly_educated_caregiver | 0 | 116 | -0.1431 | -0.2253 | 0.1289 | -1.7480 | 0.0832 |
#remotes::install_github('susanathey/causalTree')
library(causalTree)
set.seed(1814)
dt = copy(hte_feats_outcomes_phase1)
eval_ind = sample.int(nrow(dt), round(0.5*nrow(dt)))
train_dt = dt[-eval_ind,]
eval_dt = dt[eval_ind,]
fmla_ct = "delta ~ female + pump + family_caregiver + highly_educated_caregiver"
ct_unpruned <- honest.causalTree(
formula=fmla_ct, # Define the model
data=train_dt, # Subset used to create tree structure
est_data=eval_dt, # Which data set to use to estimate effects
treatment=train_dt$treated, # Splitting sample treatment variable
est_treatment=eval_dt$treated, # Estimation sample treatment variable
split.Rule="CT", # Define the splitting option
cv.option="TOT", # Cross validation options
cp=0, # Complexity parameter
split.Honest=TRUE, # Use honesty when splitting
cv.Honest=TRUE, # Use honesty when performing cross-validation
minsize=10, # Min. number of treatment and control cases in each leaf
HonestSampleSize=nrow(eval_dt)) # Num obs used in estimation after building the tree
## [1] 2
## [1] "CT"
# Table of cross-validated values by tuning parameter.
ct_cptable <- as.data.frame(ct_unpruned$cptable)
# Obtain optimal complexity parameter to prune tree.
selected_cp <- which.min(ct_cptable$xerror)
optim_cp_ct <- ct_cptable[selected_cp, "CP"]
# Prune the tree at optimal complexity parameter.
ct_pruned <- prune(tree=ct_unpruned, cp=optim_cp_ct)
library(rpart.plot)
rpart.plot(
x=ct_pruned, # Pruned tree
type=3, # Draw separate split labels for the left and right directions
fallen=TRUE, # Position the leaf nodes at the bottom of the graph
leaf.round=1, # Rounding of the corners of the leaf node boxes
extra=100, # Display the percentage of observations in the node
box.palette="RdBu") # Palette for coloring the node