library(ggplot2)
library(data.table)
library(grf)
library(ivreg)
full_dt = fread("~/Downloads/CGM/deid_daily_timeseries.csv")
# Aggregate to week
week_dt = full_dt[, .(
p_worn = mean(fcoalesce(p_worn,0)),
TIR = mean(TIR, na.rm=T),
months_since_onset = min(months_since_onset),
epic_read = max(fcoalesce(epic_read,0L)),
public_insurance = max(public_insurance)
), by=c('pop','pat_id','obs_week_id')]
# Get lag and lead TIR
setorder(week_dt, pop, pat_id, obs_week_id)
week_dt[,`:=`(
lag_TIR = shift(TIR, n=1, type="lag"),
lag_p_worn = shift(p_worn, n=1, type="lag"),
lead_TIR = shift(TIR, n=1, type="lead"),
lead_p_worn = shift(p_worn, n=1, type="lead")
)]
Estimate P(contact | TIR) and use this for simulation
ggplot(week_dt[p_worn>0.5 & months_since_onset<=12], aes(x=TIR, y=epic_read)) + geom_smooth() + expand_limits(y=0) + theme_bw() + ylab("Actual P(contact)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
loess_fit = loess(epic_read ~ TIR, data=week_dt[p_worn>0.5 & months_since_onset<=12])
plot_dt = data.table(TIR=seq(0,1,0.01))
plot_dt[, p_contact := predict(loess_fit, plot_dt)]
ggplot(plot_dt, aes(x=TIR, y=p_contact)) + geom_line() + expand_limits(y=0) + theme_bw() + ylab("Estimated P(contact)")
Let’s randomize eligibility among patients 12+ months after diagnosis. Then sample contacts using loess fit of P(contact | TIR) from patients <12 months since diagnosis. Finally apply a treatment effect of 5pp TIR improvement when contacts happen.
week_dt[months_since_onset>12, .(n_pats = uniqueN(pat_id), n_obs= .N)]
## n_pats n_obs
## 1: 149 8706
sim_dt = week_dt[months_since_onset>12 & lag_p_worn>0.5 & lead_p_worn>0.5, .(pat_id, obs_week_id, lag_TIR, lead_TIR, TIR, public_insurance)]
# Randomize contacts
sim_dt[, eligible_given_week := (pat_id + obs_week_id) %% 4 == 0]
# Sample contacts
sim_dt[, p_contact := predict(loess_fit, sim_dt)]
sim_dt[eligible_given_week==0, contact := 0]
set.seed(1814); sim_dt[eligible_given_week==1, contact := rbinom(.N, 1, fcoalesce(p_contact,0))]
ggplot(sim_dt[eligible_given_week==1], aes(x=TIR, y=contact)) + geom_smooth() + expand_limits(y=0) + theme_bw() + ylab("Simulation actual P(contact)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
# Apply treatment effect
sim_dt[, lead_TIR_adj := ifelse(contact==1, pmin(1,lead_TIR+0.05), lead_TIR)]
sim_dt[, .( .N, mTIR = mean(lead_TIR_adj)), by='contact']
## contact N mTIR
## 1: 0 6647 0.6403138
## 2: 1 377 0.6721714
sim_dt[, .( .N, mTIR = mean(lead_TIR_adj)), by='eligible_given_week']
## eligible_given_week N mTIR
## 1: FALSE 5270 0.6386476
## 2: TRUE 1754 0.6521675
Linear IV Regressions and Reduced Form
# Without controls
m_iv <- ivreg(lead_TIR_adj ~ contact | eligible_given_week, data = sim_dt)
summary(m_iv)
##
## Call:
## ivreg(formula = lead_TIR_adj ~ contact | eligible_given_week,
## data = sim_dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64954 -0.12284 0.01569 0.13445 0.36135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.638648 0.002475 258.08 < 2e-16 ***
## contact 0.062902 0.023040 2.73 0.00635 **
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 1 7022 1442.429 <2e-16 ***
## Wu-Hausman 1 7021 2.192 0.139
## Sargan 0 NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1796 on 7022 degrees of freedom
## Multiple R-Squared: 8.053e-05, Adjusted R-squared: -6.187e-05
## Wald test: 7.454 on 1 and 7022 DF, p-value: 0.006347
# With controls
m_iv_wc <- ivreg(lead_TIR_adj ~ contact + TIR + lag_TIR | eligible_given_week + TIR + lag_TIR, data = sim_dt)
summary(m_iv_wc)
##
## Call:
## ivreg(formula = lead_TIR_adj ~ contact + TIR + lag_TIR | eligible_given_week +
## TIR + lag_TIR, data = sim_dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.834312 -0.051976 0.006119 0.057951 0.644598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.075280 0.004833 15.577 < 2e-16 ***
## contact 0.058242 0.012869 4.526 6.12e-06 ***
## TIR 0.467529 0.011933 39.181 < 2e-16 ***
## lag_TIR 0.412866 0.012109 34.096 < 2e-16 ***
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 1 7010 1444.982 <2e-16 ***
## Wu-Hausman 1 7009 0.008 0.929
## Sargan 0 NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1004 on 7010 degrees of freedom
## Multiple R-Squared: 0.6874, Adjusted R-squared: 0.6873
## Wald test: 5107 on 3 and 7010 DF, p-value: < 2.2e-16
# Reduced form
rlm = lm(lead_TIR_adj ~ eligible_given_week, data = sim_dt)
summary(rlm)
##
## Call:
## lm(formula = lead_TIR_adj ~ eligible_given_week, data = sim_dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63865 -0.12223 0.01604 0.13460 0.36135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.638648 0.002473 258.204 < 2e-16 ***
## eligible_given_weekTRUE 0.013520 0.004950 2.731 0.00632 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1796 on 7022 degrees of freedom
## Multiple R-squared: 0.001061, Adjusted R-squared: 0.0009191
## F-statistic: 7.461 on 1 and 7022 DF, p-value: 0.006321
GRF’s Instrumental Variable Forest
ivf = instrumental_forest(
X = sim_dt[,.(lag_TIR, TIR)],
Y = sim_dt[,lead_TIR_adj],
W = sim_dt[,contact],
Z = sim_dt[,eligible_given_week],
#tune.parameters = "all",
# From tuning
sample.fraction = 0.4709779,
mtry = 1,
min.node.size = 36,
honesty.fraction = 0.55761,
honesty.prune.leaves = FALSE,
alpha = 0.2485017,
imbalance.penalty = 0.5069303,
# Seed for reproducibility
seed = 1814
)
# ivf$tuning.output$status
# ivf$tuning.output$params
sim_dt[, w_hat := ivf$W.hat]
ggplot(melt(sim_dt[,.(w_hat = w_hat*4, p_contact, TIR)], id.vars = c('TIR')),
aes(x=TIR, y=value, color=variable)) + geom_smooth() + theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 20 rows containing non-finite values (stat_smooth).
IV Forest sort of recovered true P(contact | TIR) but it’s definitely not perfect.
Let’s look at LATE
average_treatment_effect(ivf)
## estimate std.err
## 0.06282860 0.01445016
Recovered LATE of ~5pp! But SE is ~1.4pp, so not small.
We will need ti be able to predict TIR better in the absence of contact to detect smaller effect sizes or HTEs.
Let’s assume publicly insured patients have LATE 10pp and non-publicly insured have LATE 5pp.
sim_dt[,lead_TIR_adj_hte := lead_TIR_adj + (contact*public_insurance)*0.05]
ivf_hte = instrumental_forest(
X = sim_dt[,.(lag_TIR, TIR, public_insurance)],
Y = sim_dt[,lead_TIR_adj_hte],
W = sim_dt[,contact],
Z = sim_dt[,eligible_given_week],
# tune.parameters = "all",
# From tuning
sample.fraction = 0.4983636,
mtry = 3,
min.node.size = 46,
honesty.fraction = 0.6754995,
honesty.prune.leaves = TRUE,
alpha = 0.2273693,
imbalance.penalty = 0.3267822,
# Seed for reproducibility
seed = 1814
)
# ivf_hte$tuning.output$status
# ivf_hte$tuning.output$params
average_treatment_effect(ivf_hte)
## estimate std.err
## 0.06565841 0.01386774
sim_dt[, ivf_hte_scores := get_scores(ivf_hte)]
sim_dt[, .(n=.N, pe=mean(ivf_hte_scores), se=sd(ivf_hte_scores)/sqrt(.N) ), by='public_insurance']
## public_insurance n pe se
## 1: 1 866 0.08844954 0.04151141
## 2: 0 6158 0.06320305 0.01481226
average_treatment_effect(ivf_hte, subset=sim_dt[,public_insurance==0])
## estimate std.err
## 0.06330580 0.01478893
average_treatment_effect(ivf_hte, subset=sim_dt[,public_insurance==1])
## estimate std.err
## 0.09262759 0.04191611
Sort of recovered 5pp for non-publicly insured and 10pp for publicly insured.. But SEs are too large to say the groups have different ATEs.
We will need to better predict baseline TIR to detect differences in effect sizes.