library(ggplot2)
library(data.table)
library(grf)
library(ivreg)

Power simulations when eligiblity as instrument and TIR as outcome.

full_dt = fread("~/Downloads/CGM/deid_daily_timeseries.csv")

# Aggregate to week
week_dt = full_dt[, .(
  p_worn = mean(fcoalesce(p_worn,0)),
  TIR = mean(TIR, na.rm=T),
  months_since_onset = min(months_since_onset),
  epic_read = max(fcoalesce(epic_read,0L)),
  public_insurance = max(public_insurance)
  ), by=c('pop','pat_id','obs_week_id')]


# Get lag and lead TIR
setorder(week_dt, pop, pat_id, obs_week_id)
week_dt[,`:=`(
  lag_TIR = shift(TIR, n=1, type="lag"),
  lag_p_worn = shift(p_worn, n=1, type="lag"),
  lead_TIR = shift(TIR, n=1, type="lead"),
  lead_p_worn = shift(p_worn, n=1, type="lead")
)]

Estimate P(contact | TIR) and use this for simulation

ggplot(week_dt[p_worn>0.5 & months_since_onset<=12], aes(x=TIR, y=epic_read)) + geom_smooth() + expand_limits(y=0) + theme_bw() + ylab("Actual P(contact)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

loess_fit = loess(epic_read ~ TIR, data=week_dt[p_worn>0.5 & months_since_onset<=12]) 

plot_dt = data.table(TIR=seq(0,1,0.01))
plot_dt[, p_contact := predict(loess_fit, plot_dt)]

ggplot(plot_dt, aes(x=TIR, y=p_contact)) + geom_line() + expand_limits(y=0) + theme_bw() + ylab("Estimated P(contact)")

Let’s randomize eligibility among patients 12+ months after diagnosis. Then sample contacts using loess fit of P(contact | TIR) from patients <12 months since diagnosis. Finally apply a treatment effect of 5pp TIR improvement when contacts happen.

week_dt[months_since_onset>12, .(n_pats = uniqueN(pat_id), n_obs= .N)]
##    n_pats n_obs
## 1:    149  8706
sim_dt = week_dt[months_since_onset>12 & lag_p_worn>0.5 & lead_p_worn>0.5, .(pat_id, obs_week_id, lag_TIR, lead_TIR, TIR, public_insurance)]

# Randomize contacts
sim_dt[, eligible_given_week := (pat_id + obs_week_id) %% 4 == 0]

# Sample contacts
sim_dt[, p_contact := predict(loess_fit, sim_dt)]
sim_dt[eligible_given_week==0, contact := 0]
set.seed(1814); sim_dt[eligible_given_week==1, contact := rbinom(.N, 1, fcoalesce(p_contact,0))]

ggplot(sim_dt[eligible_given_week==1], aes(x=TIR, y=contact)) + geom_smooth() + expand_limits(y=0) + theme_bw() + ylab("Simulation actual P(contact)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

# Apply treatment effect
sim_dt[, lead_TIR_adj := ifelse(contact==1, pmin(1,lead_TIR+0.05), lead_TIR)]
sim_dt[, .( .N, mTIR =  mean(lead_TIR_adj)), by='contact']
##    contact    N      mTIR
## 1:       0 6647 0.6403138
## 2:       1  377 0.6721714
sim_dt[, .( .N, mTIR = mean(lead_TIR_adj)), by='eligible_given_week']
##    eligible_given_week    N      mTIR
## 1:               FALSE 5270 0.6386476
## 2:                TRUE 1754 0.6521675

Linear IV Regressions and Reduced Form

# Without controls
m_iv <- ivreg(lead_TIR_adj ~ contact | eligible_given_week, data = sim_dt)
summary(m_iv)
## 
## Call:
## ivreg(formula = lead_TIR_adj ~ contact | eligible_given_week, 
##     data = sim_dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64954 -0.12284  0.01569  0.13445  0.36135 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.638648   0.002475  258.08  < 2e-16 ***
## contact     0.062902   0.023040    2.73  0.00635 ** 
## 
## Diagnostic tests:
##                   df1  df2 statistic p-value    
## Weak instruments    1 7022  1442.429  <2e-16 ***
## Wu-Hausman          1 7021     2.192   0.139    
## Sargan              0   NA        NA      NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1796 on 7022 degrees of freedom
## Multiple R-Squared: 8.053e-05,   Adjusted R-squared: -6.187e-05 
## Wald test: 7.454 on 1 and 7022 DF,  p-value: 0.006347
# With controls
m_iv_wc <- ivreg(lead_TIR_adj ~ contact + TIR + lag_TIR | eligible_given_week + TIR + lag_TIR, data = sim_dt)
summary(m_iv_wc)
## 
## Call:
## ivreg(formula = lead_TIR_adj ~ contact + TIR + lag_TIR | eligible_given_week + 
##     TIR + lag_TIR, data = sim_dt)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.834312 -0.051976  0.006119  0.057951  0.644598 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.075280   0.004833  15.577  < 2e-16 ***
## contact     0.058242   0.012869   4.526 6.12e-06 ***
## TIR         0.467529   0.011933  39.181  < 2e-16 ***
## lag_TIR     0.412866   0.012109  34.096  < 2e-16 ***
## 
## Diagnostic tests:
##                   df1  df2 statistic p-value    
## Weak instruments    1 7010  1444.982  <2e-16 ***
## Wu-Hausman          1 7009     0.008   0.929    
## Sargan              0   NA        NA      NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1004 on 7010 degrees of freedom
## Multiple R-Squared: 0.6874,  Adjusted R-squared: 0.6873 
## Wald test:  5107 on 3 and 7010 DF,  p-value: < 2.2e-16
# Reduced form
rlm = lm(lead_TIR_adj ~ eligible_given_week, data = sim_dt)
summary(rlm)
## 
## Call:
## lm(formula = lead_TIR_adj ~ eligible_given_week, data = sim_dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63865 -0.12223  0.01604  0.13460  0.36135 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.638648   0.002473 258.204  < 2e-16 ***
## eligible_given_weekTRUE 0.013520   0.004950   2.731  0.00632 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1796 on 7022 degrees of freedom
## Multiple R-squared:  0.001061,   Adjusted R-squared:  0.0009191 
## F-statistic: 7.461 on 1 and 7022 DF,  p-value: 0.006321

GRF’s Instrumental Variable Forest

ivf = instrumental_forest(
  X = sim_dt[,.(lag_TIR, TIR)],
  Y = sim_dt[,lead_TIR_adj],
  W = sim_dt[,contact],
  Z = sim_dt[,eligible_given_week],
  #tune.parameters = "all",
  # From tuning
  sample.fraction = 0.4709779,
  mtry = 1,
  min.node.size = 36,
  honesty.fraction = 0.55761,
  honesty.prune.leaves = FALSE,
  alpha = 0.2485017,
  imbalance.penalty = 0.5069303,
  # Seed for reproducibility 
  seed = 1814
)

# ivf$tuning.output$status
# ivf$tuning.output$params
sim_dt[, w_hat := ivf$W.hat]

ggplot(melt(sim_dt[,.(w_hat = w_hat*4, p_contact, TIR)], id.vars = c('TIR')),
       aes(x=TIR, y=value, color=variable)) + geom_smooth() + theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 20 rows containing non-finite values (stat_smooth).

IV Forest sort of recovered true P(contact | TIR) but it’s definitely not perfect.

Let’s look at LATE

average_treatment_effect(ivf)
##   estimate    std.err 
## 0.06282860 0.01445016

Recovered LATE of ~5pp! But SE is ~1.4pp, so not small.

We will need ti be able to predict TIR better in the absence of contact to detect smaller effect sizes or HTEs.

Let’s try simulating hetereogeneous treatment effects now

Let’s assume publicly insured patients have LATE 10pp and non-publicly insured have LATE 5pp.

sim_dt[,lead_TIR_adj_hte := lead_TIR_adj + (contact*public_insurance)*0.05]

ivf_hte = instrumental_forest(
  X = sim_dt[,.(lag_TIR, TIR, public_insurance)],
  Y = sim_dt[,lead_TIR_adj_hte],
  W = sim_dt[,contact],
  Z = sim_dt[,eligible_given_week],
  # tune.parameters = "all",
  # From tuning
  sample.fraction = 0.4983636,
  mtry = 3,
  min.node.size = 46,
  honesty.fraction = 0.6754995,
  honesty.prune.leaves = TRUE,
  alpha = 0.2273693,
  imbalance.penalty = 0.3267822,
  # Seed for reproducibility 
  seed = 1814
)

# ivf_hte$tuning.output$status
# ivf_hte$tuning.output$params

average_treatment_effect(ivf_hte)
##   estimate    std.err 
## 0.06565841 0.01386774
sim_dt[, ivf_hte_scores := get_scores(ivf_hte)]

sim_dt[, .(n=.N, pe=mean(ivf_hte_scores), se=sd(ivf_hte_scores)/sqrt(.N) ), by='public_insurance']
##    public_insurance    n         pe         se
## 1:                1  866 0.08844954 0.04151141
## 2:                0 6158 0.06320305 0.01481226
average_treatment_effect(ivf_hte, subset=sim_dt[,public_insurance==0])
##   estimate    std.err 
## 0.06330580 0.01478893
average_treatment_effect(ivf_hte, subset=sim_dt[,public_insurance==1])
##   estimate    std.err 
## 0.09262759 0.04191611

Sort of recovered 5pp for non-publicly insured and 10pp for publicly insured.. But SEs are too large to say the groups have different ATEs.

We will need to better predict baseline TIR to detect differences in effect sizes.