library(ggplot2)
library(data.table)
library(grf)
library(ivreg)
library(estimatr)

Power simulations when eligiblity as instrument and TIR as outcome.

full_dt = fread("~/Downloads/CGM/deid_daily_timeseries.csv")
full_dt = full_dt[pop == "Pilot"]


# Get lags and leads
setorder(full_dt, pop, pat_id, obs_id)

full_dt[,`:=`(
  TIR_7dr = frollmean(TIR, 7, na.rm=T, hasNA=TRUE),
  mean_glucose_7dr = frollmean(mean_glucose, 7, na.rm=T, hasNA=TRUE),
  hyper_7dr = frollmean(hyper, 7, na.rm=T, hasNA=TRUE),
  hypo1_7dr = frollmean(hypo1, 7, na.rm=T, hasNA=TRUE),
  hypo2_7dr = frollmean(hypo2, 7, na.rm=T, hasNA=TRUE),
  p_worn_7dr = frollmean(p_worn, 7),
  clinic_contact_last_7_days = frollapply(clinic_remote_contact, 7, function(x) {max(x,na.rm=T)}),
  visit_last_7_days = frollapply(visit, 7, function(x) {max(x,na.rm=T)})
  ), by=c('pop','pat_id')]

full_dt[, `:=`(
  TIR_7dr_7d_lead = shift(TIR_7dr, 7, type="lead"),
  mean_glucose_7dr_7d_lead = shift(mean_glucose_7dr, 7, type="lead"),
  p_worn_7dr_7d_lead = shift(p_worn_7dr, 7, type="lead")
), by=c('pop','pat_id')]


full_dt = full_dt[p_worn_7dr>0.5 & p_worn_7dr_7d_lead>0.5]

full_dt[, dTIR := TIR_7dr_7d_lead - TIR_7dr]
full_dt[, dMG := mean_glucose_7dr_7d_lead - mean_glucose_7dr]

full_dt[, dTIR_lag7 := shift(dTIR, 7, type="lag"), by = c('pop','pat_id')]

full_dt = full_dt[!is.na(dTIR) & !is.na(dMG) & !is.na(dTIR_lag7)]

full_dt[is.na(months_since_onset), months_since_onset:=-1]
full_dt[is.na(age), age:=-1]

full_dt[, week_idx_from_end := frank(-obs_week_id, ties.method = "dense"), by='pat_id']

full_dt[, weekday_id := as.numeric(as.factor(weekday))]

Use N weeks to sample observations for RF to make sure there’s no temporal leakage

# n_weeks is the number of weeks experiment will run, and we don't use those observations to fit the expected outcome model

get_rf_model_for_n_weeks = function(n_weeks, dt) {
  
  print(n_weeks)
  
  reg_dt = dt[epic_read==0 & week_idx_from_end > n_weeks]
  
  reg_X = model.matrix(~ -1 + age + pump + pat_id + public_insurance + sexF + poc + hisp + black + asian + nepl + dka_onset + antibody_antiins + weekday + months_since_onset + 
                         TIR_7dr + mean_glucose_7dr + hyper_7dr + hypo1_7dr + hypo2_7dr + p_worn_7dr + dTIR_lag7, data=reg_dt)
  
  rf = regression_forest(
    X = reg_X,
    Y = reg_dt[,dTIR],
    clusters = reg_dt[, as.factor(pat_id)],
    # tune.parameters = "all",
    # From tuning 
    sample.fraction = 0.2627555,
    mtry = 11,
    min.node.size = 2275,
    honesty.fraction = 0.59910440,
    honesty.prune.leaves = FALSE,
    alpha = 0.07020127,
    imbalance.penalty = 0.91454407,
    # Seed for reproducibility 
    seed = 1814
  )
  
  # print r-sq
  print(1-sd(reg_dt[,dTIR] - rf$predictions)/sd(reg_dt[,dTIR]))
  
  return(rf)
}


# fit all the models
N_WEEKS = c(6,12,26,52)
rf_models = lapply(N_WEEKS, get_rf_model_for_n_weeks, full_dt)
## [1] 6
## [1] 0.082347
## [1] 12
## [1] 0.08123004
## [1] 26
## [1] 0.07887717
## [1] 52
## [1] 0.07055156
# rf_models

P(contact | TIR) models

get_p_contact_model_for_n_weeks = function(n_weeks, dt) {
  return(loess(epic_read ~ TIR_7dr, data=dt[months_since_onset<=12 & week_idx_from_end > n_weeks]) )
}

# fit all the models
pc_models = lapply(N_WEEKS, get_p_contact_model_for_n_weeks, full_dt)
pc_models
## [[1]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <= 
##     12 & week_idx_from_end > n_weeks])
## 
## Number of Observations: 33964 
## Equivalent Number of Parameters: 4.88 
## Residual Standard Error: 0.2352 
## 
## [[2]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <= 
##     12 & week_idx_from_end > n_weeks])
## 
## Number of Observations: 33769 
## Equivalent Number of Parameters: 4.88 
## Residual Standard Error: 0.2356 
## 
## [[3]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <= 
##     12 & week_idx_from_end > n_weeks])
## 
## Number of Observations: 32988 
## Equivalent Number of Parameters: 4.85 
## Residual Standard Error: 0.2359 
## 
## [[4]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <= 
##     12 & week_idx_from_end > n_weeks])
## 
## Number of Observations: 27835 
## Equivalent Number of Parameters: 4.84 
## Residual Standard Error: 0.2269
test_power_smart_sample = function(dt, n_pats, n_weeks, s) {
  
  # P(contact | TIR) model
  loess_fit = pc_models[[which(N_WEEKS==n_weeks)]]
  
  # Get RF model and add preds
  rfm = rf_models[[which(N_WEEKS==n_weeks)]]
  
  # add preds
  mm_pred = model.matrix(~ -1 + age + pump + pat_id + public_insurance + sexF + poc + hisp + black + asian + nepl + dka_onset + antibody_antiins + weekday + months_since_onset +
                           TIR_7dr + mean_glucose_7dr + hyper_7dr + hypo1_7dr + hypo2_7dr + p_worn_7dr + dTIR_lag7, data=dt[epic_read==1 | week_idx_from_end <= n_weeks])
  dt[epic_read==0 & week_idx_from_end > n_weeks, dTIR_pred := rfm$predictions]
  dt[epic_read==1 | week_idx_from_end <= n_weeks, dTIR_pred := predict(rfm, mm_pred)$predictions]
  
  qualification_dt = dt[week_idx_from_end > n_weeks, .(
    p_tir_below_65p = mean(TIR_7dr<0.65),
    m_sq_error_dTIR_pred = mean((dTIR_pred - dTIR)^2)
    ), by='pat_id']

  qualification_dt = qualification_dt[,highly_variable := ifelse(p_tir_below_65p>0.25 & p_tir_below_65p < 0.75,1,0)]
  
  setorder(qualification_dt, -highly_variable, m_sq_error_dTIR_pred)
  
  pat_ids = qualification_dt$pat_id[1:n_pats]

  smart_small_sim_dt = copy(dt[pat_id %in% pat_ids & week_idx_from_end <= n_weeks])
  
  # Sample contacts
  
  smart_small_sim_dt[, p_contact := predict(loess_fit, smart_small_sim_dt)]
  
  smart_small_sim_dt[, contact_day := (pat_id+weekday_id) %% 7 == 0]
  
  set.seed(s); smart_small_sim_dt[contact_day==1 & TIR_7dr<0.65, contact_eligible := rbinom(.N, 1, 0.5)]
  
  set.seed(s); smart_small_sim_dt[contact_eligible==1, contact := rbinom(.N, 1, p_contact*7)]
  smart_small_sim_dt[is.na(contact), contact:=0]
  
  # Apply treatment effect
  smart_small_sim_dt[, dTIR_adj := ifelse(contact==1, pmin(1,dTIR+0.03), dTIR)]
  
  # Models
  m_iv_s <- lm_robust(dTIR_adj ~ contact, data = smart_small_sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
  m_iv_wc_s <- lm_robust(dTIR_adj ~ contact + dTIR_pred, data = smart_small_sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
  
  out_dt_nc = data.table(
    n_pats = n_pats,
    n_weeks = n_weeks,
    n_contacts = sum(smart_small_sim_dt$contact),
    type = "not controlling for expected dTIR",
    coef = summary(m_iv_s)$coefficients["contact","Estimate"],
    se = summary(m_iv_s)$coefficients["contact","Std. Error"],
    p = summary(m_iv_s)$coefficients["contact","Pr(>|t|)"])
  
  out_dt_c = data.table(
    n_pats = n_pats,
    n_weeks = n_weeks,
    n_contacts = sum(smart_small_sim_dt$contact),
    type = "controlling for expected dTIR",
    coef = summary(m_iv_wc_s)$coefficients["contact","Estimate"],
    se = summary(m_iv_wc_s)$coefficients["contact","Std. Error"],
    p = summary(m_iv_wc_s)$coefficients["contact","Pr(>|t|)"])
  
  return(rbind(out_dt_nc,out_dt_c))
}


# test_power_smart_sample(full_dt,20,12,1)

test_grid = expand.grid(
  n_pats = c(10,20,30,50,90),
  n_weeks = N_WEEKS
)

N_SIMS = 50
alpha = 0.05

smart_results = rbindlist(lapply(1:nrow(test_grid), function(i) {
  rbindlist(lapply(1:N_SIMS, function(s) test_power_smart_sample(full_dt, test_grid$n_pats[i], test_grid$n_weeks[i], s)))
}))

smart_results[, stat_sig := p < alpha]
smart_results_agg = smart_results[,.(
  power=mean(stat_sig, na.rm=T),
  m_coef = mean(coef),
  m_n_contacts = mean(n_contacts)), by=c('n_pats','n_weeks','type')]
# smart_results_agg

ggplot(smart_results_agg, aes(x=n_weeks, color=factor(n_pats), y=power, linetype=type)) + geom_line() + geom_point() + theme_bw() + geom_hline(yintercept = 0.5, linetype="dashed") + ylim(0,1)

Same but randomly sampling patients

test_power_random_sample = function(dt, n_pats, n_weeks, s) {
  
  # P(contact | TIR) model
  loess_fit = pc_models[[which(N_WEEKS==n_weeks)]]
  
  # Get RF model and add preds
  rfm = rf_models[[which(N_WEEKS==n_weeks)]]
  
  # add preds
  mm_pred = model.matrix(~ -1 + age + pump + pat_id + public_insurance + sexF + poc + hisp + black + asian + nepl + dka_onset + antibody_antiins + weekday + months_since_onset +
                           TIR_7dr + mean_glucose_7dr + hyper_7dr + hypo1_7dr + hypo2_7dr + p_worn_7dr + dTIR_lag7, data=dt[epic_read==1 | week_idx_from_end <= n_weeks])
  dt[epic_read==0 & week_idx_from_end > n_weeks, dTIR_pred := rfm$predictions]
  dt[epic_read==1 | week_idx_from_end <= n_weeks, dTIR_pred := predict(rfm, mm_pred)$predictions]
  
  set.seed(s); pat_ids = sample(dt[week_idx_from_end > n_weeks , unique(pat_id)], n_pats)

  sim_dt = copy(dt[pat_id %in% pat_ids & week_idx_from_end <= n_weeks])
  
  # Sample contacts
  
  sim_dt[, p_contact := predict(loess_fit, sim_dt)]
  
  sim_dt[, contact_day := (pat_id+weekday_id) %% 7 == 0]
  
  set.seed(s); sim_dt[contact_day==1 & TIR_7dr<0.65, contact_eligible := rbinom(.N, 1, 0.5)]
  
  set.seed(s); sim_dt[contact_eligible==1, contact := rbinom(.N, 1, p_contact*7)]
  sim_dt[is.na(contact), contact:=0]
  
  # Apply treatment effect
  sim_dt[, dTIR_adj := ifelse(contact==1, pmin(1,dTIR+0.03), dTIR)]
  
  # Models
  m_iv_s <- lm_robust(dTIR_adj ~ contact, data = sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
  m_iv_wc_s <- lm_robust(dTIR_adj ~ contact + dTIR_pred, data = sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
  
  out_dt_nc = data.table(
    n_pats = n_pats,
    n_weeks = n_weeks,
    n_contacts = sum(sim_dt$contact),
    type = "not controlling for expected dTIR",
    coef = summary(m_iv_s)$coefficients["contact","Estimate"],
    se = summary(m_iv_s)$coefficients["contact","Std. Error"],
    p = summary(m_iv_s)$coefficients["contact","Pr(>|t|)"])
  
  out_dt_c = data.table(
    n_pats = n_pats,
    n_weeks = n_weeks,
    n_contacts = sum(sim_dt$contact),
    type = "controlling for expected dTIR",
    coef = summary(m_iv_wc_s)$coefficients["contact","Estimate"],
    se = summary(m_iv_wc_s)$coefficients["contact","Std. Error"],
    p = summary(m_iv_wc_s)$coefficients["contact","Pr(>|t|)"])
  
  return(rbind(out_dt_nc,out_dt_c))
}


# test_power_random_sample(full_dt,20,12,1)


random_results = rbindlist(lapply(1:nrow(test_grid), function(i) {
  rbindlist(lapply(1:N_SIMS, function(s) test_power_random_sample(full_dt, test_grid$n_pats[i], test_grid$n_weeks[i], s)))
}))

random_results[, stat_sig := p < alpha]
random_results_agg = random_results[,.(
  power=mean(stat_sig, na.rm=T),
  m_coef = mean(coef),
  m_n_contacts = mean(n_contacts)), by=c('n_pats','n_weeks','type')]

ggplot(random_results_agg, aes(x=n_weeks, color=factor(n_pats), y=power, linetype=type)) + geom_line() + geom_point() + theme_bw() + geom_hline(yintercept = 0.5, linetype="dashed") + ylim(0,1)

Interesting relationship between N patients and power, all for 12 weeks and with control

n_pats_grid = expand.grid(
  n_pats = c(10,20,30,40,50,60,70,80,90),
  n_weeks = c(12)
)

smart_results_n_pats_grid = rbindlist(lapply(1:nrow(n_pats_grid), function(i) {
  rbindlist(lapply(1:N_SIMS, function(s) test_power_smart_sample(full_dt, n_pats_grid$n_pats[i], n_pats_grid$n_weeks[i], s)))
}))

smart_results_n_pats_grid[, stat_sig := p < alpha]
smart_results_n_pats_grid_agg = smart_results_n_pats_grid[,.(
  power=mean(stat_sig, na.rm=T),
  m_coef = mean(coef),
  m_n_contacts = mean(n_contacts)), by=c('n_pats','n_weeks','type')]
# smart_results_n_pats_grid_agg

ggplot(smart_results_n_pats_grid_agg[type=="controlling for expected dTIR"], aes(x=n_pats, y=power)) + geom_line() + geom_point() + theme_bw() + ylim(0,1)