library(ggplot2)
library(data.table)
library(grf)
library(ivreg)
library(estimatr)
full_dt = fread("~/Downloads/CGM/deid_daily_timeseries.csv")
full_dt = full_dt[pop == "Pilot"]
# Get lags and leads
setorder(full_dt, pop, pat_id, obs_id)
full_dt[,`:=`(
TIR_7dr = frollmean(TIR, 7, na.rm=T, hasNA=TRUE),
mean_glucose_7dr = frollmean(mean_glucose, 7, na.rm=T, hasNA=TRUE),
hyper_7dr = frollmean(hyper, 7, na.rm=T, hasNA=TRUE),
hypo1_7dr = frollmean(hypo1, 7, na.rm=T, hasNA=TRUE),
hypo2_7dr = frollmean(hypo2, 7, na.rm=T, hasNA=TRUE),
p_worn_7dr = frollmean(p_worn, 7),
clinic_contact_last_7_days = frollapply(clinic_remote_contact, 7, function(x) {max(x,na.rm=T)}),
visit_last_7_days = frollapply(visit, 7, function(x) {max(x,na.rm=T)})
), by=c('pop','pat_id')]
full_dt[, `:=`(
TIR_7dr_7d_lead = shift(TIR_7dr, 7, type="lead"),
mean_glucose_7dr_7d_lead = shift(mean_glucose_7dr, 7, type="lead"),
p_worn_7dr_7d_lead = shift(p_worn_7dr, 7, type="lead")
), by=c('pop','pat_id')]
full_dt = full_dt[p_worn_7dr>0.5 & p_worn_7dr_7d_lead>0.5]
full_dt[, dTIR := TIR_7dr_7d_lead - TIR_7dr]
full_dt[, dMG := mean_glucose_7dr_7d_lead - mean_glucose_7dr]
full_dt[, dTIR_lag7 := shift(dTIR, 7, type="lag"), by = c('pop','pat_id')]
full_dt = full_dt[!is.na(dTIR) & !is.na(dMG) & !is.na(dTIR_lag7)]
full_dt[is.na(months_since_onset), months_since_onset:=-1]
full_dt[is.na(age), age:=-1]
full_dt[, week_idx_from_end := frank(-obs_week_id, ties.method = "dense"), by='pat_id']
full_dt[, weekday_id := as.numeric(as.factor(weekday))]
Use N weeks to sample observations for RF to make sure there’s no temporal leakage
# n_weeks is the number of weeks experiment will run, and we don't use those observations to fit the expected outcome model
get_rf_model_for_n_weeks = function(n_weeks, dt) {
print(n_weeks)
reg_dt = dt[epic_read==0 & week_idx_from_end > n_weeks]
reg_X = model.matrix(~ -1 + age + pump + pat_id + public_insurance + sexF + poc + hisp + black + asian + nepl + dka_onset + antibody_antiins + weekday + months_since_onset +
TIR_7dr + mean_glucose_7dr + hyper_7dr + hypo1_7dr + hypo2_7dr + p_worn_7dr + dTIR_lag7, data=reg_dt)
rf = regression_forest(
X = reg_X,
Y = reg_dt[,dTIR],
clusters = reg_dt[, as.factor(pat_id)],
# tune.parameters = "all",
# From tuning
sample.fraction = 0.2627555,
mtry = 11,
min.node.size = 2275,
honesty.fraction = 0.59910440,
honesty.prune.leaves = FALSE,
alpha = 0.07020127,
imbalance.penalty = 0.91454407,
# Seed for reproducibility
seed = 1814
)
# print r-sq
print(1-sd(reg_dt[,dTIR] - rf$predictions)/sd(reg_dt[,dTIR]))
return(rf)
}
# fit all the models
N_WEEKS = c(6,12,26,52)
rf_models = lapply(N_WEEKS, get_rf_model_for_n_weeks, full_dt)
## [1] 6
## [1] 0.082347
## [1] 12
## [1] 0.08123004
## [1] 26
## [1] 0.07887717
## [1] 52
## [1] 0.07055156
# rf_models
P(contact | TIR) models
get_p_contact_model_for_n_weeks = function(n_weeks, dt) {
return(loess(epic_read ~ TIR_7dr, data=dt[months_since_onset<=12 & week_idx_from_end > n_weeks]) )
}
# fit all the models
pc_models = lapply(N_WEEKS, get_p_contact_model_for_n_weeks, full_dt)
pc_models
## [[1]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <=
## 12 & week_idx_from_end > n_weeks])
##
## Number of Observations: 33964
## Equivalent Number of Parameters: 4.88
## Residual Standard Error: 0.2352
##
## [[2]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <=
## 12 & week_idx_from_end > n_weeks])
##
## Number of Observations: 33769
## Equivalent Number of Parameters: 4.88
## Residual Standard Error: 0.2356
##
## [[3]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <=
## 12 & week_idx_from_end > n_weeks])
##
## Number of Observations: 32988
## Equivalent Number of Parameters: 4.85
## Residual Standard Error: 0.2359
##
## [[4]]
## Call:
## loess(formula = epic_read ~ TIR_7dr, data = dt[months_since_onset <=
## 12 & week_idx_from_end > n_weeks])
##
## Number of Observations: 27835
## Equivalent Number of Parameters: 4.84
## Residual Standard Error: 0.2269
test_power_smart_sample = function(dt, n_pats, n_weeks, s) {
# P(contact | TIR) model
loess_fit = pc_models[[which(N_WEEKS==n_weeks)]]
# Get RF model and add preds
rfm = rf_models[[which(N_WEEKS==n_weeks)]]
# add preds
mm_pred = model.matrix(~ -1 + age + pump + pat_id + public_insurance + sexF + poc + hisp + black + asian + nepl + dka_onset + antibody_antiins + weekday + months_since_onset +
TIR_7dr + mean_glucose_7dr + hyper_7dr + hypo1_7dr + hypo2_7dr + p_worn_7dr + dTIR_lag7, data=dt[epic_read==1 | week_idx_from_end <= n_weeks])
dt[epic_read==0 & week_idx_from_end > n_weeks, dTIR_pred := rfm$predictions]
dt[epic_read==1 | week_idx_from_end <= n_weeks, dTIR_pred := predict(rfm, mm_pred)$predictions]
qualification_dt = dt[week_idx_from_end > n_weeks, .(
p_tir_below_65p = mean(TIR_7dr<0.65),
m_sq_error_dTIR_pred = mean((dTIR_pred - dTIR)^2)
), by='pat_id']
qualification_dt = qualification_dt[,highly_variable := ifelse(p_tir_below_65p>0.25 & p_tir_below_65p < 0.75,1,0)]
setorder(qualification_dt, -highly_variable, m_sq_error_dTIR_pred)
pat_ids = qualification_dt$pat_id[1:n_pats]
smart_small_sim_dt = copy(dt[pat_id %in% pat_ids & week_idx_from_end <= n_weeks])
# Sample contacts
smart_small_sim_dt[, p_contact := predict(loess_fit, smart_small_sim_dt)]
smart_small_sim_dt[, contact_day := (pat_id+weekday_id) %% 7 == 0]
set.seed(s); smart_small_sim_dt[contact_day==1 & TIR_7dr<0.65, contact_eligible := rbinom(.N, 1, 0.5)]
set.seed(s); smart_small_sim_dt[contact_eligible==1, contact := rbinom(.N, 1, p_contact*7)]
smart_small_sim_dt[is.na(contact), contact:=0]
# Apply treatment effect
smart_small_sim_dt[, dTIR_adj := ifelse(contact==1, pmin(1,dTIR+0.03), dTIR)]
# Models
m_iv_s <- lm_robust(dTIR_adj ~ contact, data = smart_small_sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
m_iv_wc_s <- lm_robust(dTIR_adj ~ contact + dTIR_pred, data = smart_small_sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
out_dt_nc = data.table(
n_pats = n_pats,
n_weeks = n_weeks,
n_contacts = sum(smart_small_sim_dt$contact),
type = "not controlling for expected dTIR",
coef = summary(m_iv_s)$coefficients["contact","Estimate"],
se = summary(m_iv_s)$coefficients["contact","Std. Error"],
p = summary(m_iv_s)$coefficients["contact","Pr(>|t|)"])
out_dt_c = data.table(
n_pats = n_pats,
n_weeks = n_weeks,
n_contacts = sum(smart_small_sim_dt$contact),
type = "controlling for expected dTIR",
coef = summary(m_iv_wc_s)$coefficients["contact","Estimate"],
se = summary(m_iv_wc_s)$coefficients["contact","Std. Error"],
p = summary(m_iv_wc_s)$coefficients["contact","Pr(>|t|)"])
return(rbind(out_dt_nc,out_dt_c))
}
# test_power_smart_sample(full_dt,20,12,1)
test_grid = expand.grid(
n_pats = c(10,20,30,50,90),
n_weeks = N_WEEKS
)
N_SIMS = 50
alpha = 0.05
smart_results = rbindlist(lapply(1:nrow(test_grid), function(i) {
rbindlist(lapply(1:N_SIMS, function(s) test_power_smart_sample(full_dt, test_grid$n_pats[i], test_grid$n_weeks[i], s)))
}))
smart_results[, stat_sig := p < alpha]
smart_results_agg = smart_results[,.(
power=mean(stat_sig, na.rm=T),
m_coef = mean(coef),
m_n_contacts = mean(n_contacts)), by=c('n_pats','n_weeks','type')]
# smart_results_agg
ggplot(smart_results_agg, aes(x=n_weeks, color=factor(n_pats), y=power, linetype=type)) + geom_line() + geom_point() + theme_bw() + geom_hline(yintercept = 0.5, linetype="dashed") + ylim(0,1)
Same but randomly sampling patients
test_power_random_sample = function(dt, n_pats, n_weeks, s) {
# P(contact | TIR) model
loess_fit = pc_models[[which(N_WEEKS==n_weeks)]]
# Get RF model and add preds
rfm = rf_models[[which(N_WEEKS==n_weeks)]]
# add preds
mm_pred = model.matrix(~ -1 + age + pump + pat_id + public_insurance + sexF + poc + hisp + black + asian + nepl + dka_onset + antibody_antiins + weekday + months_since_onset +
TIR_7dr + mean_glucose_7dr + hyper_7dr + hypo1_7dr + hypo2_7dr + p_worn_7dr + dTIR_lag7, data=dt[epic_read==1 | week_idx_from_end <= n_weeks])
dt[epic_read==0 & week_idx_from_end > n_weeks, dTIR_pred := rfm$predictions]
dt[epic_read==1 | week_idx_from_end <= n_weeks, dTIR_pred := predict(rfm, mm_pred)$predictions]
set.seed(s); pat_ids = sample(dt[week_idx_from_end > n_weeks , unique(pat_id)], n_pats)
sim_dt = copy(dt[pat_id %in% pat_ids & week_idx_from_end <= n_weeks])
# Sample contacts
sim_dt[, p_contact := predict(loess_fit, sim_dt)]
sim_dt[, contact_day := (pat_id+weekday_id) %% 7 == 0]
set.seed(s); sim_dt[contact_day==1 & TIR_7dr<0.65, contact_eligible := rbinom(.N, 1, 0.5)]
set.seed(s); sim_dt[contact_eligible==1, contact := rbinom(.N, 1, p_contact*7)]
sim_dt[is.na(contact), contact:=0]
# Apply treatment effect
sim_dt[, dTIR_adj := ifelse(contact==1, pmin(1,dTIR+0.03), dTIR)]
# Models
m_iv_s <- lm_robust(dTIR_adj ~ contact, data = sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
m_iv_wc_s <- lm_robust(dTIR_adj ~ contact + dTIR_pred, data = sim_dt[TIR_7dr<0.65 & contact_day==1], clusters = pat_id)
out_dt_nc = data.table(
n_pats = n_pats,
n_weeks = n_weeks,
n_contacts = sum(sim_dt$contact),
type = "not controlling for expected dTIR",
coef = summary(m_iv_s)$coefficients["contact","Estimate"],
se = summary(m_iv_s)$coefficients["contact","Std. Error"],
p = summary(m_iv_s)$coefficients["contact","Pr(>|t|)"])
out_dt_c = data.table(
n_pats = n_pats,
n_weeks = n_weeks,
n_contacts = sum(sim_dt$contact),
type = "controlling for expected dTIR",
coef = summary(m_iv_wc_s)$coefficients["contact","Estimate"],
se = summary(m_iv_wc_s)$coefficients["contact","Std. Error"],
p = summary(m_iv_wc_s)$coefficients["contact","Pr(>|t|)"])
return(rbind(out_dt_nc,out_dt_c))
}
# test_power_random_sample(full_dt,20,12,1)
random_results = rbindlist(lapply(1:nrow(test_grid), function(i) {
rbindlist(lapply(1:N_SIMS, function(s) test_power_random_sample(full_dt, test_grid$n_pats[i], test_grid$n_weeks[i], s)))
}))
random_results[, stat_sig := p < alpha]
random_results_agg = random_results[,.(
power=mean(stat_sig, na.rm=T),
m_coef = mean(coef),
m_n_contacts = mean(n_contacts)), by=c('n_pats','n_weeks','type')]
ggplot(random_results_agg, aes(x=n_weeks, color=factor(n_pats), y=power, linetype=type)) + geom_line() + geom_point() + theme_bw() + geom_hline(yintercept = 0.5, linetype="dashed") + ylim(0,1)
Interesting relationship between N patients and power, all for 12 weeks and with control
n_pats_grid = expand.grid(
n_pats = c(10,20,30,40,50,60,70,80,90),
n_weeks = c(12)
)
smart_results_n_pats_grid = rbindlist(lapply(1:nrow(n_pats_grid), function(i) {
rbindlist(lapply(1:N_SIMS, function(s) test_power_smart_sample(full_dt, n_pats_grid$n_pats[i], n_pats_grid$n_weeks[i], s)))
}))
smart_results_n_pats_grid[, stat_sig := p < alpha]
smart_results_n_pats_grid_agg = smart_results_n_pats_grid[,.(
power=mean(stat_sig, na.rm=T),
m_coef = mean(coef),
m_n_contacts = mean(n_contacts)), by=c('n_pats','n_weeks','type')]
# smart_results_n_pats_grid_agg
ggplot(smart_results_n_pats_grid_agg[type=="controlling for expected dTIR"], aes(x=n_pats, y=power)) + geom_line() + geom_point() + theme_bw() + ylim(0,1)