library(data.table)
library(ggplot2)
library(gridExtra)
library(boot)
library(knitr)
library(printr)
R = 500  # number of bootstrap replicates

Testing Off-Policy Policy Evaluation Methods with TIDE Data

Implementing five off-policy evaluation estimators from Thomas and Brunskill (ICML 2016): importance sampling (IS), weighted importance sampling (WIS), a model-based estimator (MB), doubly robust (DR), and weighted doubly robust (WDR).
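
Each row is treated as a single-decision (contextual bandit) episode: a state $s$, a binary action $a$, and a reward $r$. All five estimators target the value of an evaluation policy $\pi_e$ using data logged under the behavior policy $\pi_b$,

$V(\pi_e) = \mathbb{E}_{s,\, a \sim \pi_e(\cdot \mid s)}[\, r \,],$

and each estimate is reported below as a difference relative to the null (never-review) policy.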

Load TIDE data

Action: the patient is shown to CDEs for review (eligible_for_review)

Reward: two-week change in time in range (dTIR_2w)

State: CGM summary statistics, e.g. lagged TIR (TIR_lag) and lagged sensor wear (p_worn_lag)

sim_dt = fread('~/Google Drive/My Drive/Stanford/CGM/sim_dt.csv')

eval_dt = sim_dt[in_first_year_since_onset==0 & !is.na(dTIR_2w) & !is.na(TIR_lag)]

eval_dt[,.(n_patients=uniqueN(pat_id), n_obs=.N, mean_eligible=mean(eligible_for_review), n_messages_sent=sum(tide_message_sent))]
 n_patients  n_obs  mean_eligible  n_messages_sent
         78   2740      0.2481752              236

Policy functions

# behavior policy: state-independent propensity equal to the empirical review rate
behavior_policy = function(x, a) {
  ifelse(a==1, mean(eval_dt$eligible_for_review), (1-mean(eval_dt$eligible_for_review)))
}

# never review
null_policy = function(x, a) {
  ifelse(a==1, 0, 1)
}

# review iff lagged TIR is at or above 65%
tir_above_65_policy =  function(x, a) {
  ifelse((x$TIR_lag>=0.65) == a, 1, 0)
}

# review iff lagged TIR is below 65%
tir_below_65_policy =  function(x, a) {
  ifelse((x$TIR_lag<0.65) == a, 1, 0)
}

# review iff lagged TIR is below 65% and lagged sensor wear is above 50%
tir_below_65_and_pw_above_50_policy = function(x, a) {
  ifelse((x$TIR_lag<0.65 & x$p_worn_lag>0.5) == a, 1, 0)
}

# always review
always_treat_policy = function(x, a) {
  ifelse(a==1, 1, 0)
}

# list of policy functions to evaluate later
policy_fns = list(
 "null" = null_policy,
 "behavioral" = behavior_policy,
 "tir>=65%" = tir_above_65_policy,
 "tir<65%" = tir_below_65_policy,
 "tir<65% & pw>50%" = tir_below_65_and_pw_above_50_policy,
 "always treat" = always_treat_policy
)
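
Each policy function returns $\pi(a \mid x)$ for the supplied action; deterministic policies return 0 or 1. A quick illustration on a hypothetical state (toy_state is made up for this example, not part of the dataset):

toy_state = data.table(TIR_lag = 0.50, p_worn_lag = 0.80)
tir_below_65_policy(toy_state, a = 1)  # 1: this policy reviews when lagged TIR < 65%
tir_above_65_policy(toy_state, a = 1)  # 0: this policy would not review here
behavior_policy(toy_state, a = 1)      # ~0.248: the empirical review rate, independent of state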

Importance Sampling
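
With per-row weight $w_i = \pi_e(a_i \mid s_i) / \pi_b(a_i \mid s_i)$, is_eval below returns the plain (unnormalized) importance sampling estimate

$\hat{V}_{IS} = \frac{1}{n} \sum_{i=1}^{n} w_i \, r_i, \qquad r_i = \mathrm{dTIR\_2w}_i,$

together with the (expected) number of observed reviews the evaluation policy would also have assigned.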

is_eval = function(d, p) {
  
  d[, behavior_policy_prob := behavior_policy(d, d$eligible_for_review)]
  d[, eval_policy_prob := p(d, d$eligible_for_review)]
  
  c(d[, mean(dTIR_2w*(eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$eligible_for_review))
}

# test
# for (pn in names(policy_fns)) {
#   print(pn)
#   print(is_eval(eval_dt, policy_fns[[pn]]))
# }

boostrap_is_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = is_eval(boot_d, null_policy)
  eval_policy_reward = is_eval(boot_d, p)
  
  return((eval_policy_reward[1] - null_policy_reward[1]))
}

# test
# for (pn in names(policy_fns)) {
#   print(pn)
#   print(boostrap_is_eval(eval_dt, seq(1,nrow(eval_dt)), policy_fns[[pn]]))
# }

# Bootstrapped results
is_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, boostrap_is_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))


# plot
invisible(is_bs_results[, policy := reorder(is_bs_results$policy, is_bs_results$delta, FUN = mean)])

ggplot(is_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("IS results")

Weighted Importance Sampling
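
wis_eval uses the same weights but self-normalizes them (this is what weighted.mean computes), trading a small bias for lower variance:

$\hat{V}_{WIS} = \frac{\sum_i w_i \, r_i}{\sum_i w_i}$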

wis_eval = function(d, p) {
  
  d[, behavior_policy_prob := behavior_policy(d, d$eligible_for_review)]
  d[, eval_policy_prob := p(d, d$eligible_for_review)]
  
  c(d[, weighted.mean(dTIR_2w, (eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$eligible_for_review))
}

boostrap_wis_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = wis_eval(boot_d, null_policy)
  eval_policy_reward = wis_eval(boot_d, p)
  
  return((eval_policy_reward[1] - null_policy_reward[1]))
}


# Bootstrapped results
wis_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, boostrap_wis_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))


# plot
invisible(wis_bs_results[, policy := reorder(wis_bs_results$policy, wis_bs_results$delta, FUN = mean)])

ggplot(wis_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("WIS results")

merge(wis_bs_results[,.(mean_d=mean(delta), sd=sd(delta)), by='policy'], 
      is_bs_results[,.(mean_d=mean(delta), sd=sd(delta)), by='policy'], 
      by='policy', suffixes=c(".wis",".is"))
policy              mean_d.wis     sd.wis  mean_d.is      sd.is
null                 0.0000000  0.0000000  0.0000000  0.0000000
tir>=65%             0.0019151  0.0029331  0.0017792  0.0032145
behavioral           0.0022759  0.0012396  0.0023940  0.0013092
tir<65%              0.0072975  0.0040717  0.0072922  0.0040847
tir<65% & pw>50%     0.0085750  0.0036644  0.0081592  0.0035622
always treat         0.0090921  0.0050805  0.0092650  0.0052109

Model-Based Estimator

The model-based estimator needs an outcome model: a linear regression of dTIR_2w on the action (eligible_for_review) interacted with binarized state features.
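
model_based_eval below then scores each row with the fitted model at the action the evaluation policy would take and averages the predictions:

$\hat{V}_{MB} = \frac{1}{n} \sum_i \hat{r}(s_i, a_i^{\pi_e}), \qquad a_i^{\pi_e} \sim \pi_e(\cdot \mid s_i),$

where $\hat{r}$ is the linear model fit in the next chunk.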

# simple predictors for model 
invisible(eval_dt[, TIR_lag_below_65 := ifelse(TIR_lag<0.65,1,0)])
invisible(eval_dt[, p_worn_lag_above_50 := ifelse(p_worn_lag>0.50,1,0)])

lm_all = lm(dTIR_2w ~ eligible_for_review*(TIR_lag_below_65*p_worn_lag_above_50), eval_dt) 

summary(lm_all)
## 
## Call:
## lm(formula = dTIR_2w ~ eligible_for_review * (TIR_lag_below_65 * 
##     p_worn_lag_above_50), data = eval_dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59256 -0.06053  0.00627  0.06908  0.69575 
## 
## Coefficients:
##                                                           Estimate Std. Error
## (Intercept)                                              -0.064174   0.018022
## eligible_for_review                                       0.009082   0.041490
## TIR_lag_below_65                                          0.120291   0.021551
## p_worn_lag_above_50                                       0.036445   0.018375
## TIR_lag_below_65:p_worn_lag_above_50                     -0.068949   0.022229
## eligible_for_review:TIR_lag_below_65                     -0.021990   0.048401
## eligible_for_review:p_worn_lag_above_50                  -0.006658   0.042105
## eligible_for_review:TIR_lag_below_65:p_worn_lag_above_50  0.039599   0.049608
##                                                          t value Pr(>|t|)    
## (Intercept)                                               -3.561 0.000376 ***
## eligible_for_review                                        0.219 0.826747    
## TIR_lag_below_65                                           5.582 2.62e-08 ***
## p_worn_lag_above_50                                        1.983 0.047416 *  
## TIR_lag_below_65:p_worn_lag_above_50                      -3.102 0.001943 ** 
## eligible_for_review:TIR_lag_below_65                      -0.454 0.649628    
## eligible_for_review:p_worn_lag_above_50                   -0.158 0.874375    
## eligible_for_review:TIR_lag_below_65:p_worn_lag_above_50   0.798 0.424798    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1182 on 2732 degrees of freedom
## Multiple R-squared:  0.06454,    Adjusted R-squared:  0.06215 
## F-statistic: 26.93 on 7 and 2732 DF,  p-value: < 2.2e-16
model_based_eval = function(d, p) {
  
  d[, eval_policy_a := p(d, 1)]
  
  # need to sample actions [does nothing now, except for non-deterministic behavioral policy]
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,eligible_for_review:=0]
  d_a1 = copy(d)
  d_a1[,eligible_for_review:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-predicted outcomes
  mean(d$pred_based_on_policy)
}

for (pn in names(policy_fns)) {
  print(pn)
  print(model_based_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "null"
## [1] -0.003724238
## [1] "behavioral"
## [1] -0.00141676
## [1] "tir>=65%"
## [1] -0.002265588
## [1] "tir<65%"
## [1] 0.00376931
## [1] "tir<65% & pw>50%"
## [1] 0.004377022
## [1] "always treat"
## [1] 0.005227959
boostrap_mb_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = model_based_eval(boot_d, null_policy)
  eval_policy_reward = model_based_eval(boot_d, p)
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
mb_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, boostrap_mb_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))


# plot
invisible(mb_bs_results[, policy := reorder(mb_bs_results$policy, mb_bs_results$delta, FUN = mean)])

ggplot(mb_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("MB results")

Doubly Robust Estimator (similar to Augmented Inverse Propensity Weighting)
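
dr_eval below starts from the IS estimate and subtracts a model-based control variate built from the outcome-model prediction under the evaluation-policy action, $\hat{r}_i^{\pi_e}$:

$\hat{V}_{DR} = \hat{V}_{IS} - \frac{1}{n} \sum_i \left( w_i \, \hat{r}_i^{\pi_e} - \hat{r}_i^{\pi_e} \right)$

Because $\mathbb{E}[w_i \mid s_i] = 1$ when the behavior-policy probabilities are correct, the subtracted term has expectation zero, but it can absorb variance in the IS term when the outcome model is informative.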

dr_eval = function(d, p) {
  
  is = is_eval(d, p)[1]
  
  d[, behavior_policy_prob := behavior_policy(d, d$eligible_for_review)]
  d[, eval_policy_prob := p(d, d$eligible_for_review)]
  d[, weight := (eval_policy_prob/behavior_policy_prob)]
  
  d[, eval_policy_a := p(d, 1)]
  # need to sample actions [does nothing now, except for non-deterministic behavioral policy]
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,eligible_for_review:=0]
  d_a1 = copy(d)
  d_a1[,eligible_for_review:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-based adjustment
  d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
  
  c(is - mean(d$model_adj), is, mean(d$model_adj))
  
}

for (pn in names(policy_fns)) {
  print(pn)
  print(dr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "null"
## [1] -3.724238e-03 -3.757565e-03 -3.332691e-05
## [1] "behavioral"
## [1] -0.001467469 -0.001467469  0.000000000
## [1] "tir>=65%"
## [1] -0.0022655880 -0.0020539535  0.0002116345
## [1] "tir<65%"
## [1]  3.769310e-03  3.766561e-03 -2.749015e-06
## [1] "tir<65% & pw>50%"
## [1] 0.004377022 0.004647951 0.000270929
## [1] "always treat"
## [1] 0.0052279593 0.0054701718 0.0002422124
boostrap_dr_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = dr_eval(boot_d, null_policy)[1]
  eval_policy_reward = dr_eval(boot_d, p)[1]
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
dr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, boostrap_dr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))


# plot
invisible(dr_bs_results[, policy := reorder(dr_bs_results$policy, dr_bs_results$delta, FUN = mean)])

ggplot(dr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("DR results")

Weighted Doubly Robust Estimator
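
wdr_eval applies the same model-based adjustment to the WIS estimate, but with weights normalized to sum to one, $\tilde{w}_i = w_i / \sum_j w_j$:

$\hat{V}_{WDR} = \hat{V}_{WIS} - \frac{1}{n} \sum_i \left( \tilde{w}_i \, \hat{r}_i^{\pi_e} - \hat{r}_i^{\pi_e} \right)$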

wdr_eval = function(d, p) {
  
  wis = wis_eval(d, p)[1]
  
  d[, behavior_policy_prob := behavior_policy(d, d$eligible_for_review)]
  d[, eval_policy_prob := p(d, d$eligible_for_review)]
  d[, weight := (eval_policy_prob/behavior_policy_prob) / sum(d$eval_policy_prob/d$behavior_policy_prob)]
  
  d[, eval_policy_a := p(d, 1)]
  # need to sample actions [does nothing now, except for non-deterministic behavioral policy]
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,eligible_for_review:=0]
  d_a1 = copy(d)
  d_a1[,eligible_for_review:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-based adjustment
  d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
  
  c(wis - mean(d$model_adj), wis, mean(d$model_adj))
  
}

for (pn in names(policy_fns)) {
  print(pn)
  print(wdr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "null"
## [1] -0.007480431 -0.003757565  0.003722866
## [1] "behavioral"
## [1] -0.002890040 -0.001467469  0.001422570
## [1] "tir>=65%"
## [1] -0.004322845 -0.002058008  0.002264837
## [1] "tir<65%"
## [1]  0.007527092  0.003759154 -0.003767938
## [1] "tir<65% & pw>50%"
## [1]  0.008987010  0.004611671 -0.004375339
## [1] "always treat"
## [1]  0.010696135  0.005470172 -0.005225963
boostrap_wdr_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = wdr_eval(boot_d, null_policy)[1]
  eval_policy_reward = wdr_eval(boot_d, p)[1]
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
wdr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, boostrap_wdr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))


# plot
invisible(wdr_bs_results[, policy := reorder(wdr_bs_results$policy, wdr_bs_results$delta, FUN = mean)])

ggplot(wdr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("WDR results")

# Combine results

combo_bs_results = rbind(
  is_bs_results[,method:="IS"],
  wis_bs_results[,method:="WIS"],
  mb_bs_results[,method:="MB"],
  dr_bs_results[,method:="DR"],
  wdr_bs_results[,method:="WDR"]
)

combo_bs_results$method = factor(combo_bs_results$method, levels=c("MB","IS","WIS","DR","WDR"))
combo_bs_results$policy = factor(combo_bs_results$policy, levels=c("null","tir>=65%","behavioral","tir<65%", "tir<65% & pw>50%", "always treat"))

ggplot(combo_bs_results, aes(x=method, y=delta, color=method)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + facet_wrap(~policy) +
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.025),
    fun.max = function(z) quantile(z,0.975),
    geom = "pointrange") + ggtitle("All results")

combo_bs_results[, .(m=mean(delta), sd=sd(delta)), by=c('policy','method')][order(policy,method)]
policy            method          m         sd
null              MB      0.0000000  0.0000000
null              IS      0.0000000  0.0000000
null              WIS     0.0000000  0.0000000
null              DR      0.0000000  0.0000000
null              WDR     0.0000000  0.0000000
tir>=65%          MB      0.0014584  0.0000308
tir>=65%          IS      0.0017792  0.0032145
tir>=65%          WIS     0.0019151  0.0029331
tir>=65%          DR      0.0011580  0.0029931
tir>=65%          WDR     0.0031738  0.0031455
behavioral        MB      0.0022281  0.0001160
behavioral        IS      0.0023940  0.0013092
behavioral        WIS     0.0022759  0.0012396
behavioral        DR      0.0023889  0.0012333
behavioral        WDR     0.0045855  0.0012545
tir<65%           MB      0.0074890  0.0002074
tir<65%           IS      0.0072922  0.0040847
tir<65%           WIS     0.0072975  0.0040717
tir<65%           DR      0.0073906  0.0037251
tir<65%           WDR     0.0152271  0.0040793
tir<65% & pw>50%  MB      0.0081119  0.0001821
tir<65% & pw>50%  IS      0.0081592  0.0035622
tir<65% & pw>50%  WIS     0.0085750  0.0036644
tir<65% & pw>50%  DR      0.0081148  0.0035909
tir<65% & pw>50%  WDR     0.0164977  0.0036536
always treat      MB      0.0089463  0.0001872
always treat      IS      0.0092650  0.0052109
always treat      WIS     0.0090921  0.0050805
always treat      DR      0.0088771  0.0048392
always treat      WDR     0.0178158  0.0049360
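
For a tabular companion to the interval plot, a sketch that reports the same bootstrap percentile intervals as the error bars (ci_dt is just an illustrative name):

# 95% bootstrap percentile intervals per policy and method
ci_dt = combo_bs_results[, .(
  mean_delta = mean(delta),
  lo_2.5 = quantile(delta, 0.025),
  hi_97.5 = quantile(delta, 0.975)
), by = .(policy, method)][order(policy, method)]
ci_dt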