library(data.table)
library(ggplot2)
library(gridExtra)
library(boot)
library(knitr)
library(printr)
R = 500  # number of bootstrap replicates used throughout

Testing Off-Policy Policy Evaluation Methods with Toy Data

Implementing five off-policy evaluation estimators from Thomas and Brunskill (ICML 2016): importance sampling (IS), weighted importance sampling (WIS), a model-based estimator (MB), doubly robust (DR), and weighted doubly robust (WDR).

Simulate toy example MDP data

  • Two states: {0,1} equally likely
  • Two actions: {0,1} equally likely in behavioral policy
  • \(Q(s,a) = 2 \cdot 1[s=0,\ a=1] - 1[a=1] + e\)
  • \(e \sim N(0, 4^2)\), i.e. mean 0 and standard deviation 4
N = 1000
states = rbinom(N, 1, 0.5)
actions = rbinom(N, 1, 0.5)
errors = rnorm(N, 0, 4)
rewards = 2*(states==0 & actions==1) - (actions==1) + errors

eval_dt = data.table(
  state = states,
  action = actions,
  reward = rewards
)

Policy functions

# probability the behavior policy assigns to action a; uses the empirical
# action rate from the full eval_dt (~0.5 by construction) and ignores x
behavior_policy = function(x, a) {
  ifelse(a==1, mean(eval_dt$action), (1-mean(eval_dt$action)))
}

# never treat
null_policy = function(x, a) {
  ifelse(a==1, 0, 1)
}

# treat exactly when state == 1 (the worst policy here: it pays the action cost without ever earning the +2)
action_equal_state_policy = function(x, a) {
  ifelse(x$state == a, 1, 0)
}

# always treat
always_treat_policy = function(x, a) {
  ifelse(a==1, 1, 0)
}

# treat exactly when state == 0 (the only case where treatment pays off)
optimal_policy = function(x, a) {
  ifelse((a==1 & x$state==0) | (a==0 & x$state==1), 1, 0)
}

# list of policy functions to evaluate later
policy_fns = list(
 "never" = null_policy,
 "behavioral" = behavior_policy,
 "worst" = action_equal_state_policy,
 "always" = always_treat_policy,
 "optimal" = optimal_policy
)

We can calculate the true expected reward of each policy by hand; a programmatic cross-check follows the table below.

# rewards = 2*(states==0 & actions==1) - (actions==1) + errors

true_expected_rewards = rbind(
  data.table(policy="never", expected_reward = 0),
  data.table(policy="behavioral", expected_reward = 0), # acts randomly: expected reward = 0.25*2 + 0.5*(-1) = 0
  data.table(policy="worst", expected_reward = -0.5), # only takes action=1 when state=1, so expected reward = 0.5*(-1) = -0.5
  data.table(policy="always", expected_reward = 0), # expected reward = 0.5*2 + 1*(-1) = 0
  data.table(policy="optimal", expected_reward = 0.5) # expected reward = 0.5*2 + 0.5*(-1) = 0.5
)

true_expected_rewards
policy       expected_reward
never                    0.0
behavioral               0.0
worst                   -0.5
always                   0.0
optimal                  0.5
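
As a cross-check, the same values can be computed directly from the policy functions by enumerating the four (state, action) pairs. This is a minimal sketch that assumes the reward model above; mean_reward is a helper defined only for this check, and the behavioral entry uses the empirical action rate from eval_dt, so it comes out only approximately 0.

# mean reward given (s, a), from the simulation: 2*1[s==0 & a==1] - 1[a==1]
mean_reward = function(s, a) 2*(s==0 & a==1) - (a==1)

grid = data.table(expand.grid(state=c(0,1), action=c(0,1)))
sapply(policy_fns, function(p) {
  # P(state) = 0.5 for each state; the policy function returns P(action | state)
  sum(0.5 * p(grid, grid$action) * mean_reward(grid$state, grid$action))
})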

Importance Sampling (IS) evaluation
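
In this one-step setting the IS estimate is simply the average reward, with each observation reweighted by the ratio of the evaluation policy's action probability to the behavior policy's; up to the empirical estimate of \(\pi_b\), this is what is_eval computes below:

\[ \hat{V}_{IS}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} \frac{\pi_e(a_i \mid s_i)}{\pi_b(a_i \mid s_i)}\, r_i \]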

is_eval = function(d, p) {
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  
  # return the IS estimate plus a diagnostic: the eval policy's probability mass on the observed action=1 rows
  c(d[, mean(reward*(eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$action))
}

# test
# for (pn in names(policy_fns)) {
#   print(pn)
#   print(is_eval(eval_dt, policy_fns[[pn]]))
# }

bootstrap_is_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = is_eval(boot_d, null_policy)
  eval_policy_reward = is_eval(boot_d, p)
  
  return((eval_policy_reward[1] - null_policy_reward[1]))
}

# test
# for (pn in names(policy_fns)) {
#   print(pn)
#   print(bootstrap_is_eval(eval_dt, seq(1,nrow(eval_dt)), policy_fns[[pn]]))
# }

# Bootstrapped results: each replicate reports the policy's estimated value minus the null policy's
# (the null policy's true value is 0, so the delta equals the policy's value)
is_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_is_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))

is_bs_results = merge(is_bs_results, true_expected_rewards, by='policy')

# plot
invisible(is_bs_results[, policy := reorder(is_bs_results$policy, is_bs_results$delta, FUN = mean)])

ggplot(is_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("IS results") +
  geom_point(aes(y=expected_reward), color="black")

Weighted Importance Sampling (WIS)
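
WIS self-normalizes the importance weights, trading a small bias toward the behavior policy for lower variance; in this setting it reduces to a weighted mean of the rewards:

\[ \hat{V}_{WIS}(\pi_e) = \frac{\sum_{i=1}^{n} w_i\, r_i}{\sum_{i=1}^{n} w_i}, \qquad w_i = \frac{\pi_e(a_i \mid s_i)}{\pi_b(a_i \mid s_i)} \]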

wis_eval = function(d, p) {
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  
  c(d[, weighted.mean(reward, (eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$action))
}

bootstrap_wis_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = wis_eval(boot_d, null_policy)
  eval_policy_reward = wis_eval(boot_d, p)
  
  return((eval_policy_reward[1] - null_policy_reward[1]))
}


# Bootstrapped results
wis_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_wis_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
wis_bs_results = merge(wis_bs_results, true_expected_rewards, by='policy')

# plot
invisible(wis_bs_results[, policy := reorder(wis_bs_results$policy, wis_bs_results$delta, FUN = mean)])

ggplot(wis_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("WIS results")  +
  geom_point(aes(y=expected_reward), color="black")

Model-Based (MB) Estimator

The model-based estimator needs an outcome model; here we fit a simple linear regression of reward on action, state, and their interaction.
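
The model-based estimate then averages the fitted model's predictions under the actions the evaluation policy would choose:

\[ \hat{V}_{MB}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} \hat{Q}\big(s_i, a_i^{\pi_e}\big), \qquad a_i^{\pi_e} \sim \pi_e(\cdot \mid s_i) \]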

# simple predictors for model 
lm_all = lm(reward ~ action*state, eval_dt) 

summary(lm_all)
## 
## Call:
## lm(formula = reward ~ action * state, data = eval_dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3230  -2.7626   0.0588   2.7262  11.6706 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.3799     0.2538  -1.497  0.13476    
## action         0.9734     0.3671   2.651  0.00814 ** 
## state          0.6170     0.3636   1.697  0.09004 .  
## action:state  -2.3079     0.5179  -4.456  9.3e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.092 on 996 degrees of freedom
## Multiple R-squared:  0.02401,    Adjusted R-squared:  0.02107 
## F-statistic: 8.168 on 3 and 996 DF,  p-value: 2.247e-05
model_based_eval = function(d, p) {
  
  # probability the eval policy assigns to action 1 in each state
  d[, eval_policy_a := p(d, 1)]
  
  # draw the eval policy's action (a no-op for the deterministic policies; only matters for the stochastic behavioral policy)
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,action:=0]
  d_a1 = copy(d)
  d_a1[,action:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-predicted outcomes
  mean(d$pred_based_on_policy)
}

# for (pn in names(policy_fns)) {
#   print(pn)
#   print(model_based_eval(eval_dt, policy_fns[[pn]]))
# }


bootstrap_mb_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = model_based_eval(boot_d, null_policy)
  eval_policy_reward = model_based_eval(boot_d, p)
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
mb_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_mb_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
mb_bs_results = merge(mb_bs_results, true_expected_rewards, by='policy')

# plot
invisible(mb_bs_results[, policy := reorder(mb_bs_results$policy, mb_bs_results$delta, FUN = mean)])

ggplot(mb_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("MB results") +
  geom_point(aes(y=expected_reward), color="black")

Doubly Robust (DR) Estimator (similar to Augmented Inverse Propensity Weighting)
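
In the one-step case, DR adds a model-based control variate to the IS estimate (the AIPW form, written here for a deterministic evaluation policy):

\[ \hat{V}_{DR}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} \Big[ w_i \big(r_i - \hat{Q}(s_i, a_i)\big) + \hat{Q}\big(s_i, \pi_e(s_i)\big) \Big], \qquad w_i = \frac{\pi_e(a_i \mid s_i)}{\pi_b(a_i \mid s_i)} \]

The implementation below substitutes the prediction under the evaluation policy's sampled action into both correction terms, which coincides with this expression whenever the evaluation policy is deterministic.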

dr_eval = function(d, p) {
  
  is = is_eval(d, p)[1]
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  d[, weight := (eval_policy_prob/behavior_policy_prob)]
  
  # probability the eval policy assigns to action 1, then draw the action itself
  # (a no-op for the deterministic policies; only matters for the stochastic behavioral policy)
  d[, eval_policy_a := p(d, 1)]
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,action:=0]
  d_a1 = copy(d)
  d_a1[,action:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-based adjustment
  d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
  
  # returns: the DR estimate, the plain IS estimate, and the model-based adjustment
  c(is - mean(d$model_adj), is, mean(d$model_adj))
  
}

for (pn in names(policy_fns)) {
  print(pn)
  print(dr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "never"
## [1] -0.070151398 -0.079295194 -0.009143796
## [1] "behavioral"
## [1] -0.178791 -0.178791  0.000000
## [1] "worst"
## [1] -0.74007238 -0.76242821 -0.02235583
## [1] "always"
## [1] -0.25534064 -0.28111217 -0.02577153
## [1] "optimal"
## [1]  0.41458034  0.40202084 -0.01255949
bootstrap_dr_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = dr_eval(boot_d, null_policy)[1]
  eval_policy_reward = dr_eval(boot_d, p)[1]
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
dr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_dr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
dr_bs_results = merge(dr_bs_results, true_expected_rewards, by='policy')

# plot
invisible(dr_bs_results[, policy := reorder(dr_bs_results$policy, dr_bs_results$delta, FUN = mean)])

ggplot(dr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("DR results") +
  geom_point(aes(y=expected_reward), color="black")

Weighted Doubly Robust (WDR) Estimator

WDR should have slightly lower variance than DR but, like WIS, is biased toward the behavior policy.
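
For reference, the one-step form of WDR replaces the \(1/n\) weights on the importance-sampled residuals with the self-normalized WIS weights \(\tilde{w}_i = w_i / \sum_j w_j\), keeping the same model term:

\[ \hat{V}_{WDR}(\pi_e) = \sum_{i=1}^{n} \tilde{w}_i \big(r_i - \hat{Q}(s_i, a_i)\big) + \frac{1}{n} \sum_{i=1}^{n} \hat{Q}\big(s_i, \pi_e(s_i)\big) \]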

wdr_eval = function(d, p) {
  
  wis = wis_eval(d, p)[1]
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  # self-normalized (WIS-style) weights: raw ratio divided by the sum of all ratios
  d[, weight := (eval_policy_prob/behavior_policy_prob) / sum(d$eval_policy_prob/d$behavior_policy_prob)]
  
  # probability the eval policy assigns to action 1, then draw the action itself
  # (a no-op for the deterministic policies; only matters for the stochastic behavioral policy)
  d[, eval_policy_a := p(d, 1)]
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,action:=0]
  d_a1 = copy(d)
  d_a1[,action:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-based adjustment
  d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
  
  # returns: the WDR estimate, the plain WIS estimate, and the model-based adjustment
  c(wis - mean(d$model_adj), wis, mean(d$model_adj))
  
}

for (pn in names(policy_fns)) {
  print(pn)
  print(wdr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "never"
## [1] -0.14936730 -0.07929519  0.07007210
## [1] "behavioral"
## [1] -0.3499849 -0.1787910  0.1711939
## [1] "worst"
## [1] -1.4795093 -0.7401771  0.7393322
## [1] "always"
## [1] -0.5361717 -0.2811122  0.2550595
## [1] "optimal"
## [1]  0.8286468  0.4144809 -0.4141659
bootstrap_wdr_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = wdr_eval(boot_d, null_policy)[1]
  eval_policy_reward = wdr_eval(boot_d, p)[1]
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
wdr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_wdr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
wdr_bs_results = merge(wdr_bs_results, true_expected_rewards, by='policy')

# plot
invisible(wdr_bs_results[, policy := reorder(wdr_bs_results$policy, wdr_bs_results$delta, FUN = mean)])

ggplot(wdr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("WDR results") +
  geom_point(aes(y=expected_reward), color="black")

Combine results

combo_bs_results = rbind(
  is_bs_results[,method:="IS"],
  wis_bs_results[,method:="WIS"],
  mb_bs_results[,method:="MB"],
  dr_bs_results[,method:="DR"],
  wdr_bs_results[,method:="WDR"]
)

combo_bs_results$method = factor(combo_bs_results$method, levels=c("MB","IS","WIS","DR","WDR"))

ggplot(combo_bs_results, aes(x=method, y=delta, color=method)) + theme_bw() + 
  facet_wrap(~policy) +
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.025),
    fun.max = function(z) quantile(z,0.975),
    geom = "pointrange") + ggtitle("All results") +
  geom_hline(aes(yintercept=expected_reward), color="black", linetype="dashed")

combo_bs_results[, .(m=mean(delta), sd=sd(delta)), by=c('policy','method')][order(policy,method)]
policy      method           m         sd
worst       MB      -0.6687786  0.0217680
worst       IS      -0.7115259  0.1846885
worst       WIS     -0.6507946  0.1836858
worst       DR      -0.6547978  0.1863244
worst       WDR     -1.3372767  0.1881958
always      MB      -0.1863755  0.0351102
always      IS      -0.1933815  0.2472037
always      WIS     -0.2028057  0.2674178
always      DR      -0.1867959  0.2503641
always      WDR     -0.3749602  0.2682160
behavioral  MB      -0.0933211  0.0260322
behavioral  IS      -0.1104266  0.1272894
behavioral  WIS     -0.0980019  0.1330066
behavioral  DR      -0.1079832  0.1205329
behavioral  WDR     -0.1824938  0.1339634
never       MB       0.0000000  0.0000000
never       IS       0.0000000  0.0000000
never       WIS      0.0000000  0.0000000
never       DR       0.0000000  0.0000000
never       WDR      0.0000000  0.0000000
optimal     MB       0.4852943  0.0162455
optimal     IS       0.4786080  0.1957667
optimal     WIS      0.5017712  0.1801794
optimal     DR       0.4844982  0.1828144
optimal     WDR      0.9765139  0.1896315