library(data.table)
library(ggplot2)
library(gridExtra)
library(boot)
library(knitr)
library(printr)
R = 500  # number of bootstrap replicates used throughout

Testing Off-Policy Policy Evaluation Methods with Toy Data

Implementing five off-policy evaluation estimators from Thomas and Brunskill (ICML 2016): importance sampling (IS), weighted importance sampling (WIS), a model-based estimator (MB), doubly robust (DR), and weighted doubly robust (WDR).

Simulate toy example MDP data

  • Two states: {0,1} equally likely
  • Two actions: {0,1} equally likely in behavioral policy
  • \(Q(s,a) = 2 \cdot 1[s=0,\ a=1] - 1[a=1] + e\)
  • \(e \sim N(0, 4^2)\), i.e. mean 0 and standard deviation 4
N = 1000
states = rbinom(N, 1, 0.5)
actions = rbinom(N, 1, 0.5)
errors = rnorm(N, 0, 4)
rewards = 2*(states==0 & actions==1) - (actions==1) + errors

eval_dt = data.table(
  state = states,
  action = actions,
  reward = rewards
)

Policy functions

# probability the behavior policy assigns to action a; uses the empirical
# action rate from the full eval_dt (~0.5 by construction) and ignores x
behavior_policy = function(x, a) {
  ifelse(a==1, mean(eval_dt$action), (1-mean(eval_dt$action)))
}

# never treat
null_policy = function(x, a) {
  ifelse(a==1, 0, 1)
}

# treat exactly when state == 1 (the worst policy here: it pays the action cost without ever earning the +2)
action_equal_state_policy = function(x, a) {
  ifelse(x$state == a, 1, 0)
}

# always treat
always_treat_policy = function(x, a) {
  ifelse(a==1, 1, 0)
}

# treat exactly when state == 0 (the only case where treatment pays off)
optimal_policy = function(x, a) {
  ifelse((a==1 & x$state==0) | (a==0 & x$state==1), 1, 0)
}

# list of policy functions to evaluate later
policy_fns = list(
 "never" = null_policy,
 "behavioral" = behavior_policy,
 "worst" = action_equal_state_policy,
 "always" = always_treat_policy,
 "optimal" = optimal_policy
)

We can calculate the true expected reward of each policy by hand; a programmatic cross-check follows the table below.

# rewards = 2*(states==0 & actions==1) - (actions==1) + errors

true_expected_rewards = rbind(
  data.table(policy="never", expected_reward = 0),
  data.table(policy="behavioral", expected_reward = 0), # acts randomly: expected reward = 0.25*2 + 0.5*(-1) = 0
  data.table(policy="worst", expected_reward = -0.5), # only takes action=1 when state=1, so expected reward = 0.5*(-1) = -0.5
  data.table(policy="always", expected_reward = 0), # expected reward = 0.5*2 + 1*(-1) = 0
  data.table(policy="optimal", expected_reward = 0.5) # expected reward = 0.5*2 + 0.5*(-1) = 0.5
)

true_expected_rewards
policy       expected_reward
never                    0.0
behavioral               0.0
worst                   -0.5
always                   0.0
optimal                  0.5
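
As a cross-check, the same values can be computed directly from the policy functions by enumerating the four (state, action) pairs. This is a minimal sketch that assumes the reward model above; mean_reward is a helper defined only for this check, and the behavioral entry uses the empirical action rate from eval_dt, so it comes out only approximately 0.

# mean reward given (s, a), from the simulation: 2*1[s==0 & a==1] - 1[a==1]
mean_reward = function(s, a) 2*(s==0 & a==1) - (a==1)

grid = data.table(expand.grid(state=c(0,1), action=c(0,1)))
sapply(policy_fns, function(p) {
  # P(state) = 0.5 for each state; the policy function returns P(action | state)
  sum(0.5 * p(grid, grid$action) * mean_reward(grid$state, grid$action))
})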

Importance Sampling (IS) evaluation
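
In this one-step setting the IS estimate is simply the average reward, with each observation reweighted by the ratio of the evaluation policy's action probability to the behavior policy's; up to the empirical estimate of \(\pi_b\), this is what is_eval computes below:

\[ \hat{V}_{IS}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} \frac{\pi_e(a_i \mid s_i)}{\pi_b(a_i \mid s_i)}\, r_i \]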

is_eval = function(d, p) {
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  
  # return the IS estimate plus a diagnostic: the eval policy's probability mass on the observed action=1 rows
  c(d[, mean(reward*(eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$action))
}

# test
# for (pn in names(policy_fns)) {
#   print(pn)
#   print(is_eval(eval_dt, policy_fns[[pn]]))
# }

bootstrap_is_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = is_eval(boot_d, null_policy)
  eval_policy_reward = is_eval(boot_d, p)
  
  return((eval_policy_reward[1] - null_policy_reward[1]))
}

# test
# for (pn in names(policy_fns)) {
#   print(pn)
#   print(bootstrap_is_eval(eval_dt, seq(1,nrow(eval_dt)), policy_fns[[pn]]))
# }

# Bootstrapped results: each replicate reports the policy's estimated value minus the null policy's
# (the null policy's true value is 0, so the delta equals the policy's value)
is_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_is_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))

is_bs_results = merge(is_bs_results, true_expected_rewards, by='policy')

# plot
invisible(is_bs_results[, policy := reorder(is_bs_results$policy, is_bs_results$delta, FUN = mean)])

ggplot(is_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("IS results") +
  geom_point(aes(y=expected_reward), color="black")

Weighted Importance Sampling (WIS)
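
WIS self-normalizes the importance weights, trading a small bias toward the behavior policy for lower variance; in this setting it reduces to a weighted mean of the rewards:

\[ \hat{V}_{WIS}(\pi_e) = \frac{\sum_{i=1}^{n} w_i\, r_i}{\sum_{i=1}^{n} w_i}, \qquad w_i = \frac{\pi_e(a_i \mid s_i)}{\pi_b(a_i \mid s_i)} \]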

wis_eval = function(d, p) {
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  
  c(d[, weighted.mean(reward, (eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$action))
}

bootstrap_wis_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = wis_eval(boot_d, null_policy)
  eval_policy_reward = wis_eval(boot_d, p)
  
  return((eval_policy_reward[1] - null_policy_reward[1]))
}


# Bootstrapped results
wis_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_wis_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
wis_bs_results = merge(wis_bs_results, true_expected_rewards, by='policy')

# plot
invisible(wis_bs_results[, policy := reorder(wis_bs_results$policy, wis_bs_results$delta, FUN = mean)])

ggplot(wis_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("WIS results")  +
  geom_point(aes(y=expected_reward), color="black")

Model-Based (MB) Estimator

The model-based estimator needs an outcome model; here we fit a simple linear regression of reward on action, state, and their interaction.
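
The model-based estimate then averages the fitted model's predictions under the actions the evaluation policy would choose:

\[ \hat{V}_{MB}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} \hat{Q}\big(s_i, a_i^{\pi_e}\big), \qquad a_i^{\pi_e} \sim \pi_e(\cdot \mid s_i) \]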

# simple predictors for model 
lm_all = lm(reward ~ action*state, eval_dt) 

summary(lm_all)
## 
## Call:
## lm(formula = reward ~ action * state, data = eval_dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3230  -2.7626   0.0588   2.7262  11.6706 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.3799     0.2538  -1.497  0.13476    
## action         0.9734     0.3671   2.651  0.00814 ** 
## state          0.6170     0.3636   1.697  0.09004 .  
## action:state  -2.3079     0.5179  -4.456  9.3e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.092 on 996 degrees of freedom
## Multiple R-squared:  0.02401,    Adjusted R-squared:  0.02107 
## F-statistic: 8.168 on 3 and 996 DF,  p-value: 2.247e-05
model_based_eval = function(d, p) {
  
  # probability the eval policy assigns to action 1 in each state
  d[, eval_policy_a := p(d, 1)]
  
  # draw the eval policy's action (a no-op for the deterministic policies; only matters for the stochastic behavioral policy)
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,action:=0]
  d_a1 = copy(d)
  d_a1[,action:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-predicted outcomes
  mean(d$pred_based_on_policy)
}

# for (pn in names(policy_fns)) {
#   print(pn)
#   print(model_based_eval(eval_dt, policy_fns[[pn]]))
# }


bootstrap_mb_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = model_based_eval(boot_d, null_policy)
  eval_policy_reward = model_based_eval(boot_d, p)
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
mb_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_mb_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
mb_bs_results = merge(mb_bs_results, true_expected_rewards, by='policy')

# plot
invisible(mb_bs_results[, policy := reorder(mb_bs_results$policy, mb_bs_results$delta, FUN = mean)])

ggplot(mb_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("MB results") +
  geom_point(aes(y=expected_reward), color="black")

Doubly Robust (DR) Estimator (similar to Augmented Inverse Propensity Weighting)
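
In the one-step case, DR adds a model-based control variate to the IS estimate (the AIPW form, written here for a deterministic evaluation policy):

\[ \hat{V}_{DR}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} \Big[ w_i \big(r_i - \hat{Q}(s_i, a_i)\big) + \hat{Q}\big(s_i, \pi_e(s_i)\big) \Big], \qquad w_i = \frac{\pi_e(a_i \mid s_i)}{\pi_b(a_i \mid s_i)} \]

The implementation below substitutes the prediction under the evaluation policy's sampled action into both correction terms, which coincides with this expression whenever the evaluation policy is deterministic.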

dr_eval = function(d, p) {
  
  is = is_eval(d, p)[1]
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  d[, weight := (eval_policy_prob/behavior_policy_prob)]
  
  # probability the eval policy assigns to action 1, then draw the action itself
  # (a no-op for the deterministic policies; only matters for the stochastic behavioral policy)
  d[, eval_policy_a := p(d, 1)]
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,action:=0]
  d_a1 = copy(d)
  d_a1[,action:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-based adjustment
  d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
  
  # returns: the DR estimate, the plain IS estimate, and the model-based adjustment
  c(is - mean(d$model_adj), is, mean(d$model_adj))
  
}

for (pn in names(policy_fns)) {
  print(pn)
  print(dr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "never"
## [1] -0.070151398 -0.079295194 -0.009143796
## [1] "behavioral"
## [1] -0.178791 -0.178791  0.000000
## [1] "worst"
## [1] -0.74007238 -0.76242821 -0.02235583
## [1] "always"
## [1] -0.25534064 -0.28111217 -0.02577153
## [1] "optimal"
## [1]  0.41458034  0.40202084 -0.01255949
bootstrap_dr_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = dr_eval(boot_d, null_policy)[1]
  eval_policy_reward = dr_eval(boot_d, p)[1]
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
dr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_dr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
dr_bs_results = merge(dr_bs_results, true_expected_rewards, by='policy')

# plot
invisible(dr_bs_results[, policy := reorder(dr_bs_results$policy, dr_bs_results$delta, FUN = mean)])

ggplot(dr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("DR results") +
  geom_point(aes(y=expected_reward), color="black")

Weighted Doubly Robust (WDR) Estimator

WDR should have slightly lower variance than DR but, like WIS, is biased toward the behavior policy.
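
For reference, the one-step form of WDR replaces the \(1/n\) weights on the importance-sampled residuals with the self-normalized WIS weights \(\tilde{w}_i = w_i / \sum_j w_j\), keeping the same model term:

\[ \hat{V}_{WDR}(\pi_e) = \sum_{i=1}^{n} \tilde{w}_i \big(r_i - \hat{Q}(s_i, a_i)\big) + \frac{1}{n} \sum_{i=1}^{n} \hat{Q}\big(s_i, \pi_e(s_i)\big) \]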

wdr_eval = function(d, p) {
  
  wis = wis_eval(d, p)[1]
  
  d[, behavior_policy_prob := behavior_policy(d, d$action)]
  d[, eval_policy_prob := p(d, d$action)]
  # self-normalized (WIS-style) weights: raw ratio divided by the sum of all ratios
  d[, weight := (eval_policy_prob/behavior_policy_prob) / sum(d$eval_policy_prob/d$behavior_policy_prob)]
  
  # probability the eval policy assigns to action 1, then draw the action itself
  # (a no-op for the deterministic policies; only matters for the stochastic behavioral policy)
  d[, eval_policy_a := p(d, 1)]
  d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
  
  # Add preds
  d_a0 = copy(d)
  d_a0[,action:=0]
  d_a1 = copy(d)
  d_a1[,action:=1]
  d[, pred_a0 := predict(lm_all, d_a0)]
  d[, pred_a1 := predict(lm_all, d_a1)]
  
  # Pick which pred to use based on eval policy
  d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
  
  # Model-based adjustment
  d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
  
  # returns: the WDR estimate, the plain WIS estimate, and the model-based adjustment
  c(wis - mean(d$model_adj), wis, mean(d$model_adj))
  
}

for (pn in names(policy_fns)) {
  print(pn)
  print(wdr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "never"
## [1] -0.14936730 -0.07929519  0.07007210
## [1] "behavioral"
## [1] -0.3499849 -0.1787910  0.1711939
## [1] "worst"
## [1] -1.4795093 -0.7401771  0.7393322
## [1] "always"
## [1] -0.5361717 -0.2811122  0.2550595
## [1] "optimal"
## [1]  0.8286468  0.4144809 -0.4141659
bootstrap_wdr_eval = function(d, i, p) {
  
  boot_d = d[i]
  
  null_policy_reward = wdr_eval(boot_d, null_policy)[1]
  eval_policy_reward = wdr_eval(boot_d, p)[1]
  
  return((eval_policy_reward - null_policy_reward))
}


# Bootstrapped results
wdr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
  data.table(
    policy = pn,
    delta = boot(eval_dt, bootstrap_wdr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
  )
}))
wdr_bs_results = merge(wdr_bs_results, true_expected_rewards, by='policy')

# plot
invisible(wdr_bs_results[, policy := reorder(wdr_bs_results$policy, wdr_bs_results$delta, FUN = mean)])

ggplot(wdr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() + 
  geom_hline(yintercept=0, linetype="dashed") + 
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.05),
    fun.max = function(z) quantile(z,0.95),
    geom = "pointrange") + ggtitle("WDR results") +
  geom_point(aes(y=expected_reward), color="black")

Combine results

combo_bs_results = rbind(
  is_bs_results[,method:="IS"],
  wis_bs_results[,method:="WIS"],
  mb_bs_results[,method:="MB"],
  dr_bs_results[,method:="DR"],
  wdr_bs_results[,method:="WDR"]
)

combo_bs_results$method = factor(combo_bs_results$method, levels=c("MB","IS","WIS","DR","WDR"))

ggplot(combo_bs_results, aes(x=method, y=delta, color=method)) + theme_bw() + 
  facet_wrap(~policy) +
  stat_summary(
    fun = median,
    fun.min = function(z) quantile(z,0.025),
    fun.max = function(z) quantile(z,0.975),
    geom = "pointrange") + ggtitle("All results") +
  geom_hline(aes(yintercept=expected_reward), color="black", linetype="dashed")

combo_bs_results[, .(m=mean(delta), sd=sd(delta)), by=c('policy','method')][order(policy,method)]
policy      method           m         sd
worst       MB      -0.6687786  0.0217680
worst       IS      -0.7115259  0.1846885
worst       WIS     -0.6507946  0.1836858
worst       DR      -0.6547978  0.1863244
worst       WDR     -1.3372767  0.1881958
always      MB      -0.1863755  0.0351102
always      IS      -0.1933815  0.2472037
always      WIS     -0.2028057  0.2674178
always      DR      -0.1867959  0.2503641
always      WDR     -0.3749602  0.2682160
behavioral  MB      -0.0933211  0.0260322
behavioral  IS      -0.1104266  0.1272894
behavioral  WIS     -0.0980019  0.1330066
behavioral  DR      -0.1079832  0.1205329
behavioral  WDR     -0.1824938  0.1339634
never       MB       0.0000000  0.0000000
never       IS       0.0000000  0.0000000
never       WIS      0.0000000  0.0000000
never       DR       0.0000000  0.0000000
never       WDR      0.0000000  0.0000000
optimal     MB       0.4852943  0.0162455
optimal     IS       0.4786080  0.1957667
optimal     WIS      0.5017712  0.1801794
optimal     DR       0.4844982  0.1828144
optimal     WDR      0.9765139  0.1896315