library(data.table)
library(ggplot2)
library(gridExtra)
library(boot)
library(knitr)
library(printr)
R = 500
Implementing five off-policy evaluation estimators from Thomas and Brunskill (ICML 2016): importance sampling (IS), weighted importance sampling (WIS), a model-based (MB) estimator, the doubly robust (DR) estimator, and the weighted doubly robust (WDR) estimator.
N = 1000
states = rbinom(N, 1, 0.5)
actions = rbinom(N, 1, 0.5)
errors = rnorm(N, 0, 4)
rewards = 2*(states==0 & actions==1) - (actions==1) + errors
eval_dt = data.table(
state = states,
action = actions,
reward = rewards
)
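In equation form, the data-generating process above is (note that the third argument of `rnorm()` is a standard deviation, so the noise has standard deviation 4, not variance 4):

$$
s_i \sim \mathrm{Bernoulli}(0.5), \quad
a_i \sim \mathrm{Bernoulli}(0.5), \quad
r_i = 2 \cdot \mathbf{1}\{s_i = 0,\ a_i = 1\} - \mathbf{1}\{a_i = 1\} + \varepsilon_i, \quad
\varepsilon_i \sim \mathcal{N}(0, 4^2)
$$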
behavior_policy = function(x, a) {
ifelse(a==1, mean(eval_dt$action), (1-mean(eval_dt$action)))
}
null_policy = function(x, a) {
ifelse(a==1, 0, 1)
}
action_equal_state_policy = function(x, a) {
ifelse(x$state == a, 1, 0)
}
always_treat_policy = function(x, a) {
ifelse(a==1, 1, 0)
}
optimal_policy = function(x, a) {
ifelse((a==1 & x$state==0) | (a==0 & x$state==1), 1, 0)
}
# list of policy functions to evaluate later
policy_fns = list(
"never" = null_policy,
"behavioral" = behavior_policy,
"worst" = action_equal_state_policy,
"always" = always_treat_policy,
"optimal" = optimal_policy
)
We can calculate the true expected reward of each policy directly from the reward definition.
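These expected rewards can also be approximated by simulating fresh data under each policy (a minimal sketch, not used by the estimators below; `n_sim` is an arbitrary choice, and the noise term is dropped since it has mean zero):

# Monte Carlo check of the expected rewards: simulate states, draw actions from
# each policy, and average the noiseless reward. Should match the hand
# calculations below up to simulation error.
n_sim = 1e6
sim_dt = data.table(state = rbinom(n_sim, 1, 0.5))
sapply(policy_fns, function(p) {
  p_a1 = p(sim_dt, 1) # P(action = 1 | state) under this policy
  sim_actions = rbinom(n_sim, 1, p_a1)
  mean(2*(sim_dt$state==0 & sim_actions==1) - (sim_actions==1))
})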
# rewards = 2*(states==0 & actions==1) - (actions==1) + errors
true_expected_rewards = rbind(
data.table(policy="never", expected_reward = 0),
data.table(policy="behavioral", expected_reward = 0), # act randomly so get expected reward = 0.25*2 + 0.5(-1)
data.table(policy="worst", expected_reward = -0.5), # only do action=1 when state=1, so expected reward = 0.5(-1) = -0.5
data.table(policy="always", expected_reward = 0), # expected reward = 0.5*2 + 1(-1) = 0
data.table(policy="optimal", expected_reward = 0.5) # expected reward = 0.5*2 + 0.5(-1) = 0.5
)
true_expected_rewards
| policy | expected_reward |
|---|---|
| never | 0.0 |
| behavioral | 0.0 |
| worst | -0.5 |
| always | 0.0 |
| optimal | 0.5 |
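The first estimator is ordinary importance sampling (IS): reweight each logged reward by the ratio of the evaluation policy's probability of the logged action to the behavior policy's, which is what `is_eval()` computes below:

$$
\hat V_{\mathrm{IS}}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} w_i\, r_i,
\qquad
w_i = \frac{\pi_e(a_i \mid s_i)}{\pi_b(a_i \mid s_i)}
$$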
is_eval = function(d, p) {
d[, behavior_policy_prob := behavior_policy(d, d$action)]
d[, eval_policy_prob := p(d, d$action)]
c(d[, mean(reward*(eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$action))
}
# test
# for (pn in names(policy_fns)) {
# print(pn)
# print(is_eval(eval_dt, policy_fns[[pn]]))
# }
boostrap_is_eval = function(d, i, p) {
boot_d = d[i]
null_policy_reward = is_eval(boot_d, null_policy)
eval_policy_reward = is_eval(boot_d, p)
return((eval_policy_reward[1] - null_policy_reward[1]))
}
# test
# for (pn in names(policy_fns)) {
# print(pn)
# print(boostrap_is_eval(eval_dt, seq(1,nrow(eval_dt)), policy_fns[[pn]]))
# }
# Bootstrapped results
is_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
data.table(
policy = pn,
delta = boot(eval_dt, boostrap_is_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
)
}))
is_bs_results = merge(is_bs_results, true_expected_rewards, by='policy')
# plot
invisible(is_bs_results[, policy := reorder(is_bs_results$policy, is_bs_results$delta, FUN = mean)])
ggplot(is_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() +
geom_hline(yintercept=0, linetype="dashed") +
stat_summary(
fun = median,
fun.min = function(z) quantile(z,0.05),
fun.max = function(z) quantile(z,0.95),
geom = "pointrange") + ggtitle("IS results") +
geom_point(aes(y=expected_reward), color="black")
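Weighted importance sampling (WIS) self-normalizes the same weights, replacing the simple mean with a weighted mean (implemented below via `weighted.mean()`):

$$
\hat V_{\mathrm{WIS}}(\pi_e) = \frac{\sum_{i=1}^{n} w_i\, r_i}{\sum_{i=1}^{n} w_i}
$$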
wis_eval = function(d, p) {
d[, behavior_policy_prob := behavior_policy(d, d$action)]
d[, eval_policy_prob := p(d, d$action)]
c(d[, weighted.mean(reward, (eval_policy_prob/behavior_policy_prob))], sum(d$eval_policy_prob*d$action))
}
boostrap_wis_eval = function(d, i, p) {
boot_d = d[i]
null_policy_reward = wis_eval(boot_d, null_policy)
eval_policy_reward = wis_eval(boot_d, p)
return((eval_policy_reward[1] - null_policy_reward[1]))
}
# Bootstrapped results
wis_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
data.table(
policy = pn,
delta = boot(eval_dt, boostrap_wis_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
)
}))
wis_bs_results = merge(wis_bs_results, true_expected_rewards, by='policy')
# plot
invisible(wis_bs_results[, policy := reorder(wis_bs_results$policy, wis_bs_results$delta, FUN = mean)])
ggplot(wis_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() +
geom_hline(yintercept=0, linetype="dashed") +
stat_summary(
fun = median,
fun.min = function(z) quantile(z,0.05),
fun.max = function(z) quantile(z,0.95),
geom = "pointrange") + ggtitle("WIS results") +
geom_point(aes(y=expected_reward), color="black")
The model-based estimator (and the DR and WDR estimators later) needs an outcome model.
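The outcome model fit below is a linear regression of reward on action, state, and their interaction; with a binary state and action this is saturated, so it can represent any conditional mean:

$$
\hat r(s, a) = \hat\beta_0 + \hat\beta_1\, a + \hat\beta_2\, s + \hat\beta_3\, a s
$$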
# simple predictors for model
lm_all = lm(reward ~ action*state, eval_dt)
summary(lm_all)
##
## Call:
## lm(formula = reward ~ action * state, data = eval_dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3230 -2.7626 0.0588 2.7262 11.6706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3799 0.2538 -1.497 0.13476
## action 0.9734 0.3671 2.651 0.00814 **
## state 0.6170 0.3636 1.697 0.09004 .
## action:state -2.3079 0.5179 -4.456 9.3e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.092 on 996 degrees of freedom
## Multiple R-squared: 0.02401, Adjusted R-squared: 0.02107
## F-statistic: 8.168 on 3 and 996 DF, p-value: 2.247e-05
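The model-based (MB) estimate is then the average model prediction at each observed state, using the action the evaluation policy would take there:

$$
\hat V_{\mathrm{MB}}(\pi_e) = \frac{1}{n} \sum_{i=1}^{n} \hat r\!\left(s_i, a_i^{\pi_e}\right),
\qquad
a_i^{\pi_e} \sim \pi_e(\cdot \mid s_i)
$$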
model_based_eval = function(d, p) {
d[, eval_policy_a := p(d, 1)]
# sample actions from the policy's action-1 probabilities (only matters for the non-deterministic behavioral policy)
d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
# Add preds
d_a0 = copy(d)
d_a0[,action:=0]
d_a1 = copy(d)
d_a1[,action:=1]
d[, pred_a0 := predict(lm_all, d_a0)]
d[, pred_a1 := predict(lm_all, d_a1)]
# Pick which pred to use based on eval policy
d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
# Model-predicted outcomes
mean(d$pred_based_on_policy)
}
# for (pn in names(policy_fns)) {
# print(pn)
# print(model_based_eval(eval_dt, policy_fns[[pn]]))
# }
boostrap_mb_eval = function(d, i, p) {
boot_d = d[i]
null_policy_reward = model_based_eval(boot_d, null_policy)
eval_policy_reward = model_based_eval(boot_d, p)
return((eval_policy_reward - null_policy_reward))
}
# Bootstrapped results
mb_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
data.table(
policy = pn,
delta = boot(eval_dt, boostrap_mb_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
)
}))
mb_bs_results = merge(mb_bs_results, true_expected_rewards, by='policy')
# plot
invisible(mb_bs_results[, policy := reorder(mb_bs_results$policy, mb_bs_results$delta, FUN = mean)])
ggplot(mb_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() +
geom_hline(yintercept=0, linetype="dashed") +
stat_summary(
fun = median,
fun.min = function(z) quantile(z,0.05),
fun.max = function(z) quantile(z,0.95),
geom = "pointrange") + ggtitle("MB results") +
geom_point(aes(y=expected_reward), color="black")
## Doubly Robust Estimator (similar to Augmented Inverse Propensity Weighting)
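As implemented in `dr_eval()` below, the doubly robust (DR) estimate starts from the IS estimate and subtracts a model-based correction built from the outcome model's prediction under the evaluation policy's action:

$$
\hat V_{\mathrm{DR}} = \hat V_{\mathrm{IS}} - \frac{1}{n} \sum_{i=1}^{n} \left(w_i - 1\right) \hat r\!\left(s_i, a_i^{\pi_e}\right)
$$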
dr_eval = function(d, p) {
is = is_eval(d, p)[1]
d[, behavior_policy_prob := behavior_policy(d, d$action)]
d[, eval_policy_prob := p(d, d$action)]
d[, weight := (eval_policy_prob/behavior_policy_prob)]
d[, eval_policy_a := p(d, 1)]
# sample actions from the policy's action-1 probabilities (only matters for the non-deterministic behavioral policy)
d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
# Add preds
d_a0 = copy(d)
d_a0[,action:=0]
d_a1 = copy(d)
d_a1[,action:=1]
d[, pred_a0 := predict(lm_all, d_a0)]
d[, pred_a1 := predict(lm_all, d_a1)]
# Pick which pred to use based on eval policy
d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
# Model-based adjustment
d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
c(is - mean(d$model_adj), is, mean(d$model_adj))
}
for (pn in names(policy_fns)) {
print(pn)
print(dr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "never"
## [1] -0.070151398 -0.079295194 -0.009143796
## [1] "behavioral"
## [1] -0.178791 -0.178791 0.000000
## [1] "worst"
## [1] -0.74007238 -0.76242821 -0.02235583
## [1] "always"
## [1] -0.25534064 -0.28111217 -0.02577153
## [1] "optimal"
## [1] 0.41458034 0.40202084 -0.01255949
boostrap_dr_eval = function(d, i, p) {
boot_d = d[i]
null_policy_reward = dr_eval(boot_d, null_policy)[1]
eval_policy_reward = dr_eval(boot_d, p)[1]
return((eval_policy_reward - null_policy_reward))
}
# Bootstrapped results
dr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
data.table(
policy = pn,
delta = boot(eval_dt, boostrap_dr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
)
}))
dr_bs_results = merge(dr_bs_results, true_expected_rewards, by='policy')
# plot
invisible(dr_bs_results[, policy := reorder(dr_bs_results$policy, dr_bs_results$delta, FUN = mean)])
ggplot(dr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() +
geom_hline(yintercept=0, linetype="dashed") +
stat_summary(
fun = median,
fun.min = function(z) quantile(z,0.05),
fun.max = function(z) quantile(z,0.95),
geom = "pointrange") + ggtitle("DR results") +
geom_point(aes(y=expected_reward), color="black")
The weighted doubly robust (WDR) estimator should have slightly lower variance than DR, but it is also biased towards the behavior policy's value.
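`wdr_eval()` below mirrors `dr_eval()`, but starts from the WIS estimate and self-normalizes the importance weights used in the correction term:

$$
\hat V_{\mathrm{WDR}} = \hat V_{\mathrm{WIS}} - \frac{1}{n} \sum_{i=1}^{n} \left(\tilde w_i - 1\right) \hat r\!\left(s_i, a_i^{\pi_e}\right),
\qquad
\tilde w_i = \frac{w_i}{\sum_{j=1}^{n} w_j}
$$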
wdr_eval = function(d, p) {
wis = wis_eval(d, p)[1]
d[, behavior_policy_prob := behavior_policy(d, d$action)]
d[, eval_policy_prob := p(d, d$action)]
d[, weight := (eval_policy_prob/behavior_policy_prob) / sum(d$eval_policy_prob/d$behavior_policy_prob)]
d[, eval_policy_a := p(d, 1)]
# sample actions from the policy's action-1 probabilities (only matters for the non-deterministic behavioral policy)
d$eval_policy_a = rbinom(nrow(d), 1, d$eval_policy_a)
# Add preds
d_a0 = copy(d)
d_a0[,action:=0]
d_a1 = copy(d)
d_a1[,action:=1]
d[, pred_a0 := predict(lm_all, d_a0)]
d[, pred_a1 := predict(lm_all, d_a1)]
# Pick which pred to use based on eval policy
d[, pred_based_on_policy := ifelse(eval_policy_a==1, pred_a1, pred_a0)]
# Model-based adjustment
d[, model_adj := weight*pred_based_on_policy - pred_based_on_policy]
c(wis - mean(d$model_adj), wis, mean(d$model_adj))
}
for (pn in names(policy_fns)) {
print(pn)
print(wdr_eval(eval_dt, policy_fns[[pn]]))
}
## [1] "never"
## [1] -0.14936730 -0.07929519 0.07007210
## [1] "behavioral"
## [1] -0.3499849 -0.1787910 0.1711939
## [1] "worst"
## [1] -1.4795093 -0.7401771 0.7393322
## [1] "always"
## [1] -0.5361717 -0.2811122 0.2550595
## [1] "optimal"
## [1] 0.8286468 0.4144809 -0.4141659
boostrap_wdr_eval = function(d, i, p) {
boot_d = d[i]
null_policy_reward = wdr_eval(boot_d, null_policy)[1]
eval_policy_reward = wdr_eval(boot_d, p)[1]
return((eval_policy_reward - null_policy_reward))
}
# Bootstrapped results
wdr_bs_results = rbindlist(lapply(names(policy_fns), function(pn) {
data.table(
policy = pn,
delta = boot(eval_dt, boostrap_wdr_eval, R=R, parallel="multicore", ncpus=10, p = policy_fns[[pn]])$t[,1]
)
}))
wdr_bs_results = merge(wdr_bs_results, true_expected_rewards, by='policy')
# plot
invisible(wdr_bs_results[, policy := reorder(wdr_bs_results$policy, wdr_bs_results$delta, FUN = mean)])
ggplot(wdr_bs_results, aes(x=policy, y=delta, color=policy)) + theme_bw() +
geom_hline(yintercept=0, linetype="dashed") +
stat_summary(
fun = median,
fun.min = function(z) quantile(z,0.05),
fun.max = function(z) quantile(z,0.95),
geom = "pointrange") + ggtitle("WDR results") +
geom_point(aes(y=expected_reward), color="black")
# Combine results
combo_bs_results = rbind(
is_bs_results[,method:="IS"],
wis_bs_results[,method:="WIS"],
mb_bs_results[,method:="MB"],
dr_bs_results[,method:="DR"],
wdr_bs_results[,method:="WDR"]
)
combo_bs_results$method = factor(combo_bs_results$method, levels=c("MB","IS","WIS","DR","WDR"))
ggplot(combo_bs_results, aes(x=method, y=delta, color=method)) + theme_bw() +
facet_wrap(~policy) +
stat_summary(
fun = median,
fun.min = function(z) quantile(z,0.025),
fun.max = function(z) quantile(z,0.975),
geom = "pointrange") + ggtitle("All results") +
geom_hline(aes(yintercept=expected_reward), color="black", linetype="dashed")
combo_bs_results[, .(mean_delta=mean(delta), sd_delta=sd(delta)), by=c('policy','method')][order(policy,method)]
| policy | method | mean_delta | sd_delta |
|---|---|---|---|
| worst | MB | -0.6687786 | 0.0217680 |
| worst | IS | -0.7115259 | 0.1846885 |
| worst | WIS | -0.6507946 | 0.1836858 |
| worst | DR | -0.6547978 | 0.1863244 |
| worst | WDR | -1.3372767 | 0.1881958 |
| always | MB | -0.1863755 | 0.0351102 |
| always | IS | -0.1933815 | 0.2472037 |
| always | WIS | -0.2028057 | 0.2674178 |
| always | DR | -0.1867959 | 0.2503641 |
| always | WDR | -0.3749602 | 0.2682160 |
| behavioral | MB | -0.0933211 | 0.0260322 |
| behavioral | IS | -0.1104266 | 0.1272894 |
| behavioral | WIS | -0.0980019 | 0.1330066 |
| behavioral | DR | -0.1079832 | 0.1205329 |
| behavioral | WDR | -0.1824938 | 0.1339634 |
| never | MB | 0.0000000 | 0.0000000 |
| never | IS | 0.0000000 | 0.0000000 |
| never | WIS | 0.0000000 | 0.0000000 |
| never | DR | 0.0000000 | 0.0000000 |
| never | WDR | 0.0000000 | 0.0000000 |
| optimal | MB | 0.4852943 | 0.0162455 |
| optimal | IS | 0.4786080 | 0.1957667 |
| optimal | WIS | 0.5017712 | 0.1801794 |
| optimal | DR | 0.4844982 | 0.1828144 |
| optimal | WDR | 0.9765139 | 0.1896315 |