library(data.table)
library(ggplot2)
library(boot)
Define the transition matrices, one per action:
P_0 = matrix(c(.5,.5,.5,.5), ncol=2)
P_0
## [,1] [,2]
## [1,] 0.5 0.5
## [2,] 0.5 0.5
P_1 = matrix(c(0,1,.5,.5), ncol=2, byrow=TRUE)
P_1
## [,1] [,2]
## [1,] 0.0 1.0
## [2,] 0.5 0.5
Policies: \(\pi_0\) takes action 1 with probability 0.5 and action 0 otherwise; \(\pi_1\) takes action 1 with probability 0.25. The per-step reward is the current state minus the action taken.
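For concreteness, a hypothetical functional form of the two policies (the simulations below just draw actions inline with rbinom; these function names are illustrative only):
pi_0_action = function(state) rbinom(1, 1, 0.5)   # action 1 with probability 0.5, regardless of state
pi_1_action = function(state) rbinom(1, 1, 0.25)  # action 1 with probability 0.25, regardless of state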
Simulate 1e6 time steps for each policy, starting in state 0
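As a quick sanity check on the targets below: under \(\pi_0\) the state evolves according to the mixture kernel \(0.5\,P_0 + 0.5\,P_1\), whose stationary distribution is \((0.4,\ 0.6)\); under \(\pi_1\) it follows \(0.75\,P_0 + 0.25\,P_1\), with stationary distribution \((4/9,\ 5/9)\). Since the per-step reward is the state minus the action,
\[
\mathbb{E}_{\pi_0}[r] = 0.6 - 0.5 = 0.1, \qquad
\mathbb{E}_{\pi_1}[r] = 5/9 - 0.25 \approx 0.306 \approx 0.3 .
\]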
Policy 0 (expected total reward should be about 0.1 * N_TIME_STEPS)
N_TIME_STEPS = 1000000
reward_0 = 0
state = 0
for(i in seq(1,N_TIME_STEPS)) {
a = rbinom(1,1,0.5)
reward_0 = reward_0 + state - a
if(a == 0) P = P_0 else P = P_1
state = sample.int(2, 1, prob = P[state+1,])-1
}
reward_0/N_TIME_STEPS
## [1] 0.09999
Policy 1 (expected total reward should be about 0.3 * N_TIME_STEPS)
N_TIME_STEPS = 1000000
reward_1 = 0
state = 0
for(i in seq(1,N_TIME_STEPS)) {
a = rbinom(1,1,0.25)
reward_1 = reward_1 + state - a
if(a == 0) P = P_0 else P = P_1
state = sample.int(2, 1, prob = P[state+1,])-1
}
reward_1/N_TIME_STEPS
## [1] 0.305275
Now, let’s simulate a bunch of trajectories of \(\pi_0\) and try to use them to estimate the value of \(\pi_1\).
Use the expected average rewards going forward as the \(\hat{V}\) predictions, with a simple action adjustment for the \(\hat{Q}\) predictions.
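Concretely, the value and action-value predictions attached to each trajectory below are, with horizon \(T\) (n_time_steps) and step \(t\),
\[
\hat{V}^{\pi_0}_t = 0.1\,(T - t), \qquad \hat{V}^{\pi_1}_t = 0.3\,(T - t),
\]
\[
\hat{Q}^{\pi_0}_t(a) = -0.2\,a + 0.1\,(T - t), \qquad \hat{Q}^{\pi_1}_t(a) = -0.2\,a + 0.3\,(T - t).
\]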
sim_single_trajectory = function(x, n_time_steps) {
states = array(dim=n_time_steps+1)
states[1] = 0
actions = array(dim=n_time_steps)
rewards = array(dim=n_time_steps)
cum_rewards = array(dim=n_time_steps)
for(i in seq(1,n_time_steps)) {
actions[i] = rbinom(1,1,0.5)
rewards[i] = states[i] - actions[i] #+ rnorm(1, sd=0.1)
cum_rewards[i] = if(i>1) rewards[i] + cum_rewards[i-1] else rewards[i]
if(actions[i] == 0) P = P_0 else P = P_1
states[i+1] = sample.int(2, 1, prob = P[states[i]+1,])-1
}
return(data.table(
id = x,
t = seq(1,n_time_steps),
state = states[1:n_time_steps],
action = actions,
prob_action_pi_0 = 0.5,
prob_action_pi_1 = ifelse(actions==1, 0.25, 0.75),
reward = rewards,
cum_rewards = cum_rewards,
v_hat_pi_0 = 0.1 * (n_time_steps - seq(1,n_time_steps)),
v_hat_pi_1 = 0.3 * (n_time_steps - seq(1,n_time_steps)),
q_hat_pi_0 = -0.2*actions + 0.1 * (n_time_steps - seq(1,n_time_steps)),
q_hat_pi_1 = -0.2*actions + 0.3 * (n_time_steps - seq(1,n_time_steps))
))
}
N_TRAJECTORIES = 1000
N_TIME_STEPS = 100
trajectories_pi_0 = rbindlist(lapply(seq(1,N_TRAJECTORIES), function(x) sim_single_trajectory(x, N_TIME_STEPS)))
str(trajectories_pi_0)
## Classes 'data.table' and 'data.frame': 100000 obs. of 12 variables:
## $ id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ t : int 1 2 3 4 5 6 7 8 9 10 ...
## $ state : num 0 1 0 1 0 0 1 1 1 1 ...
## $ action : int 0 0 1 0 0 1 0 1 1 1 ...
## $ prob_action_pi_0: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ prob_action_pi_1: num 0.75 0.75 0.25 0.75 0.75 0.25 0.75 0.25 0.25 0.25 ...
## $ reward : num 0 1 -1 1 0 -1 1 0 0 0 ...
## $ cum_rewards : num 0 1 0 1 1 0 1 1 1 1 ...
## $ v_hat_pi_0 : num 9.9 9.8 9.7 9.6 9.5 9.4 9.3 9.2 9.1 9 ...
## $ v_hat_pi_1 : num 29.7 29.4 29.1 28.8 28.5 28.2 27.9 27.6 27.3 27 ...
## $ q_hat_pi_0 : num 9.9 9.8 9.5 9.6 9.5 9.2 9.3 9 8.9 8.8 ...
## $ q_hat_pi_1 : num 29.7 29.4 28.9 28.8 28.5 28 27.9 27.4 27.1 26.8 ...
## - attr(*, ".internal.selfref")=<externalptr>
Try the Weighted Importance Sampling (WIS) estimator.
There is no discount factor, so weight each reward at time \(t\) by the cumulative product of the policy probability ratios up to \(t\), normalized across trajectories at each \(t\).
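In symbols: with \(n\) trajectories and cumulative probability ratio \(\rho_{i,t} = \prod_{s \le t} \pi_1(a_{i,s} \mid s_{i,s}) / \pi_0(a_{i,s} \mid s_{i,s})\) for trajectory \(i\), the per-decision weights and the estimate computed below are
\[
w_{i,t} = \frac{\rho_{i,t}}{\sum_{j=1}^{n} \rho_{j,t}}, \qquad
\hat{v}_{\mathrm{WIS}} = \sum_{t=1}^{T} \sum_{i=1}^{n} w_{i,t}\, r_{i,t}.
\]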
setorder(trajectories_pi_0, id, t)
trajectories_pi_0[, `:=`(
cum_p_ratio = cumprod(prob_action_pi_1/prob_action_pi_0)),
by ='id']
trajectories_pi_0[, weight := cum_p_ratio / sum(cum_p_ratio), by='t']
Unweighted mean reward per trajectory, which just estimates the value of \(\pi_0\) (expect about 10).
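The resampling scheme here, reused for every estimate below, bootstraps whole trajectories: data is just the table of trajectory IDs, and with stype = "f" the statistic receives the resample frequency of each ID rather than row indices. A minimal toy sketch of that mechanism (made-up per-trajectory totals, names hypothetical; uses data.table and boot loaded above):
toy = data.table(id = 1:4, total_reward = c(2, -1, 3, 0))
# with stype = "f", the second argument to `statistic` is the number of times
# each row of `data` appears in the bootstrap resample
boot(data = toy, stype = "f", R = 100,
     statistic = function(dt, freq) sum(dt$total_reward * freq) / nrow(dt))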
# bootstrap trajectory IDs
boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward)), by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)/uniqueN(id)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot(data = unique(trajectories_pi_0[, .(id)]),
## stype = "f", statistic = function(dt, bs_f) {
## trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward)),
## by = "id"]
## trajectories_by_id[, `:=`(boot_f, bs_f)]
## trajectories_by_id[, sum(id_sum * boot_f)/uniqueN(id)]
## }, parallel = "multicore", ncpus = 8, R = 500), type = "perc")
##
## Intervals :
## Level Percentile
## 95% ( 9.005, 9.689 )
## Calculations and Intervals on Original Scale
Weighted mean reward, i.e. the WIS estimate of the value of \(\pi_1\) (expect about 30).
boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward*weight)), by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot(data = unique(trajectories_pi_0[, .(id)]),
## stype = "f", statistic = function(dt, bs_f) {
## trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward *
## weight)), by = "id"]
## trajectories_by_id[, `:=`(boot_f, bs_f)]
## trajectories_by_id[, sum(id_sum * boot_f)]
## }, parallel = "multicore", ncpus = 8, R = 500), type = "perc")
##
## Intervals :
## Level Percentile
## 95% ( 9.47, 44.17 )
## Calculations and Intervals on Original Scale
Next, try the weighted doubly robust (WDR) estimator.
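As implemented below, WDR reuses the per-decision weights \(w_{i,t}\) from above, with the model predictions \(\hat{Q}^{\pi_1}\) and \(\hat{V}^{\pi_1}\) acting as a control variate:
\[
\hat{v}_{\mathrm{WDR}} = \sum_{t} \sum_{i=1}^{n}
\left[ w_{i,t}\, r_{i,t} - \tfrac{1}{n}\left( w_{i,t}\, \hat{Q}^{\pi_1}_t(a_{i,t}) - w_{i,t-1}\, \hat{V}^{\pi_1}_t \right) \right],
\]
where the rows at \(t = 1\), for which the lagged weight is NA, are dropped by na.rm=TRUE, so the sum effectively runs over \(t \ge 2\).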
setorder(trajectories_pi_0, id, t)
trajectories_pi_0[, lagged_weight := shift(weight, fill=NA), by='id']
trajectories_pi_0[, `:=`(
wdr_is_part = reward*weight,
wdr_reg_part_1 = weight*q_hat_pi_1/uniqueN(id),
wdr_reg_part_2 = lagged_weight*v_hat_pi_1/uniqueN(id)
)]
boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id =
trajectories_pi_0[,
.(id_sum = sum(wdr_is_part - wdr_reg_part_1 + wdr_reg_part_2, na.rm=TRUE)),
by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot(data = unique(trajectories_pi_0[, .(id)]),
## stype = "f", statistic = function(dt, bs_f) {
## trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(wdr_is_part -
## wdr_reg_part_1 + wdr_reg_part_2, na.rm = TRUE)),
## by = "id"]
## trajectories_by_id[, `:=`(boot_f, bs_f)]
## trajectories_by_id[, sum(id_sum * boot_f)]
## }, parallel = "multicore", ncpus = 8, R = 500), type = "perc")
##
## Intervals :
## Level Percentile
## 95% ( 9.81, 42.72 )
## Calculations and Intervals on Original Scale
Get bootstrap percentile intervals for the WIS and WDR estimates (scaled to per-time-step values) across different numbers of trajectories and trajectory lengths.
get_sim_mean_sd = function(n_trajectories, n_time_steps) {
set.seed(1814)
trajectories_pi_0 = rbindlist(lapply(seq(1,n_trajectories), function(x) sim_single_trajectory(x, n_time_steps)))
setorder(trajectories_pi_0, id, t)
trajectories_pi_0[, cum_p_ratio := cumprod(prob_action_pi_1/prob_action_pi_0), by ='id']
trajectories_pi_0[, weight := cum_p_ratio / sum(cum_p_ratio), by='t']
trajectories_pi_0[, lagged_weight := shift(weight, fill=NA), by='id']
trajectories_pi_0[, `:=`(
wdr_is_part = reward*weight,
wdr_reg_part_1 = weight*q_hat_pi_1/uniqueN(id),
wdr_reg_part_2 = lagged_weight*v_hat_pi_1/uniqueN(id)
)]
set.seed(1814)
# is_b = boot.ci(boot(
# data = trajectories_pi_0,
# strata = trajectories_pi_0[, id],
# statistic = function(dt, ind) dt[ind, sum(reward*weight)],
# R = 500), type='perc')
is_b = boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id =
trajectories_pi_0[, .(id_sum = sum(reward*weight)), by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
set.seed(1814)
# dr_b = boot.ci(boot(
# data = trajectories_pi_0,
# strata = trajectories_pi_0[, id],
# statistic = function(dt, ind) dt[ind, sum(wdr_is_part - wdr_reg_part_1 + wdr_reg_part_2, na.rm=TRUE)],
# R = 500), type='perc')
dr_b = boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id =
trajectories_pi_0[,
.(id_sum = sum(wdr_is_part - wdr_reg_part_1 + wdr_reg_part_2, na.rm=TRUE)),
by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
return(data.table(
n_trajectories = n_trajectories,
n_time_steps = n_time_steps,
estimator = c('WIS', 'WDR'),
bs_lb = c(is_b$percent[1,4], dr_b$percent[1,4])/n_time_steps,
bs_ub = c(is_b$percent[1,5], dr_b$percent[1,5])/n_time_steps
))
}
# test
# get_sim_mean_sd(100, 10)
# Run over grid of n_trajectories and n_time_steps
grid = expand.grid(c(10,100,1000), c(10,100,1000))
full_res = rbindlist(lapply(seq(1,nrow(grid)), function(x) get_sim_mean_sd(grid[x,1], grid[x,2])))
ggplot(full_res, aes(x=estimator, color=estimator, ymin=bs_lb, ymax=bs_ub)) +
geom_errorbar() +
facet_grid(rows=vars(n_trajectories), cols=vars(n_time_steps), labeller = label_both) +
geom_hline(yintercept = 0.3, linetype='dashed')
Our CGM dataset has ~100 trajectories of ~300 time steps each (closest to the middle panel of the grid).