library(data.table)
library(ggplot2)
library(boot)
Define the transition matrices, one per action:
P_0 = matrix(c(.5,.5,.5,.5), ncol=2)
P_0
## [,1] [,2]
## [1,] 0.5 0.5
## [2,] 0.5 0.5
P_1 = matrix(c(0,1,.5,.5), ncol=2, byrow=TRUE)
P_1
## [,1] [,2]
## [1,] 0.0 1.0
## [2,] 0.5 0.5
Policies: \(\pi_0\) takes action 1 with probability 0.5 and action 0 otherwise; \(\pi_1\) takes action 1 with probability 0.25. The per-step reward is the current state minus the action taken.
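For concreteness, a hypothetical functional form of the two policies (the simulations below just draw actions inline with rbinom; these function names are illustrative only):
pi_0_action = function(state) rbinom(1, 1, 0.5)   # action 1 with probability 0.5, regardless of state
pi_1_action = function(state) rbinom(1, 1, 0.25)  # action 1 with probability 0.25, regardless of state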
Simulate 1e6 time steps for each policy, starting in state 0
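As a quick sanity check on the targets below: under \(\pi_0\) the state evolves according to the mixture kernel \(0.5\,P_0 + 0.5\,P_1\), whose stationary distribution is \((0.4,\ 0.6)\); under \(\pi_1\) it follows \(0.75\,P_0 + 0.25\,P_1\), with stationary distribution \((4/9,\ 5/9)\). Since the per-step reward is the state minus the action,
\[
\mathbb{E}_{\pi_0}[r] = 0.6 - 0.5 = 0.1, \qquad
\mathbb{E}_{\pi_1}[r] = 5/9 - 0.25 \approx 0.306 \approx 0.3 .
\]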
Policy 0 (expected total reward should be about 0.1 * N_TIME_STEPS)
N_TIME_STEPS = 1000000
reward_0 = 0
state = 0
for(i in seq(1,N_TIME_STEPS)) {
a = rbinom(1,1,0.5)
reward_0 = reward_0 + state - a
if(a == 0) P = P_0 else P = P_1
state = sample.int(2, 1, prob = P[state+1,])-1
}
reward_0/N_TIME_STEPS
## [1] 0.09999
Policy 1 (expected total reward should be about 0.3 * N_TIME_STEPS)
N_TIME_STEPS = 1000000
reward_1 = 0
state = 0
for(i in seq(1,N_TIME_STEPS)) {
a = rbinom(1,1,0.25)
reward_1 = reward_1 + state - a
if(a == 0) P = P_0 else P = P_1
state = sample.int(2, 1, prob = P[state+1,])-1
}
reward_1/N_TIME_STEPS
## [1] 0.305275
Now, let’s simulate a bunch of trajectories of \(\pi_0\) and try to use them to estimate the value of \(\pi_1\).
Use the expected average rewards going forward as the \(\hat{V}\) predictions, with a simple action adjustment for the \(\hat{Q}\) predictions.
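Concretely, the value and action-value predictions attached to each trajectory below are, with horizon \(T\) (n_time_steps) and step \(t\),
\[
\hat{V}^{\pi_0}_t = 0.1\,(T - t), \qquad \hat{V}^{\pi_1}_t = 0.3\,(T - t),
\]
\[
\hat{Q}^{\pi_0}_t(a) = -0.2\,a + 0.1\,(T - t), \qquad \hat{Q}^{\pi_1}_t(a) = -0.2\,a + 0.3\,(T - t).
\]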
sim_single_trajectory = function(x, n_time_steps) {
states = array(dim=n_time_steps+1)
states[1] = 0
actions = array(dim=n_time_steps)
rewards = array(dim=n_time_steps)
cum_rewards = array(dim=n_time_steps)
for(i in seq(1,n_time_steps)) {
actions[i] = rbinom(1,1,0.5)
rewards[i] = states[i] - actions[i] #+ rnorm(1, sd=0.1)
cum_rewards[i] = if(i>1) rewards[i] + cum_rewards[i-1] else rewards[i]
if(actions[i] == 0) P = P_0 else P = P_1
states[i+1] = sample.int(2, 1, prob = P[states[i]+1,])-1
}
return(data.table(
id = x,
t = seq(1,n_time_steps),
state = states[1:n_time_steps],
action = actions,
prob_action_pi_0 = 0.5,
prob_action_pi_1 = ifelse(actions==1, 0.25, 0.75),
reward = rewards,
cum_rewards = cum_rewards,
v_hat_pi_0 = 0.1 * (n_time_steps - seq(1,n_time_steps)),
v_hat_pi_1 = 0.3 * (n_time_steps - seq(1,n_time_steps)),
q_hat_pi_0 = -0.2*actions + 0.1 * (n_time_steps - seq(1,n_time_steps)),
q_hat_pi_1 = -0.2*actions + 0.3 * (n_time_steps - seq(1,n_time_steps))
))
}
N_TRAJECTORIES = 1000
N_TIME_STEPS = 100
trajectories_pi_0 = rbindlist(lapply(seq(1,N_TRAJECTORIES), function(x) sim_single_trajectory(x, N_TIME_STEPS)))
str(trajectories_pi_0)
## Classes 'data.table' and 'data.frame': 100000 obs. of 12 variables:
## $ id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ t : int 1 2 3 4 5 6 7 8 9 10 ...
## $ state : num 0 1 0 1 0 0 1 1 1 1 ...
## $ action : int 0 0 1 0 0 1 0 1 1 1 ...
## $ prob_action_pi_0: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ prob_action_pi_1: num 0.75 0.75 0.25 0.75 0.75 0.25 0.75 0.25 0.25 0.25 ...
## $ reward : num 0 1 -1 1 0 -1 1 0 0 0 ...
## $ cum_rewards : num 0 1 0 1 1 0 1 1 1 1 ...
## $ v_hat_pi_0 : num 9.9 9.8 9.7 9.6 9.5 9.4 9.3 9.2 9.1 9 ...
## $ v_hat_pi_1 : num 29.7 29.4 29.1 28.8 28.5 28.2 27.9 27.6 27.3 27 ...
## $ q_hat_pi_0 : num 9.9 9.8 9.5 9.6 9.5 9.2 9.3 9 8.9 8.8 ...
## $ q_hat_pi_1 : num 29.7 29.4 28.9 28.8 28.5 28 27.9 27.4 27.1 26.8 ...
## - attr(*, ".internal.selfref")=<externalptr>
Try the Weighted Importance Sampling (WIS) estimator.
There is no discount factor, so weight each reward at time \(t\) by the cumulative product of the policy probability ratios up to \(t\), normalized across trajectories at each \(t\).
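In symbols: with \(n\) trajectories and cumulative probability ratio \(\rho_{i,t} = \prod_{s \le t} \pi_1(a_{i,s} \mid s_{i,s}) / \pi_0(a_{i,s} \mid s_{i,s})\) for trajectory \(i\), the per-decision weights and the estimate computed below are
\[
w_{i,t} = \frac{\rho_{i,t}}{\sum_{j=1}^{n} \rho_{j,t}}, \qquad
\hat{v}_{\mathrm{WIS}} = \sum_{t=1}^{T} \sum_{i=1}^{n} w_{i,t}\, r_{i,t}.
\]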
setorder(trajectories_pi_0, id, t)
trajectories_pi_0[, `:=`(
cum_p_ratio = cumprod(prob_action_pi_1/prob_action_pi_0)),
by ='id']
trajectories_pi_0[, weight := cum_p_ratio / sum(cum_p_ratio), by='t']
Unweighted mean reward per trajectory, which just estimates the value of \(\pi_0\) (expect about 10).
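The resampling scheme here, reused for every estimate below, bootstraps whole trajectories: data is just the table of trajectory IDs, and with stype = "f" the statistic receives the resample frequency of each ID rather than row indices. A minimal toy sketch of that mechanism (made-up per-trajectory totals, names hypothetical; uses data.table and boot loaded above):
toy = data.table(id = 1:4, total_reward = c(2, -1, 3, 0))
# with stype = "f", the second argument to `statistic` is the number of times
# each row of `data` appears in the bootstrap resample
boot(data = toy, stype = "f", R = 100,
     statistic = function(dt, freq) sum(dt$total_reward * freq) / nrow(dt))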
# bootstrap trajectory IDs
boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward)), by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)/uniqueN(id)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot(data = unique(trajectories_pi_0[, .(id)]),
## stype = "f", statistic = function(dt, bs_f) {
## trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward)),
## by = "id"]
## trajectories_by_id[, `:=`(boot_f, bs_f)]
## trajectories_by_id[, sum(id_sum * boot_f)/uniqueN(id)]
## }, parallel = "multicore", ncpus = 8, R = 500), type = "perc")
##
## Intervals :
## Level Percentile
## 95% ( 9.005, 9.689 )
## Calculations and Intervals on Original Scale
Weighted mean reward, i.e. the WIS estimate of the value of \(\pi_1\) (expect about 30).
boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward*weight)), by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot(data = unique(trajectories_pi_0[, .(id)]),
## stype = "f", statistic = function(dt, bs_f) {
## trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(reward *
## weight)), by = "id"]
## trajectories_by_id[, `:=`(boot_f, bs_f)]
## trajectories_by_id[, sum(id_sum * boot_f)]
## }, parallel = "multicore", ncpus = 8, R = 500), type = "perc")
##
## Intervals :
## Level Percentile
## 95% ( 9.47, 44.17 )
## Calculations and Intervals on Original Scale
Next, try the weighted doubly robust (WDR) estimator.
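As implemented below, WDR reuses the per-decision weights \(w_{i,t}\) from above, with the model predictions \(\hat{Q}^{\pi_1}\) and \(\hat{V}^{\pi_1}\) acting as a control variate:
\[
\hat{v}_{\mathrm{WDR}} = \sum_{t} \sum_{i=1}^{n}
\left[ w_{i,t}\, r_{i,t} - \tfrac{1}{n}\left( w_{i,t}\, \hat{Q}^{\pi_1}_t(a_{i,t}) - w_{i,t-1}\, \hat{V}^{\pi_1}_t \right) \right],
\]
where the rows at \(t = 1\), for which the lagged weight is NA, are dropped by na.rm=TRUE, so the sum effectively runs over \(t \ge 2\).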
setorder(trajectories_pi_0, id, t)
trajectories_pi_0[, lagged_weight := shift(weight, fill=NA), by='id']
trajectories_pi_0[, `:=`(
wdr_is_part = reward*weight,
wdr_reg_part_1 = weight*q_hat_pi_1/uniqueN(id),
wdr_reg_part_2 = lagged_weight*v_hat_pi_1/uniqueN(id)
)]
boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id =
trajectories_pi_0[,
.(id_sum = sum(wdr_is_part - wdr_reg_part_1 + wdr_reg_part_2, na.rm=TRUE)),
by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot(data = unique(trajectories_pi_0[, .(id)]),
## stype = "f", statistic = function(dt, bs_f) {
## trajectories_by_id = trajectories_pi_0[, .(id_sum = sum(wdr_is_part -
## wdr_reg_part_1 + wdr_reg_part_2, na.rm = TRUE)),
## by = "id"]
## trajectories_by_id[, `:=`(boot_f, bs_f)]
## trajectories_by_id[, sum(id_sum * boot_f)]
## }, parallel = "multicore", ncpus = 8, R = 500), type = "perc")
##
## Intervals :
## Level Percentile
## 95% ( 9.81, 42.72 )
## Calculations and Intervals on Original Scale
Get bootstrap percentile intervals for the WIS and WDR estimates (scaled to per-time-step values) across different numbers of trajectories and trajectory lengths.
get_sim_mean_sd = function(n_trajectories, n_time_steps) {
set.seed(1814)
trajectories_pi_0 = rbindlist(lapply(seq(1,n_trajectories), function(x) sim_single_trajectory(x, n_time_steps)))
setorder(trajectories_pi_0, id, t)
trajectories_pi_0[, cum_p_ratio := cumprod(prob_action_pi_1/prob_action_pi_0), by ='id']
trajectories_pi_0[, weight := cum_p_ratio / sum(cum_p_ratio), by='t']
trajectories_pi_0[, lagged_weight := shift(weight, fill=NA), by='id']
trajectories_pi_0[, `:=`(
wdr_is_part = reward*weight,
wdr_reg_part_1 = weight*q_hat_pi_1/uniqueN(id),
wdr_reg_part_2 = lagged_weight*v_hat_pi_1/uniqueN(id)
)]
set.seed(1814)
# is_b = boot.ci(boot(
# data = trajectories_pi_0,
# strata = trajectories_pi_0[, id],
# statistic = function(dt, ind) dt[ind, sum(reward*weight)],
# R = 500), type='perc')
is_b = boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id =
trajectories_pi_0[, .(id_sum = sum(reward*weight)), by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
set.seed(1814)
# dr_b = boot.ci(boot(
# data = trajectories_pi_0,
# strata = trajectories_pi_0[, id],
# statistic = function(dt, ind) dt[ind, sum(wdr_is_part - wdr_reg_part_1 + wdr_reg_part_2, na.rm=TRUE)],
# R = 500), type='perc')
dr_b = boot.ci(boot(
data = unique(trajectories_pi_0[, .(id)]),
stype = "f",
statistic = function(dt, bs_f) {
trajectories_by_id =
trajectories_pi_0[,
.(id_sum = sum(wdr_is_part - wdr_reg_part_1 + wdr_reg_part_2, na.rm=TRUE)),
by='id']
trajectories_by_id[, boot_f := bs_f]
trajectories_by_id[, sum(id_sum*boot_f)]
},
parallel = "multicore",
ncpus = 8,
R = 500),
type='perc')
return(data.table(
n_trajectories = n_trajectories,
n_time_steps = n_time_steps,
estimator = c('WIS', 'WDR'),
bs_lb = c(is_b$percent[1,4], dr_b$percent[1,4])/n_time_steps,
bs_ub = c(is_b$percent[1,5], dr_b$percent[1,5])/n_time_steps
))
}
# test
# get_sim_mean_sd(100, 10)
# Run over grid of n_trajectories and n_time_steps
grid = expand.grid(c(10,100,1000), c(10,100,1000))
full_res = rbindlist(lapply(seq(1,nrow(grid)), function(x) get_sim_mean_sd(grid[x,1], grid[x,2])))
ggplot(full_res, aes(x=estimator, color=estimator, ymin=bs_lb, ymax=bs_ub)) +
geom_errorbar() +
facet_grid(rows=vars(n_trajectories), cols=vars(n_time_steps), labeller = label_both) +
geom_hline(yintercept = 0.3, linetype='dashed')
Our CGM dataset has ~100 trajectories of ~300 time steps each (closest to the middle panel of the grid).