library(data.table)
library(ggplot2)
Define the transition matrices:
P_0 = matrix(c(.5,.5,.5,.5), ncol=2)
P_0
## [,1] [,2]
## [1,] 0.5 0.5
## [2,] 0.5 0.5
P_1 = matrix(c(0,1,.5,.5), ncol=2, byrow=TRUE)
P_1
## [,1] [,2]
## [1,] 0.0 1.0
## [2,] 0.5 0.5
Policies: \(\pi_0\) takes action 1 with probability 0.5, and \(\pi_1\) takes action 1 with probability 0.25.
Simulate 1e6 time steps for each policy, starting in state 0.
Policy 0 (Expected reward should be 0.1 * N_TIME_STEPS)
N_TIME_STEPS = 1000000
reward_0 = 0
state = 0
for(i in seq(1,N_TIME_STEPS)) {
a = rbinom(1,1,0.5)                             # policy 0: take action 1 with probability 0.5
reward_0 = reward_0 + state - a                 # per-step reward is state minus action
if(a == 0) P = P_0 else P = P_1                 # the action determines which transition matrix applies
state = sample.int(2, 1, prob = P[state+1,])-1  # sample the next state (states are 0/1)
}
reward_0/N_TIME_STEPS
## [1] 0.099646
Policy 1 (Expected reward should be 0.3 * N_TIME_STEPS)
N_TIME_STEPS = 1000000
reward_1 = 0
state = 0
for(i in seq(1,N_TIME_STEPS)) {
a = rbinom(1,1,0.25)
reward_1 = reward_1 + state - a
if(a == 0) P = P_0 else P = P_1
state = sample.int(2, 1, prob = P[state+1,])-1
}
reward_1/N_TIME_STEPS
## [1] 0.305655
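As a rough cross-check on those two per-step figures, the same numbers can be worked out analytically: mixing P_0 and P_1 by a policy's probability of taking action 1 gives the policy-induced Markov chain, and the expected per-step reward is then the stationary probability of being in state 1 minus the probability of taking action 1. A minimal sketch, assuming the reward really is just state minus action as in the loops above:
# transition matrix of the chain induced by a policy with P(a = 1) = p_a1
induced_P = function(p_a1) (1 - p_a1) * P_0 + p_a1 * P_1
# stationary distribution: left eigenvector of the transition matrix for eigenvalue 1
stationary = function(P) {
  e = eigen(t(P))
  v = Re(e$vectors[, which.min(abs(e$values - 1))])
  v / sum(v)
}
# expected per-step reward = P(state = 1) - P(action = 1)
per_step_reward = function(p_a1) stationary(induced_P(p_a1))[2] - p_a1
per_step_reward(0.5)   # policy 0: should be close to 0.1
per_step_reward(0.25)  # policy 1: should be close to 0.3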
Now, let’s simulate a bunch of trajectories of \(\pi_0\) and try to use them to estimate the value of \(\pi_1\).
Use the expected cumulative rewards going forward as the Q predictions.
sim_single_trajectory = function(x) {
states = array(dim=N_TIME_STEPS+1)
states[1] = 0
actions = array(dim=N_TIME_STEPS)
rewards = array(dim=N_TIME_STEPS)
cum_rewards = array(dim=N_TIME_STEPS)
for(i in seq(1,N_TIME_STEPS)) {
actions[i] = rbinom(1,1,0.5)
rewards[i] = states[i] - actions[i] #+ rnorm(1, sd=0.1)
cum_rewards[i] = if(i>1) rewards[i] + cum_rewards[i-1] else rewards[i]
if(actions[i] == 0) P = P_0 else P = P_1
states[i+1] = sample.int(2, 1, prob = P[states[i]+1,])-1
}
return(data.table(
id = x,
t = seq(1,N_TIME_STEPS),
state = states[1:N_TIME_STEPS],
action = actions,
prob_action_pi_0 = 0.5,                            # probability pi_0 assigns to the action taken (always 0.5)
prob_action_pi_1 = ifelse(actions==1, 0.25, 0.75), # probability pi_1 assigns to the action taken (P(a = 1) = 0.25)
reward = rewards,
cum_rewards = cum_rewards,
q_hat_pi_0 = 0.1 * (N_TIME_STEPS - seq(1,N_TIME_STEPS)), # expected remaining reward under pi_0
q_hat_pi_1 = 0.3 * (N_TIME_STEPS - seq(1,N_TIME_STEPS))  # expected remaining reward under pi_1
))
}
N_TRAJECTORIES = 1000
N_TIME_STEPS = 100
trajectories_pi_0 = rbindlist(lapply(seq(1,N_TRAJECTORIES), sim_single_trajectory))
str(trajectories_pi_0)
## Classes 'data.table' and 'data.frame': 100000 obs. of 10 variables:
## $ id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ t : int 1 2 3 4 5 6 7 8 9 10 ...
## $ state : num 0 1 1 1 0 0 0 1 1 0 ...
## $ action : int 1 1 0 1 0 0 0 0 0 1 ...
## $ prob_action_pi_0: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ prob_action_pi_1: num 0.25 0.25 0.75 0.25 0.75 0.75 0.75 0.75 0.75 0.25 ...
## $ reward : num -1 0 1 0 0 0 0 1 1 -1 ...
## $ cum_rewards : num -1 -1 0 0 0 0 0 1 2 1 ...
## $ q_hat_pi_0 : num 9.9 9.8 9.7 9.6 9.5 9.4 9.3 9.2 9.1 9 ...
## $ q_hat_pi_1 : num 29.7 29.4 29.1 28.8 28.5 28.2 27.9 27.6 27.3 27 ...
## - attr(*, ".internal.selfref")=<externalptr>
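Before doing anything doubly robust, a quick sanity check that these trajectories are consistent with the per-step reward above; with N_TIME_STEPS = 100 the mean total reward should come out near 10 (likely a bit under, since every trajectory starts in state 0 rather than in the stationary distribution):
# mean total reward across the trajectories; roughly 0.1 * N_TIME_STEPS expected
trajectories_pi_0[t == N_TIME_STEPS, mean(cum_rewards)]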
First, trying doubly robust value estimation of \(\pi_0\) (the same policy as the simulated trajectories) to make sure I can get a sensible result.
Trying to implement the recursive function from the paper (below) as v().
\[
V_{DR}^{(t)} = \hat{V}(s_t) + \rho_t \left( r_t + V_{DR}^{(t+1)} - \hat{Q}(s_t, a_t) \right),
\qquad
\rho_t = \frac{\pi_1(a_t \mid s_t)}{\pi_0(a_t \mid s_t)},
\qquad
V_{DR}^{(T+1)} = 0
\]
I am assuming that Q_hat(s_t, a_t) is the predicted cumulative reward from the current time until the end of the time horizon, given the currently observed state and action. I’ve created a column with this value for each trajectory at each time step; here it equals 0.1 * (T - t), since I am estimating the value of the same policy that generated the data and its expected reward is 0.1 per time step.
I am assuming that V_hat(s_t) is the expected cumulative reward from the current time to the end of the horizon, given the state and the policy. For \(\pi_0\), this is equal to Q_hat(s_t, a_t).
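In the paper's notation, V_hat is (as I read it) Q_hat averaged over the action probabilities of the policy being evaluated; it only collapses to Q_hat(s_t, a_t) here because my Q_hat does not depend on which action was taken:
\[
\hat{V}(s_t) = \sum_{a} \pi(a \mid s_t)\, \hat{Q}(s_t, a)
\]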
I might be doing something wrong here, but not sure what :)
trajectories_pi_0[,rho_t := prob_action_pi_1/prob_action_pi_0] # per-step importance weight
setorder(trajectories_pi_0, id, t)
# try single trajectory to make sense of things
i = 1
states = trajectories_pi_0[id==i, state]
rho_ts = trajectories_pi_0[id==i, rho_t]
rewards = trajectories_pi_0[id==i, reward]
q_hat_pi_0s = trajectories_pi_0[id==i, q_hat_pi_0]
v_hat = trajectories_pi_0[id==i, q_hat_pi_0]
# recursive doubly robust value function; rho_t is fixed at 1 here since the evaluated policy is the behaviour policy
v = function(t) {
if(t == N_TIME_STEPS) {
v_hat[t] + 1 * (rewards[t] - q_hat_pi_0s[t])
} else {
v_hat[t] + 1 * (rewards[t] + v(t+1) - q_hat_pi_0s[t])
}
}
v(1)
## [1] 19
I think v(1) should be the doubly robust estimate according to the paper, but it’s hard to understand exactly what calculation they define.
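One reassuring detail: because \(\rho_t\) is fixed at 1 and v_hat is the same vector as q_hat_pi_0s, the correction terms cancel at every step, so v(1) should simply reproduce this trajectory's total reward. A quick check of that (just reading the last cumulative reward for trajectory 1):
# total reward of trajectory 1; should match v(1) above
trajectories_pi_0[id == 1 & t == N_TIME_STEPS, cum_rewards]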
On the full dataset of trajectories:
fill_in_v_dr = function(i) {
states = trajectories_pi_0[id==i, state]
rho_ts = rep(1, trajectories_pi_0[id==i, .N]) # all weights are 1: evaluating pi_0 with pi_0 data
rewards = trajectories_pi_0[id==i, reward]
q_hat_pi_0s = trajectories_pi_0[id==i, q_hat_pi_0]
v_hat = trajectories_pi_0[id==i, q_hat_pi_0]
# recursive v function
v = function(t) {
if(t == N_TIME_STEPS) {
v_hat[t] + rho_ts[t] * (rewards[t] - q_hat_pi_0s[t])
} else {
v_hat[t] + rho_ts[t] * (rewards[t] + v(t+1) - q_hat_pi_0s[t])
}
}
res = v(1)
return(res)
}
fill_in_v_dr(1)
## [1] 19
test_dr = sapply(seq(1,N_TRAJECTORIES), fill_in_v_dr)
summary(test_dr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.000 6.000 9.000 9.607 13.250 36.000
ggplot() + geom_histogram(aes(x=test_dr))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Estimates should be centered around 10.
Now try estimating the value of \(\pi_1\) from the \(\pi_0\) trajectories.
fill_in_v_dr = function(i) {
states = trajectories_pi_0[id==i, state]
rho_ts = trajectories_pi_0[id==i, rho_t]
rewards = trajectories_pi_0[id==i, reward]
v_hat = trajectories_pi_0[id==i, q_hat_pi_1] # shortcut: the same values serve as both V_hat and Q_hat
# recursive v function
v = function(t) {
if(t == N_TIME_STEPS) {
v_hat[t] + rho_ts[t] * (rewards[t] - v_hat[t])
} else {
v_hat[t] + rho_ts[t] * (rewards[t] + v(t+1) - v_hat[t])
}
}
res = v(1)
return(res)
}
test_dr = sapply(seq(1,N_TRAJECTORIES), fill_in_v_dr)
ggplot() + geom_histogram(aes(x=test_dr)) + xlim(10,50)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 30 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Estimates should be centered around 30.
So things seem to work, but I think I took some shortcuts with the V_hat and Q_hat values.
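Finally, the recursion is easy to unroll into a backward loop over t, which would avoid deep recursion for longer horizons; a sketch of the same \(\pi_1\) estimator (with the same shortcut of using q_hat_pi_1 for both Q_hat and V_hat):
fill_in_v_dr_iter = function(i) {
  rho_ts = trajectories_pi_0[id==i, rho_t]
  rewards = trajectories_pi_0[id==i, reward]
  v_hat = trajectories_pi_0[id==i, q_hat_pi_1]
  # unroll the recursion backwards; v_next plays the role of v(t+1), starting from 0 past the horizon
  v_next = 0
  for(t in seq(N_TIME_STEPS, 1)) {
    v_next = v_hat[t] + rho_ts[t] * (rewards[t] + v_next - v_hat[t])
  }
  return(v_next)
}
# should reproduce the recursive estimates exactly
all.equal(sapply(seq(1,N_TRAJECTORIES), fill_in_v_dr_iter), test_dr)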