# todo add induced-wald-z test: it also goes with the distribution
#' UR-Uniform Random Sampling Method - Comparison of Allocation Probability Test vs. Traditional Wald Z test
#'
#' Runs UR algorithm in two-armed case, for n batches to calculate an AP Test statistic for each arm
#' @param pa - actual prior probability of controlled group arm a (typically unknown in practice)
#' @param pb - actual prior probability of experimental arm b (typically unknown in practice)
#' @param n - total number of trials in one simulation
#' @param batch - number of trials before posterior probability is updated, but in UR case it is sort of irrelevant, by default this is 2, i.e. update every 2 batches
#' @param burnin - number of trials that run only Uniform Random (UR) sampling, before applying the UR algorithm for all remaining trials, and updating posterior probability every batch (using beta distribution), by default burin = 1, i.e. no burnin period
#' @return vector containing WaldScore[n], reward[n], AP, and APT: the final Wald-Z-score, reward, allocation probability throughout the trial, and AP Test statistics for both arms,
UR <- function(pa, pb, n, burnin = 1, batch = 2) {
# List: tracking successes and failures under each arm
# ex. [0,1,1,1,2,3,3,4,5,6], final index value indicates total number of successes/failures
arm_a_successes <- c(0)
arm_a_failures <- c(0)
arm_b_successes <- c(0)
arm_b_failures <- c(0)
# List: tracking only the current number of successes and failures (Value changes for each iteration)
arm_a_s <- 0
arm_a_f <- 0
arm_b_s <- 0
arm_b_f <- 0
# History that tracks if there was success in either arm for each trial (Variable not used in final results)
# ex. [1,0,1,1,0,1] 1 indicates success for either arm, 0 indicates failure for both arms
abl <- c()
APT = 0 # AP Test statistic over the time, here we should be expecting this to be around 50% of the sample size n.
# ---------- BURNIN PERIOD: use Uniform Random (UR) sampling only ----------
# note: would be the same here burnin and the actual policy as the policy we are simulating is UR, just for style keeping :) (sorry OCD)
for (i in 1:burnin) {
draw_a <- runif(1) # Random probability of arm a being chosen
draw_b <- runif(1) # Random probability of arm b being chosen
# CASE 1: Arm a is chosen
if (draw_a > draw_b) {
# Arm a was successful
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
}
# Arm a was not successful
else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
}
# CASE 2: Arm b is chosen
else {
# Arm b was successful
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 # update arm b success count
}
# Arm b was not successful
else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 # update arm b failure count
}
}
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
# ---------- UR Period: use Uniform Random (UR) sampling only ----------
for (i in (burnin+1):n) {
m = i %% batch
draw_a <- runif(1) # Random probability of arm a being chosen
draw_b <- runif(1) # Random probability of arm b being chosen
# CASE 1: Arm a is chosen
if (draw_a > draw_b) {
# Arm a was successful
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
}
# Arm a was not successful
else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
}
# CASE 2: Arm b is chosen
else {
if(m == 0){
APT = APT + 1
}
# Arm b was successful
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 # update arm b success count
}
# Arm b was not successful
else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 # update arm b failure count
}
}
# Tracking the build-up of total number of successes, failures, for arm a & b
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
na <- arm_a_successes + arm_a_failures
nb <- arm_b_successes + arm_b_failures
AP <- nb/(na+nb) # Tracking of the changes of allocation probability over the time, it should stabilize at a 50% when sample size gets large enough
# List: complete history of estimated posterior probability updates, for arm a & b success (final index, final posterior probability for each)
pa_est <- arm_a_successes / na
pb_est <- arm_b_successes / nb
# Wald Score for confidence interval: for analyzing categorical data underlying chi-squared distribution vectors, instead of summation
p_hat <- (arm_a_successes + arm_b_successes)/(na + nb)
WaldScore <- (pb_est - pa_est) / sqrt(p_hat*(1-p_hat)*(1/na + 1/nb))
# List: our expected reward, from both arms (number of times chosen * posterior success probability, for each arm)
expected_reward <- (na[n] * pa + nb[n] * pb) / n
# Lists out the average expected reward for each individual trial (final index, final expected reward value)
actual_reward <- (arm_a_successes[n] + arm_b_successes[n])/n
#return(list(WaldScore = WaldScore[n], reward = reward[n], AP = AP[n], APT = APT))
#return(c(WaldScore[n], expected_reward, actual_reward, AP[n], APT))
return(list(arm_successes = c(arm_a_successes[n], arm_b_successes[n]),
arm_failures = c(arm_a_failures[n], arm_b_failures[n]),
APT = APT))
}AP test application
Section 1 Policy Introductions
Section 1.1 Running Traditional UR
Section 1.1.1: Two-arm Uniform Random Sampling
Section 1.1.2: multi-arm Uniform Random
#' UR Sampling Method - Comparison of Allocation Probability Test vs. Traditional Wald Z test
#'
#' Runs UR algorithm in two-armed case, for n batches to calculate an AP Test statistic for each arm
#' @param priors this is a list c() containing the success rate of each arm c(pa, pb, pc,...)
#' @param n - total number of trials in one simulation
#' @param batch - number of trials before posterior probability is updated, but in UR case it is sort of irrelevant, by default this is 2, i.e. update every 2 batches
#' @param burnin - number of trials that run only Uniform Random (UR) sampling, before applying the UR algorithm for all remaining trials, and updating posterior probability every batch (using beta distribution), by default burin = 1, i.e. no burnin period
#'
#' @return vector containing WaldScore[n], reward[n], AP, and APT: the final Wald-Z-score, reward, allocation probability throughout the trial, and AP Test statistics for both arms,
# or per say the CARA-FLGI method
UR_multiarm <- function(priors, n, burnin = 1, batch = 2) {
# for an n arm setting, let's calculate the number of arms first
num_arms <- length(priors)
# List: tracking successes and failures under each arm
# ex. [0,1,1,1,2,3,3,4,5,6], final index value indicates total number of successes/failures
arm_successes <- array(0, dim=c(num_arms, n))
arm_failures <- array(0, dim=c(num_arms, n))
# List: tracking only the current number of successes and failures (Value changes for each iteration)
arm_s <- rep(0, num_arms)
arm_f <- rep(0, num_arms)
# still keeping track of the allocation probability test statistcs under a multi-arm case
APT = 0
# keep the same abl list for record keeping, to see how many successes and failures along the way
abl <- c()
# ---------- BURNIN PERIOD: use Uniform Random (UR) sampling only ----------
for (i in 1:burnin) {
draw_arms <- runif(num_arms) # randomly drawing one arm out of all arms
arm_chosen <- which.max(draw_arms) # settle down which arm to pick
if(runif(1) < priors[arm_chosen]){
abl[i] <- 1
arm_s[arm_chosen] <- arm_s[arm_chosen] + 1 # update arm a success count
}
else{
abl[i] <- 0
arm_f[arm_chosen] <- arm_f[arm_chosen] + 1 # update arm a failure count
}
# Tracking the build-up of total number of successes, failures, for all the arms:
arm_successes[ , i] <- arm_s
arm_failures[ , i] <- arm_f
}
# ---------- AFTER BURNIN PERIOD: Use TS or UR depending on difference between posterior probabilities ----------
for (i in (burnin + 1):n) {
# Posterior probability updates happen by batch size (whenever m becomes 0, larger batch size -> slower updates)
m = i %% batch
# Draw from Beta(alpha, beta) distribution, using only the current success & failure counts
# alpha = current successes, beta = current failures of each arm
# CASE 1: Posterior probability of arms a & b is updated
if (m == 0) {
draw_arms <- runif(num_arms)
# purely uniform sampling
# assuming the last arm in the prior list is always the experimental arm and we are to determine if it is a superior treatment compare to all other arms
if(all(draw_arms[num_arms] > draw_arms[1:num_arms-1])){
APT = APT + 1
}
}
# CASE 2: Posterior probability of arms a & b is not updated
else {
draw_arms <- runif(num_arms)
# purely uniform random sampling
}
# settle the chosen arm:
arm_chosen <- which.max(draw_arms) # Settle down which arm to pick based on the highest sampled posterior room
# Update arms history (abl), successes, and failures
if (runif(1) < priors[arm_chosen]) {
abl[i] <- 1
arm_s[arm_chosen] <- arm_s[arm_chosen] + 1 # Update success count for chosen arm
} else {
abl[i] <- 0
arm_f[arm_chosen] <- arm_f[arm_chosen] + 1 # Update failure count for chosen arm
}
# Tracking the build-up of total number of successes, failures, for all arms, adding onto successes and failures from burnin period
arm_successes[, i] <- arm_s
arm_failures[, i] <- arm_f
}
# List: complete history of number of times arms a & b were chosen (final index, total number of times chosen for each)
n_total <- arm_successes + arm_failures
AP <- n_total[num_arms, ]/colSums(n_total)
# List: complete history of estimated posterior probability updates, for arm a & b success (final index, final posterior probability for each)
p_est <- arm_successes / n_total
# Wald Score for confidence interval: for analyzing categorical data underlying chi-squared distribution vectors, instead of summation
# we are calculating the Z test for whether or not the last arm is significantly better than the average of the other arms
# Get the proportions of successes for the last arm and the other arms
p_last_arm <- p_est[num_arms, n]
# Get the sample sizes for the last arm and the other arms
n_last_arm <- n_total[num_arms, n]
n_other_arms <- n_total[-num_arms, n]
# Calculate the weighted average success rate for the other arms
p_other_arms <- sum(n_other_arms * p_est[-num_arms, n]) / sum(n_other_arms)
# Calculate the pooled proportion of successes
p_pooled <- (n_last_arm * p_last_arm + sum(n_other_arms) * p_other_arms) / (n_last_arm + sum(n_other_arms))
# Calculate the Z-score
WaldScore <- (p_last_arm - p_other_arms) / sqrt(p_pooled * (1 - p_pooled) * (1/n_last_arm + 1/sum(n_other_arms)))
# List: our expected reward, from both arms (number of times chosen * posterior success probability, for each arm)
expected_reward <- sum(n_total[, n] * priors) / n
# Lists out the average expected reward for each individual trial (final index, final expected reward value)
actual_reward <- sum(arm_successes[, n]) / n
#return(list(WaldScore = WaldScore[n], reward = reward[n], AP = AP[n], APT = APT))
# return(c(WaldScore, expected_reward, actual_reward, AP[n], APT))
return(list(arm_successes = arm_successes[,n],
arm_failures = arm_failures[,n],
APT = APT))
}Section 1.2 Running Traditional TS
Section 1.2.1: 2-arm Traditional TS
#' TS-Thompson Sampling Method - Comparison of Allocation Probability Test vs. Traditional Wald Z test
#'
#' Runs TS algorithm in two-armed case, for n batches to calculate an AP Test statistic for each arm
#' @param pa - actual prior probability of controlled group arm a (typically unknown in practice)
#' @param pb - actual prior probability of experimental arm b (typically unknown in practice)
#' @param n - total number of trials in one simulation
#' @param batch - number of trials before posterior probability is updated, but in UR case it is sort of irrelevant, by default this is 2, i.e. update every 2 batches
#' @param burnin - number of trials that run only Uniform Random (UR) sampling, before applying the UR algorithm for all remaining trials, and updating posterior probability every batch (using beta distribution), by default burin = 1, i.e. no burnin period
#'
#' @return vector containing WaldScore[n], reward[n], AP, and APT: the final Wald-Z-score, reward, allocation probability throughout the trial, and AP Test statistics for both arms,
# or per say the CARA-FLGI method
TS <- function(pa, pb, n, burnin = 1, batch = 2) {
# List: tracking successes and failures under each arm
# ex. [0,1,1,1,2,3,3,4,5,6], final index value indicates total number of successes/failures
arm_a_successes <- c(0)
arm_a_failures <- c(0)
arm_b_successes <- c(0)
arm_b_failures <- c(0)
# List: tracking only the current number of successes and failures (Value changes for each iteration)
arm_a_s <- 0
arm_a_f <- 0
arm_b_s <- 0
arm_b_f <- 0
# History that tracks if there was success in either arm for each trial (Variable not used in final results)
# ex. [1,0,1,1,0,1] 1 indicates success for either arm, 0 indicates failure for both arms
abl <- c()
APT = 0
# ---------- BURNIN PERIOD: use Uniform Random (UR) sampling only ----------
for (i in 1:burnin) {
draw_a <- runif(1) # Random probability of arm a being chosen
draw_b <- runif(1) # Random probability of arm b being chosen
# CASE 1: Arm a is chosen
if (draw_a > draw_b) {
# Arm a was successful
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
}
# Arm a was not successful
else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
}
# CASE 2: Arm b is chosen
else {
# Arm b was successful
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 # update arm b success count
}
# Arm b was not successful
else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 # update arm b failure count
}
}
# Tracking the build-up of total number of successes, failures, for arm a & b
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
# ---------- AFTER BURNIN PERIOD: Use TS or UR depending on difference between posterior probabilities ----------
for (i in (burnin + 1):n) {
# Posterior probability updates happen by batch size (whenever m becomes 0, larger batch size -> slower updates)
m = i %% batch
# Draw from Beta(alpha, beta) distribution, using only the current success & failure counts
# alpha = current successes, beta = current failures of each arm
# CASE 1: Posterior probability of arms a & b is updated
if (m == 0) {
draw_a <-
rbeta(1, arm_a_successes[i - 1] + 1, arm_a_failures[i - 1] + 1)
draw_b <-
rbeta(1, arm_b_successes[i - 1] + 1, arm_b_failures[i - 1] + 1)
if(draw_b > draw_a){
APT = APT + 1
}
}
# CASE 2: Posterior probability of arms a & b is not updated
else {
draw_a <-
rbeta(1, arm_a_successes[i - m] + 1, arm_a_failures[i - m] + 1)
draw_b <-
rbeta(1, arm_b_successes[i - m] + 1, arm_b_failures[i - m] + 1)
# Using index [i - m] shows we are using the posterior probabilities from the previous batch, and not our most current success/failure counts
}
# Update arm a & b history (abl), successes, and failures
if (draw_a > draw_b) {
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
} else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
} else{
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 #update arm b success count
} else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 #update arm b failure count
}
}
# Tracking the build-up of total number of successes, failures, for arm a & b, adding onto successes and failures from burnin period
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
# List: complete history of number of times arms a & b were chosen (final index, total number of times chosen for each)
na <- arm_a_successes + arm_a_failures
nb <- arm_b_successes + arm_b_failures
AP <- nb/(na+nb)
# List: complete history of estimated posterior probability updates, for arm a & b success (final index, final posterior probability for each)
pa_est <- arm_a_successes / na
pb_est <- arm_b_successes / nb
# Wald Score for confidence interval: for analyzing categorical data underlying chi-squared distribution vectors, instead of summation
# WaldScore <- (pb_est - pa_est) / sqrt(pa_est * (1 - pa_est) / na + pb_est * (1 - pb_est) / nb)
p_hat <- (arm_a_successes + arm_b_successes)/(na + nb)
WaldScore <- (pb_est - pa_est) / sqrt(p_hat*(1-p_hat)*(1/na + 1/nb))
# List: our expected reward, from both arms (number of times chosen * posterior success probability, for each arm)
expected_reward <- (na[n] * pa + nb[n] * pb) / n
# Lists out the average expected reward for each individual trial (final index, final expected reward value)
actual_reward <- (arm_a_successes[n] + arm_b_successes[n])/n
#return(list(WaldScore = WaldScore[n], reward = reward[n], AP = AP[n], APT = APT))
#return(c(WaldScore[n], expected_reward, actual_reward, AP[n], APT))
return(list(arm_successes = c(arm_a_successes[n], arm_b_successes[n]),
arm_failures = c(arm_a_failures[n], arm_b_failures[n]),
APT = APT))
}Section 1.2.2: multi-arm Traditional TS
#' TS-Thompson Sampling Method - Comparison of Allocation Probability Test vs. Traditional Wald Z test
#'
#' Runs TS algorithm in two-armed case, for n batches to calculate an AP Test statistic for each arm
#' @param priors this is a list c() containing the success rate of each arm c(pa, pb, pc,...)
#' @param n - total number of trials in one simulation
#' @param batch - number of trials before posterior probability is updated, but in UR case it is sort of irrelevant, by default this is 2, i.e. update every 2 batches
#' @param burnin - number of trials that run only Uniform Random (UR) sampling, before applying the UR algorithm for all remaining trials, and updating posterior probability every batch (using beta distribution), by default burin = 1, i.e. no burnin period
#'
#' @return vector containing WaldScore[n], reward[n], AP, and APT: the final Wald-Z-score, reward, allocation probability throughout the trial, and AP Test statistics for both arms,
# or per say the CARA-FLGI method
TS_multiarm <- function(priors, n, burnin = 1, batch = 2) {
# for an n arm setting, let's calculate the number of arms first
num_arms <- length(priors)
# List: tracking successes and failures under each arm
# ex. [0,1,1,1,2,3,3,4,5,6], final index value indicates total number of successes/failures
arm_successes <- array(0, dim=c(num_arms, n))
arm_failures <- array(0, dim=c(num_arms, n))
# List: tracking only the current number of successes and failures (Value changes for each iteration)
arm_s <- rep(0, num_arms)
arm_f <- rep(0, num_arms)
# still keeping track of the allocation probability test statistcs under a multi-arm case
APT = 0
# keep the same abl list for record keeping, to see how many successes and failures along the way
abl <- c()
# ---------- BURNIN PERIOD: use Uniform Random (UR) sampling only ----------
for (i in 1:burnin) {
draw_arms <- runif(num_arms) # randomly drawing one arm out of all arms
arm_chosen <- which.max(draw_arms) # settle down which arm to pick
if(runif(1) < priors[arm_chosen]){
abl[i] <- 1
arm_s[arm_chosen] <- arm_s[arm_chosen] + 1 # update arm a success count
}
else{
abl[i] <- 0
arm_f[arm_chosen] <- arm_f[arm_chosen] + 1 # update arm a failure count
}
# Tracking the build-up of total number of successes, failures, for all the arms:
arm_successes[ , i] <- arm_s
arm_failures[ , i] <- arm_f
}
# ---------- AFTER BURNIN PERIOD: Use TS or UR depending on difference between posterior probabilities ----------
for (i in (burnin + 1):n) {
# Posterior probability updates happen by batch size (whenever m becomes 0, larger batch size -> slower updates)
m = i %% batch
# Draw from Beta(alpha, beta) distribution, using only the current success & failure counts
# alpha = current successes, beta = current failures of each arm
# CASE 1: Posterior probability of arms a & b is updated
if (m == 0) {
draw_arms <- rbeta(num_arms, arm_successes[, i - 1] + 1, arm_failures[, i - 1] + 1)
# assuming the last arm in the prior list is always the experimental arm and we are to determine if it is a superior treatment compare to all other arms
if(all(draw_arms[num_arms] > draw_arms[1:num_arms-1])){
APT = APT + 1
}
}
# CASE 2: Posterior probability of arms a & b is not updated
else {
draw_arms <- rbeta(num_arms, arm_successes[, i - m] + 1, arm_failures[, i - m] + 1)
# Using index [i - m] shows we are using the posterior probabilities from the previous batch, and not our most current success/failure counts
}
# settle the chosen arm:
arm_chosen <- which.max(draw_arms) # Settle down which arm to pick based on the highest sampled posterior room
# Update arms history (abl), successes, and failures
if (runif(1) < priors[arm_chosen]) {
abl[i] <- 1
arm_s[arm_chosen] <- arm_s[arm_chosen] + 1 # Update success count for chosen arm
} else {
abl[i] <- 0
arm_f[arm_chosen] <- arm_f[arm_chosen] + 1 # Update failure count for chosen arm
}
# Tracking the build-up of total number of successes, failures, for all arms, adding onto successes and failures from burnin period
arm_successes[, i] <- arm_s
arm_failures[, i] <- arm_f
}
# List: complete history of number of times arms a & b were chosen (final index, total number of times chosen for each)
n_total <- arm_successes + arm_failures
AP <- n_total[num_arms, ]/colSums(n_total)
# List: complete history of estimated posterior probability updates, for arm a & b success (final index, final posterior probability for each)
p_est <- arm_successes / n_total
# Wald Score for confidence interval: for analyzing categorical data underlying chi-squared distribution vectors, instead of summation
# we are calculating the Z test for whether or not the last arm is significantly better than the average of the other arms
# Get the proportions of successes for the last arm and the other arms
p_last_arm <- p_est[num_arms, n]
# Get the sample sizes for the last arm and the other arms
n_last_arm <- n_total[num_arms, n]
n_other_arms <- n_total[-num_arms, n]
# Calculate the weighted average success rate for the other arms
p_other_arms <- sum(n_other_arms * p_est[-num_arms, n]) / sum(n_other_arms)
# Calculate the pooled proportion of successes
p_pooled <- (n_last_arm * p_last_arm + sum(n_other_arms) * p_other_arms) / (n_last_arm + sum(n_other_arms))
# Calculate the Z-score
WaldScore <- (p_last_arm - p_other_arms) / sqrt(p_pooled * (1 - p_pooled) * (1/n_last_arm + 1/sum(n_other_arms)))
# List: our expected reward, from both arms (number of times chosen * posterior success probability, for each arm)
expected_reward <- sum(n_total[, n] * priors) / n
# Lists out the average expected reward for each individual trial (final index, final expected reward value)
actual_reward <- sum(arm_successes[, n]) / n
#return(list(WaldScore = WaldScore[n], reward = reward[n], AP = AP[n], APT = APT))
#return(c(WaldScore, expected_reward, actual_reward, AP[n], APT))
return(list(arm_successes = arm_successes[,n],
arm_failures = arm_failures[,n],
APT = APT))
}Section 1.3 Running Epsilon TS
#' Epsilon-TS-Thompson Sampling Method - Comparison of Allocation Probability Test vs. Traditional Wald Z test
#'
#' Runs epsilon TS algorithm in two-armed case, for n batches to calculate an AP Test statistic for each arm
#' @param pa - actual prior probability of controlled group arm a (typically unknown in practice)
#' @param pb - actual prior probability of experimental arm b (typically unknown in practice)
#' @param n - total number of trials in one simulation
#' @param batch - number of trials before posterior probability is updated, but in UR case it is sort of irrelevant, by default this is 2, i.e. update every 2 batches
#' @param burnin - number of trials that run only Uniform Random (UR) sampling, before applying the UR algorithm for all remaining trials, and updating posterior probability every batch (using beta distribution), by default burin = 1, i.e. no burnin period
#' @param epsilon - this is the epsilon (or proportion) we defined previous to simulations at a certain level for UR sampling within the simulations, by default let's make it 0.1 i.e. 10% uniform random exploration
#' @return vector containing WaldScore[n], reward[n], AP, and APT: the final Wald-Z-score, reward, allocation probability throughout the trial, and AP Test statistics for both arms,
# or per say the CARA-FLGI method
TS_epsilon <- function(pa, pb, n, burnin = 1, batch = 2, epsilon = 0.1) {
# List: tracking successes and failures under each arm
# ex. [0,1,1,1,2,3,3,4,5,6], final index value indicates total number of successes/failures
arm_a_successes <- c(0)
arm_a_failures <- c(0)
arm_b_successes <- c(0)
arm_b_failures <- c(0)
# List: tracking only the current number of successes and failures (Value changes for each iteration)
arm_a_s <- 0
arm_a_f <- 0
arm_b_s <- 0
arm_b_f <- 0
# History that tracks if there was success in either arm for each trial (Variable not used in final results)
# ex. [1,0,1,1,0,1] 1 indicates success for either arm, 0 indicates failure for both arms
abl <- c()
APT = 0 # AP Test statistic
# ---------- BURNIN PERIOD: use Uniform Random (UR) sampling only ----------
for (i in 1:burnin) {
draw_a <- runif(1) # Random probability of arm a being chosen
draw_b <- runif(1) # Random probability of arm b being chosen
# CASE 1: Arm a is chosen
if (draw_a > draw_b) {
# Arm a was successful
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
}
# Arm a was not successful
else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
}
# CASE 2: Arm b is chosen
else {
# Arm b was successful
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 # update arm b success count
}
# Arm b was not successful
else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 # update arm b failure count
}
}
# Tracking the build-up of total number of successes, failures, for arm a & b
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
# ---------- AFTER BURNIN PERIOD: Use TS or UR depending on difference between posterior probabilities ----------
for (i in (burnin + 1):n) {
# Posterior probability updates happen by batch size (whenever m becomes 0, larger batch size -> slower updates)
m = i %% batch
# Draw from Beta(alpha, beta) distribution, using only the current success & failure counts
# alpha = current successes, beta = current failures of each arm
# Use UR: If difference between posterior probabilities of arms is less than threshold value (expected difference in reward between arms)
if (runif(1) < epsilon) {
draw_a <- runif(1) # Random probability of arm a being chosen
draw_b <- runif(1) # Random probability of arm b being chosen
}
# Use TS: If difference between posterior probabilities of arms crosses threshold value (expected difference in reward between arms)
else {
# CASE 1: Posterior probability of arms a & b is updated
if (m == 0) {
draw_a <-
rbeta(1, arm_a_successes[i - 1] + 1, arm_a_failures[i - 1] + 1)
draw_b <-
rbeta(1, arm_b_successes[i - 1] + 1, arm_b_failures[i - 1] + 1)
if (draw_b > draw_a){
APT = APT +1
}
}
# CASE 2: Posterior probability of arms a & b is not updated
else {
draw_a <-
rbeta(1, arm_a_successes[i - m] + 1, arm_a_failures[i - m] + 1)
draw_b <-
rbeta(1, arm_b_successes[i - m] + 1, arm_b_failures[i - m] + 1)
}
}
# Update arm a & b history (abl), successes, and failures
if (draw_a > draw_b) {
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
} else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
} else{
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 #update arm b success count
} else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 #update arm b failure count
}
}
# Tracking the build-up of total number of successes, failures, for arm a & b, adding onto successes and failures from burnin period
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
# List: complete history of number of times arms a & b were chosen (final index, total number of times chosen for each)
na <- arm_a_successes + arm_a_failures
nb <- arm_b_successes + arm_b_failures
AP <- nb/(na+nb)
# List: complete history of estimated posterior probability updates, for arm a & b success (final index, final posterior probability for each)
pa_est <- arm_a_successes / na
pb_est <- arm_b_successes / nb
# Wald Score for confidence interval: for analyzing categorical data underlying chi-squared distribution vectors, instead of summation
#WaldScore <- (pb_est - pa_est) / sqrt(pa_est * (1 - pa_est) / na + pb_est * (1 - pb_est) / nb)
p_hat <- (arm_a_successes + arm_b_successes)/(na + nb)
WaldScore <- (pb_est - pa_est) / sqrt(p_hat*(1-p_hat)*(1/na + 1/nb))
# List: our expected reward, from both arms (number of times chosen * posterior success probability, for each arm)
expected_reward <- (na[n] * pa + nb[n] * pb) / n
# Lists out the average expected reward for each individual trial (final index, final expected reward value)
actual_reward <- (arm_a_successes[n] + arm_b_successes[n])/n
#return(list(WaldScore = WaldScore[n], reward = reward[n], AP = AP[n], APT = APT))
#return(c(WaldScore[n], expected_reward, actual_reward, AP[n], APT))
return(list(arm_successes = c(arm_a_successes[n], arm_b_successes[n]),
arm_failures = c(arm_a_failures[n], arm_b_failures[n]),
APT = APT))
}Section 1.4 Running TS-PostDiff (Code adapted from Tong)
#' TS-PostDiff - Allocation Probability Test
#'
#' Runs TS-PostDiff algorithm in two-armed case, for n batches, with given batch size and BURNIN period, to calculate an AP Test statistic for each arm
#' @param pa - actual prior probability of arm a (typically unknown in practice)
#' @param pb - actual prior probability of arm b (typically unknown in practice)
#' @param n - total number of trials in one simulation
#' @param c - threshold of difference in expected reward between the two arms, by default it is 0.1
#' @param batch - number of trials before posterior probability is updated, but in UR case it is sort of irrelevant, by default this is 2, i.e. update every 2 batches
#' @param burnin - number of trials that run only Uniform Random (UR) sampling, before applying the UR algorithm for all remaining trials, and updating posterior probability every batch (using beta distribution), by default burin = 1, i.e. no burnin period
#' @return vector containing WaldScore[n], reward[n], APT_1, APT_2: the final waldscore, reward, and AP Test statistics for both arms
TSPDD <- function(pa, pb, n, burnin = 1, batch = 2, c = 0.1) {
# List: tracking successes and failures under each arm
# ex. [0,1,1,1,2,3,3,4,5,6], final index value indicates total number of successes/failures
arm_a_successes <- c(0)
arm_a_failures <- c(0)
arm_b_successes <- c(0)
arm_b_failures <- c(0)
# List: tracking only the current number of successes and failures (Value changes for each iteration)
arm_a_s <- 0
arm_a_f <- 0
arm_b_s <- 0
arm_b_f <- 0
# History that tracks if there was success in either arm for each trial (Variable not used in final results)
# ex. [1,0,1,1,0,1] 1 indicates success for either arm, 0 indicates failure for both arms
abl <- c()
APT = 0 # AP Test statistic
# ---------- BURNIN PERIOD: use Uniform Random (UR) sampling only ----------
for (i in 1:burnin) {
draw_a <- runif(1) # Random probability of arm a being chosen
draw_b <- runif(1) # Random probability of arm b being chosen
# CASE 1: Arm a is chosen
if (draw_a > draw_b) {
# Arm a was successful
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
}
# Arm a was not successful
else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
}
# CASE 2: Arm b is chosen
else {
# Arm b was successful
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 # update arm b success count
}
# Arm b was not successful
else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 # update arm b failure count
}
}
# Tracking the build-up of total number of successes, failures, for arm a & b
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
# ---------- AFTER BURNIN PERIOD: Use TS or UR depending on difference between posterior probabilities ----------
for (i in (burnin + 1):n) {
# Posterior probability updates happen by batch size (whenever m becomes 0, larger batch size -> slower updates)
m = i %% batch
# Draw from Beta(alpha, beta) distribution, using only the current success & failure counts
# alpha = current successes, beta = current failures of each arm
# Use UR: If difference between posterior probabilities of arms is less than threshold value (expected difference in reward between arms)
if (abs(draw_a - draw_b) < c) {
draw_a <- runif(1) # Random probability of arm a being chosen
draw_b <- runif(1) # Random probability of arm b being chosen
}
# Use TS: If difference between posterior probabilities of arms crosses threshold value (expected difference in reward between arms)
else {
# CASE 1: Posterior probability of arms a & b is updated
if (m == 0) {
draw_a <-
rbeta(1, arm_a_successes[i - 1] + 1, arm_a_failures[i - 1] + 1)
draw_b <-
rbeta(1, arm_b_successes[i - 1] + 1, arm_b_failures[i - 1] + 1)
if (draw_b > draw_a){
APT = APT + 1
}
}
# CASE 2: Posterior probability of arms a & b is not updated
else {
draw_a <-
rbeta(1, arm_a_successes[i - m] + 1, arm_a_failures[i - m] + 1)
draw_b <-
rbeta(1, arm_b_successes[i - m] + 1, arm_b_failures[i - m] + 1)
}
}
# Update arm a & b history (abl), successes, and failures
if (draw_a > draw_b) {
if (runif(1) < pa) {
abl[i] <- 1
arm_a_s <- arm_a_s + 1 # update arm a success count
} else {
abl[i] <- 0
arm_a_f <- arm_a_f + 1 # update arm a failure count
}
} else{
if (runif(1) < pb) {
abl[i] <- 1
arm_b_s <- arm_b_s + 1 #update arm b success count
} else {
abl[i] <- 0
arm_b_f <- arm_b_f + 1 #update arm b failure count
}
}
# Tracking the build-up of total number of successes, failures, for arm a & b, adding onto successes and failures from burnin period
arm_a_successes[i] <- arm_a_s
arm_a_failures[i] <- arm_a_f
arm_b_successes[i] <- arm_b_s
arm_b_failures[i] <- arm_b_f
}
# List: complete history of number of times arms a & b were chosen (final index, total number of times chosen for each)
na <- arm_a_successes + arm_a_failures
nb <- arm_b_successes + arm_b_failures
# List: complete history of estimated posterior probability updates, for arm a & b success (final index, final posterior probability for each)
pa_est <- arm_a_successes / na
pb_est <- arm_b_successes / nb
AP <- nb/(na+nb)
# Wald Score for confidence interval: for analyzing categorical data underlying chi-squared distribution vectors, instead of summation
#WaldScore <- (pb_est - pa_est) / sqrt(pa_est * (1 - pa_est) / na + pb_est * (1 - pb_est) / nb)
p_hat <- (arm_a_successes + arm_b_successes)/(na + nb)
WaldScore <- (pb_est - pa_est) / sqrt(p_hat*(1-p_hat)*(1/na + 1/nb))
# List: our expected reward, from both arms (number of times chosen * posterior success probability, for each arm)
expected_reward <- (na[n] * pa + nb[n] * pb) / n
# Lists out the average expected reward for each individual trial (final index, final expected reward value)
actual_reward <- (arm_a_successes[n] + arm_b_successes[n])/n
#return(list(WaldScore = WaldScore[n], reward = reward[n], AP = AP[n], APT = APT))
#return(c(WaldScore[n], expected_reward, actual_reward, AP[n], APT))
return(list(arm_successes = c(arm_a_successes[n], arm_b_successes[n]),
arm_failures = c(arm_a_failures[n], arm_b_failures[n]),
APT = APT))
}Section 2: Finding Critical Value
Section 2.1 Finding Critical Values, Using Normal Assumption on Law of Large Numbers:
#' Critical Value (Normal)
#'
#' Finds critical value using normal assumption on LLN
#' @param APT - AP Test statistic (List of either APT_ban_UR/APT_w_UR results across different simulations)
#' @param alpha - Customized significance level
#'
#' @return generated critical value from normality assumption
normal_cv <- function(APT, alpha) {
m <- mean(APT)
v <- var(APT)
sd <- sqrt(v)
normal_rej <- qnorm(1 - alpha) #Rejection area defined (following normal distribution assumption)
cv <- normal_rej * sd + m
return(cv)
}Section 2.2 Finding Critical Values, Using Empirical Approach
#' Critical Value (Empirical)
#'
#' Finds critical value using empirical assumption on LLN
#' @param APT - AP Test statistic (List of either APT_ban_UR/APT_w_UR results across different simulations)
#' @param alpha - Customized significance level
#'
#' @return generated critical value from empirical simulations
empirical_cv <- function(APT, alpha) {
cdf = 0
i = 0
# Repeatedly add the empirical probabilities of each [i] value until cdf reaches 1 - significance level
while (cdf <= 1- alpha) {
p_i <-sum(APT == i)/length(APT)
cdf <- sum(cdf, p_i)
i = i + 1
}
return(i - 1)
}Section 2.3 Replicating and rerun the simulations for B replications
To achieve that, let us write a helper function to achieve on analyzing arm successes, arm failures, APT and priors to come out with test statistics:
# it takes in a multi-dimenional priors could be two-arm, three-arm, and so on
# it takes in the rejection alpha value, one tail.
# takes in the corresponding list of successes for each arm, for e.g. c(10,15) means 10 successes for arm 1 and 15 successes for arm 2
# takes in the corresponding list of failures for each arm, for e.g. c(20,35) means 20 failures for arm 1 and 35 failures for arm 2
# APT is calculated from the sampling result of the policy, following the CARA-FLGI method
analyzing_results <- function(alpha, priors, arm_successes, arm_failures, APT){
s <- length(priors) # calculate the dimension of the priors
n <- arm_successes + arm_failures # total number of samples in each arm
p_hat <- arm_successes/n # posterior probability estimated at the end of the trials
average_reward <- sum(arm_successes)/sum(n) # aaverage reward for the whole trial
expect_reward <- sum(priors*n)/sum(n) # expected reward given the prior success rate and the sample size for each arm
AP <- n[s]/sum(n) # the allocation probability for the last arm (experimental arm)
wald <- c()
for (i in 1: s-1){
wald[i] <- (p_hat[s] - p_hat[i])/sqrt(p_hat[s]*(1-p_hat[s])/(n[s]+0.01) + p_hat[i]*(1-p_hat[i])/(n[i]+0.01))
}
reject <- ifelse(all(as.vector(na.omit(wald)) > qnorm(1-alpha)), 1, 0) # return 1 if the null hypothesis can be rejected, 0 otherwise
return(c(reject, expect_reward, average_reward, AP, APT))
}analyzing_results(0.05, c(0.05,0.4,0.5), c(0,20,30), c(0,20,15), 25)[1] 0.0000000 0.4529412 0.5882353 0.5294118 25.0000000
library(tidyverse)
# One output for each simulation, the four dimensions are waldscore, average reward, APT_ban_UR, and APT_w_UR
# adding z biomarkers
simulation_n_times <-
function(alpha, # rejection alpha
policy, # which policy to use, UR, TS, UR_multiarm, etc.
effect_size, # the difference between p2 - p1
n, # total sample size
B, # number of replication
burnin, # period of UR we set for running
batch, # number of samples in each batch before updates
multi_arm = FALSE, # number of arms in the treatment, if two arm by default it is FALSE
z = 1, # number of biomarker grops in tota, by default this is 1 (i.e. single group biomarker)
priors = rep(0.5, 3), # set a default priors for multi-arm case for prevention of error
...) { # ... as in some of the parameters in other policies and multi_arm
if (multi_arm == FALSE) {
pa <- 0.4 # define the corresponding priors of the success rate centred and scaled at p1+p2 = 1
pb <- 0.4 + effect_size
results <- array(dim = c(B, 5)) # output a five dimensional arrage
# number of simulations
K <- n/batch # this is the number of blocks in the session
for (i in 1:B) {
arm_successes <- rep(0,2) # for temporary storage of each run for all biomarkers
arm_failures <- rep(0,2)
APT <- 0
repeat {n_z <- rmultinom(1, size = K, prob = rep(1, z) / z); if (all(n_z > 0)) break}
for (m in 1:z){
# n_z <- rmultinom(1, size = K, prob = rep(1, z) / z)
step_result <- policy(
pa = pa,
pb = pb,
n = n_z[m]*batch,
burnin = burnin,
batch = batch,
...
)
arm_successes <- arm_successes + step_result$arm_successes
arm_failures <- arm_failures + step_result$arm_failures
APT <- APT + step_result$APT
}
results[i, ] <- analyzing_results(alpha, c(pa,pb), arm_successes, arm_failures, APT)
}
average_power = mean(as.vector(na.omit(results[, 1]))) # find average power
expected_reward = mean(results[, 2]) # expected reward setting
average_reward = mean(results[, 3]) # calculate the actual real reward throughout many trials
average_AP = mean(results[, 4]) # find average allocation probability for the experimental arm
APT = results[, 5]
# # Empirical and normal CV of APT
# em_cv <- empirical_cv(APT, alpha)
# norm_cv <- normal_cv(APT, alpha)
# F1 <- ecdf(APT)
# # Empirical and normal FPR of APT is
# em_FPR <- 1 - F1(em_cv)
# norm_FPR <- 1 - F1(floor(norm_cv))
return(
list(
average_power = average_power,
expected_reward = expected_reward,
average_reward = average_reward,
average_AP = average_AP,
APT = APT
)
)
}
if (class(multi_arm) == "numeric"){
K <- n/batch # this is the number of blocks in the session
results <- array(dim = c(B, 5))
for (i in 1:B) {
arm_successes <- rep(0,length(priors))# for temporary storage of each run for all biomarkers
arm_failures <- rep(0,length(priors))
APT <- 0
repeat {n_z <- rmultinom(1, size = K, prob = rep(1, z) / z); if (all(n_z > 0)) break}
for (m in 1:z) {
#n_z <- rmultinom(1, size = K, prob = rep(1, z) / z)
# at this point of time there is no other policies like TS_contextual or TS_epsilon
step_result <- policy(
priors = priors,
n = n_z[m]*batch,
burnin = burnin,
batch = batch
)
arm_successes <- arm_successes + step_result$arm_successes
arm_failures <- arm_failures + step_result$arm_failures
APT <- APT + step_result$APT
}
results[i, ] <- analyzing_results(alpha, priors, arm_successes, arm_failures, APT)
}
average_power = mean(as.vector(na.omit(results[, 1]))) # Find average power based on waldscore
expected_reward = mean(results[, 2]) # expected reward setting
average_reward = mean(results[, 3]) # calculate the actual real reward throughout many trials
average_AP = mean(results[, 4]) # find average allocation probability for the experimental arm
APT = results[, 5]
# # Empirical and normal CV of APT
# em_cv <- empirical_cv(APT, alpha)
# norm_cv <- normal_cv(APT, alpha)
# F1 <- ecdf(APT)
# # Empirical and normal FPR of APT is
# em_FPR <- 1 - F1(em_cv)
# norm_FPR <- 1 - F1(floor(norm_cv))
return(
list(
average_power = average_power,
expected_reward = expected_reward,
average_reward = average_reward,
average_AP = average_AP,
APT = APT
)
)
}
else{
print('Error: Please double check your multi_arm input is numeric or FALSE')
}
}simulation_n_times(alpha = 0.05,
policy = TS,
effect_size = 0.4,
n = 20,
B = 100,
burnin = 1,
batch = 2,
z = 2,
multi_arm = FALSE)$average_power
[1] 0.59
$expected_reward
[1] 0.681
$average_reward
[1] 0.678
$average_AP
[1] 0.7025
$APT
[1] 5 7 5 9 7 9 8 3 8 9 5 6 6 7 4 7 3 8 8 8 8 8 5 8 7
[26] 7 6 6 8 9 6 10 8 7 10 8 7 8 9 10 9 4 8 7 8 2 8 7 8 9
[51] 8 9 8 7 10 9 7 10 8 8 5 9 7 6 8 8 8 10 3 9 8 2 6 5 7
[76] 10 8 8 10 10 8 7 4 8 9 6 8 8 5 8 8 7 9 7 9 5 8 6 9 9
Section 2.4: Construction of the reporting table:
Section 2.4.1: Comparing at a specific effect size and sample size:
# it takes in the same parameter as in before and output a comparison dataframe across different policies.
compare_policies <-
function(alpha,
effect_size,
n,
B,
burnin,
batch,
z = 1,
multi_arm = 3,
priors = c(0.4,0.4,0.4), # the last one is the experimental arm, it will be changing with time
epsilon = 0.1,
c = 0.1) {
priors[multi_arm] <- priors[multi_arm] + effect_size
UR_result <-
simulation_n_times(alpha, UR, effect_size, n, B, burnin, batch, multi_arm = FALSE, z, priors = priors)
UR_multi_arm_result <-
simulation_n_times(alpha, UR_multiarm, effect_size, n, B, burnin, batch, multi_arm, z, priors = priors)
TS_result <-
simulation_n_times(alpha, TS, effect_size, n, B, burnin, batch, multi_arm = FALSE, z, priors = priors)
TS_multi_arm_result <-
simulation_n_times(alpha, TS_multiarm, effect_size, n, B, burnin, batch, multi_arm, z, priors = priors)
TS_epsilon_result <-
simulation_n_times(alpha, TS_epsilon, effect_size, n, B, burnin, batch, multi_arm = FALSE, z, priors = priors, epsilon)
TSPDD_result <-
simulation_n_times(alpha, TSPDD, effect_size, n, B, burnin, batch, multi_arm = FALSE, z, priors = priors, c)
# Create a summary table
summary_table <- data.frame(
Metric = c(
"Average Power",
"Expected Reward",
"Average Reward",
"Average AP",
"Empirical CV",
"Normal CV",
"Empirical FPR",
"Normal FPR"
),
UR = c(
UR_result$average_power,
UR_result$expected_reward,
UR_result$average_reward,
UR_result$average_AP,
UR_result$em_cv,
UR_result$norm_cv,
UR_result$em_FPR,
UR_result$norm_FPR
),
UR_multiarm = c(
UR_multi_arm_result$average_power,
UR_multi_arm_result$expected_reward,
UR_multi_arm_result$average_reward,
UR_multi_arm_result$average_AP,
UR_multi_arm_result$em_cv,
UR_multi_arm_result$norm_cv,
UR_multi_arm_result$em_FPR,
UR_multi_arm_result$norm_FPR
),
TS = c(
TS_result$average_power,
TS_result$expected_reward,
TS_result$average_reward,
TS_result$average_AP,
TS_result$em_cv,
TS_result$norm_cv,
TS_result$em_FPR,
TS_result$norm_FPR
),
TS_multiarm = c(
TS_multi_arm_result$average_power,
TS_multi_arm_result$expected_reward,
TS_multi_arm_result$average_reward,
TS_multi_arm_result$average_AP,
TS_multi_arm_result$em_cv,
TS_multi_arm_result$norm_cv,
TS_multi_arm_result$em_FPR,
TS_multi_arm_result$norm_FPR
),
TS_epsilon = c(
TS_epsilon_result$average_power,
TS_epsilon_result$expected_reward,
TS_epsilon_result$average_reward,
TS_epsilon_result$average_AP,
TS_epsilon_result$em_cv,
TS_epsilon_result$norm_cv,
TS_epsilon_result$em_FPR,
TS_epsilon_result$norm_FPR
),
TSPDD = c(
TSPDD_result$average_power,
TSPDD_result$expected_reward,
TSPDD_result$average_reward,
TSPDD_result$average_AP,
TSPDD_result$em_cv,
TSPDD_result$norm_cv,
TSPDD_result$em_FPR,
TSPDD_result$norm_FPR
)
)
return(summary_table)
}compare_policies(alpha = 0.05,
effect_size = 0.4,
n = 100,
B = 1000,
burnin = 1,
batch = 2,
z = 1,
multi_arm = 3,
priors = c(0.05,0.4, 0.4)) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.9960 0.970000 0.864000 0.872000 0.907000 0.945000
2 Expected Reward 0.6012 0.415798 0.769108 0.745857 0.754976 0.756652
3 Average Reward 0.5984 0.412610 0.767610 0.746660 0.753950 0.758540
4 Average AP 0.5030 0.332550 0.922770 0.892730 0.887440 0.891630
5 Empirical CV 0.9960 0.970000 0.864000 0.872000 0.907000 0.945000
6 Normal CV 0.6012 0.415798 0.769108 0.745857 0.754976 0.756652
7 Empirical FPR 0.5984 0.412610 0.767610 0.746660 0.753950 0.758540
8 Normal FPR 0.5030 0.332550 0.922770 0.892730 0.887440 0.891630
Section 2.4.2: Comparing at a range of effect size and sample size:
# notice that here the input of the effect_sizes is a list, all the others remain the same, this is a helper function that I created for easy comparison
compare_effect_sizes <-
function(alpha,
effect_sizes,
n,
B,
burnin,
batch,
z = 1,
multi_arm = 3, # setting default for 3-arm cases comparison
priors = c(0.4,0.4,0.4), # the last one is the experimental arm, it will be changing with time
epsilon = 0.1, # setting default
c = 0.1) {
UR_avg_reward <- c()
UR_multiarm_avg_reward <-c()
TS_avg_reward <- c()
TS_multiarm_avg_reward <-c()
TS_epsilon_avg_reward <- c()
TSPDD_avg_reward <- c()
UR_power <- c()
UR_multiarm_power <- c()
TS_power <- c()
TS_multiarm_power <- c()
TS_epsilon_power <- c()
TSPDD_power <- c()
# Assuming allocation probability is represented by Average AP
UR_alloc_prob <- c()
UR_multiarm_alloc_prob <- c()
TS_alloc_prob <- c()
TS_multiarm_alloc_prob <- c()
TS_epsilon_alloc_prob <- c()
TSPDD_alloc_prob <- c()
for (effect_size in effect_sizes){
result <- compare_policies(alpha = alpha,
effect_size = effect_size,
n = n,
B = B,
burnin = burnin,
batch = batch,
z = z,
multi_arm = multi_arm,
priors = priors,
epsilon = epsilon,
c = c)
# avg reward construction
UR_avg_reward <- c(UR_avg_reward, sprintf("%.2f", result$UR[3]))
UR_multiarm_avg_reward <- c(UR_multiarm_avg_reward, sprintf("%.2f", result$UR_multiarm[3]) )
TS_avg_reward <- c(TS_avg_reward, sprintf("%.2f", result$TS[3]))
TS_multiarm_avg_reward <- c(TS_multiarm_avg_reward, sprintf("%.2f", result$TS_multiarm[3]))
TS_epsilon_avg_reward <- c(TS_epsilon_avg_reward, sprintf("%.2f", result$TS_epsilon[3]))
TSPDD_avg_reward <- c(TSPDD_avg_reward, sprintf("%.2f", result$TSPDD[3]))
# avg power construction
UR_power <- c(UR_power, sprintf("%.2f", result$UR[1]))
UR_multiarm_power <- c(UR_multiarm_power, sprintf("%.2f", result$UR_multiarm[1]))
TS_power <- c(TS_power, sprintf("%.2f", result$TS[1]))
TS_multiarm_power <- c(TS_multiarm_power, sprintf("%.2f", result$TS_multiarm[1]))
TS_epsilon_power <- c(TS_epsilon_power, sprintf("%.2f", result$TS_epsilon[1]))
TSPDD_power <- c(TSPDD_power, sprintf("%.2f", result$TSPDD[1]))
# avg allocation probability construction
UR_alloc_prob <- c(UR_alloc_prob, sprintf("%.2f", result$UR[4]))
UR_multiarm_alloc_prob <- c(UR_multiarm_alloc_prob, sprintf("%.2f", result$UR_multiarm[4]))
TS_alloc_prob <- c(TS_alloc_prob, sprintf("%.2f", result$TS[4]))
TS_multiarm_alloc_prob <- c(TS_multiarm_alloc_prob, sprintf("%.2f", result$TS_multiarm[4]))
TS_epsilon_alloc_prob <- c(TS_epsilon_alloc_prob, sprintf("%.2f", result$TS_epsilon[4]))
TSPDD_alloc_prob <- c(TSPDD_alloc_prob, sprintf("%.2f", result$TSPDD[4]))
}
final_summary_table <- data.frame(
Metric = c("Average Reward",
"Power",
"Allocation Probability"),
UR = c(
paste(UR_avg_reward, collapse = " | "),
paste(UR_power, collapse = " | "),
paste(UR_alloc_prob, collapse = " | ")
),
UR_multiarm = c(
paste(UR_multiarm_avg_reward, collapse = ' | '),
paste(UR_multiarm_power, collapse = ' | '),
paste(UR_multiarm_alloc_prob, collapse = ' | ')
),
TS = c(
paste(TS_avg_reward, collapse = " | "),
paste(TS_power, collapse = " | "),
paste(TS_alloc_prob, collapse = " | ")
),
TS_multiarm = c(
paste(TS_multiarm_avg_reward, collapse = ' | '),
paste(TS_multiarm_power, collapse = ' | '),
paste(TS_multiarm_alloc_prob, collapse = ' | ')
),
TS_epsilon = c(
paste(TS_epsilon_avg_reward, collapse = " | "),
paste(TS_epsilon_power, collapse = " | "),
paste(TS_epsilon_alloc_prob, collapse = " | ")
),
TSPDD = c(
paste(TSPDD_avg_reward, collapse = " | "),
paste(TSPDD_power, collapse = " | "),
paste(TSPDD_alloc_prob, collapse = " | ")
))
return(final_summary_table)
}#demo
compare_effect_sizes(alpha = 0.05,
effect_sizes = c(0.1,0.2,0.3),
n = 100,
B = 100,
burnin = 1,
batch = 2,
z = 2,
multi_arm = 3,
priors = c(0.05,0.5, 0.5)) Metric UR UR_multiarm
1 Average Reward 0.45 | 0.51 | 0.54 0.39 | 0.42 | 0.46
2 Power 0.22 | 0.62 | 0.90 0.22 | 0.65 | 0.87
3 Allocation Probability 0.50 | 0.50 | 0.50 0.33 | 0.34 | 0.34
TS TS_multiarm TS_epsilon TSPDD
1 0.46 | 0.54 | 0.65 0.52 | 0.62 | 0.71 0.46 | 0.54 | 0.63 0.46 | 0.55 | 0.64
2 0.29 | 0.64 | 0.87 0.23 | 0.53 | 0.85 0.31 | 0.56 | 0.88 0.22 | 0.72 | 0.89
3 0.65 | 0.73 | 0.82 0.57 | 0.70 | 0.78 0.60 | 0.71 | 0.77 0.57 | 0.71 | 0.78
Section 3: Simulations:
Section 3.1: Global Settings
In this section we will provide some unified parameters across different simulations:
# let's make this a function
set.seed(0) # for results to be replicable
#calculate power and reward in case p0 < p1
alpha <- 0.05 # rejection region
effect_size <- 0 # by default effect size is 0: no difference between arms | effect size is 0.05-0.1: Intervention is of relatively small or marginal effect | effect size is 0.1-0.2: Medium or smal effect | effect size is 0.2-0.3+: relatively large effectt between interventions
n <- 100 # sample_size(total number of trials in one simulation)
burnin <- n* 0.1 # make by default the burin size is 10% of sample size
z = 1 # by default 1 biomarker group
multi_arm <- 3 #by default 3-arm case
priors <- c(0.05, 0.4, 0.4)
B <- 1000 # number of simulations
batch <- 2 # batch_sizeSection 3.2: Sample size n=50
n <- 50 # notice that this will change the gloabl setting of parametersSection 3.2.1: Effect size = 0.0
compare_policies(alpha,effect_size = 0.0, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.06300 0.066000 0.10000 0.104000 0.09500 0.08300
2 Expected Reward 0.40000 0.283772 0.40000 0.356285 0.40000 0.40000
3 Average Reward 0.39896 0.281300 0.39730 0.352080 0.40126 0.40254
4 Average AP 0.50342 0.334260 0.48912 0.446060 0.50634 0.49972
5 Empirical CV 0.06300 0.066000 0.10000 0.104000 0.09500 0.08300
6 Normal CV 0.40000 0.283772 0.40000 0.356285 0.40000 0.40000
7 Empirical FPR 0.39896 0.281300 0.39730 0.352080 0.40126 0.40254
8 Normal FPR 0.50342 0.334260 0.48912 0.446060 0.50634 0.49972
Section 3.2.2 Effect size = 0.05
compare_policies(alpha,effect_size = 0.05, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.110000 0.093000 0.160000 0.169000 0.145000 0.141000
2 Expected Reward 0.425101 0.299585 0.427838 0.383113 0.427641 0.427121
3 Average Reward 0.428580 0.303360 0.429720 0.378920 0.426480 0.426720
4 Average AP 0.502020 0.330680 0.556760 0.484480 0.552820 0.542420
5 Empirical CV 0.110000 0.093000 0.160000 0.169000 0.145000 0.141000
6 Normal CV 0.425101 0.299585 0.427838 0.383113 0.427641 0.427121
7 Empirical FPR 0.428580 0.303360 0.429720 0.378920 0.426480 0.426720
8 Normal FPR 0.502020 0.330680 0.556760 0.484480 0.552820 0.542420
Section 3.2.3: Effect size = 0.1
compare_policies(alpha,effect_size = 0.1, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.170000 0.152000 0.23300 0.253000 0.228000 0.219000
2 Expected Reward 0.450086 0.315956 0.46155 0.414755 0.459512 0.459426
3 Average Reward 0.449760 0.314640 0.46034 0.418940 0.460560 0.459480
4 Average AP 0.500860 0.333460 0.61550 0.536540 0.595120 0.594260
5 Empirical CV 0.170000 0.152000 0.23300 0.253000 0.228000 0.219000
6 Normal CV 0.450086 0.315956 0.46155 0.414755 0.459512 0.459426
7 Empirical FPR 0.449760 0.314640 0.46034 0.418940 0.460560 0.459480
8 Normal FPR 0.500860 0.333460 0.61550 0.536540 0.595120 0.594260
Section 3.2.4: Effect size = 0.2
compare_policies(alpha,effect_size = 0.2, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.438000 0.335000 0.409000 0.383000 0.432000 0.413000
2 Expected Reward 0.499472 0.350076 0.542552 0.494691 0.538416 0.533756
3 Average Reward 0.501500 0.352600 0.540060 0.497180 0.539140 0.535860
4 Average AP 0.497360 0.335160 0.712760 0.646460 0.692080 0.668780
5 Empirical CV 0.438000 0.335000 0.409000 0.383000 0.432000 0.413000
6 Normal CV 0.499472 0.350076 0.542552 0.494691 0.538416 0.533756
7 Empirical FPR 0.501500 0.352600 0.540060 0.497180 0.539140 0.535860
8 Normal FPR 0.497360 0.335160 0.712760 0.646460 0.692080 0.668780
Section 3.2.5: Effect size = 0.3
compare_policies(alpha,effect_size = 0.3, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.737000 0.573000 0.62900 0.576000 0.623000 0.675000
2 Expected Reward 0.549076 0.383918 0.63799 0.582702 0.628222 0.626014
3 Average Reward 0.546540 0.387000 0.63740 0.580780 0.631020 0.626340
4 Average AP 0.496920 0.334240 0.79330 0.713120 0.760740 0.753380
5 Empirical CV 0.737000 0.573000 0.62900 0.576000 0.623000 0.675000
6 Normal CV 0.549076 0.383918 0.63799 0.582702 0.628222 0.626014
7 Empirical FPR 0.546540 0.387000 0.63740 0.580780 0.631020 0.626340
8 Normal FPR 0.496920 0.334240 0.79330 0.713120 0.760740 0.753380
Section 3.2.6: Effect size = 0.4
compare_policies(alpha,effect_size = 0.4, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.897000 0.813000 0.765000 0.746000 0.80500 0.840000
2 Expected Reward 0.599592 0.414258 0.734664 0.686165 0.72092 0.724488
3 Average Reward 0.601540 0.415600 0.731200 0.682920 0.72012 0.723600
4 Average AP 0.498980 0.330940 0.836660 0.784940 0.80230 0.811220
5 Empirical CV 0.897000 0.813000 0.765000 0.746000 0.80500 0.840000
6 Normal CV 0.599592 0.414258 0.734664 0.686165 0.72092 0.724488
7 Empirical FPR 0.601540 0.415600 0.731200 0.682920 0.72012 0.723600
8 Normal FPR 0.498980 0.330940 0.836660 0.784940 0.80230 0.811220
Section 3.2.7: Effect size = 0.5
compare_policies(alpha,effect_size = 0.5, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1) Metric UR UR_multiarm TS TS_multiarm TS_epsilon TSPDD
1 Average Power 0.99200 0.949000 0.85700 0.828000 0.91000 0.94400
2 Expected Reward 0.65147 0.451748 0.83621 0.784423 0.81852 0.82623
3 Average Reward 0.65392 0.451200 0.83542 0.784460 0.81884 0.82498
4 Average AP 0.50294 0.336120 0.87242 0.819960 0.83704 0.85246
5 Empirical CV 0.99200 0.949000 0.85700 0.828000 0.91000 0.94400
6 Normal CV 0.65147 0.451748 0.83621 0.784423 0.81852 0.82623
7 Empirical FPR 0.65392 0.451200 0.83542 0.784460 0.81884 0.82498
8 Normal FPR 0.50294 0.336120 0.87242 0.819960 0.83704 0.85246
Section 3.2.8: Summary Table for sample size = 50, effect_size = 0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, priors(0.05, 0.4, 0.4+ effect_size) for 3-arms
set.seed(1)
n<-50
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_50 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)Section 3.2.9: Summary Table for sample size = 100, effect_size = 0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,etc. priors(0.05, 0.4, 0.4+ effect_size) for 3-arms
set.seed(1)
n <- 100
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_100 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)Section 3.2.10: Summary Table for sample size = 200, effect_size = 0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,etc. priors(0.05, 0.4, 0.4+ effect_size) for 3-arms
set.seed(1)
n <- 200
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_200 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)Section 3.2.11: Summary Table for sample size = 300, effect_size = 0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,etc. priors(0.05, 0.4, 0.4+ effect_size) for 3-arms
set.seed(1)
n <- 300
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_300 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)Section 3.2.12: Summary Table for sample size = 20, effect_size = 0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,etc. priors(0.05, 0.4, 0.4+ effect_size) for 3-arms
set.seed(1)
n <- 20
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_20 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)Section 3.2.13: Summary Table for sample size = 500, effect_size = 0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,etc. priors(0.05, 0.4, 0.4+ effect_size) for 3-arms
set.seed(1)
n <- 500
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_500 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)Section 3.3: joining the table in a long format
# Step 1: renaming the row names with corresponding sample size
rownames(n_20) <- paste(rownames(n_20), "(n=20)")
rownames(n_50) <- paste(rownames(n_50), "(n=50)")
rownames(n_100) <- paste(rownames(n_100), "(n=100)")
rownames(n_200) <- paste(rownames(n_200), "(n=200)")
rownames(n_300) <- paste(rownames(n_300), "(n=300)")
rownames(n_500) <- paste(rownames(n_500), "(n=500)")library(stringr) # need to install this pacakage if needed
# Step 2: Concatenate these tables vertically
merged_table <- rbind(n_20,n_50, n_100, n_200, n_300, n_500)
# Separate metrics and sample sizes from the row names
metrics <- gsub("\\(.*", "", rownames(merged_table))
sample_sizes <- gsub(".*\\(", "", gsub("\\).*", "", rownames(merged_table)))
# Create a data.frame to hold the metrics, sample sizes, and row indices
ordering_df <- data.frame(
Metric = metrics,
Sample_Size = sample_sizes,
Row_Index = 1:nrow(merged_table))
ordering_df$Sample_Size <- as.numeric(str_extract(sample_sizes, "\\d+"))
# Order the data.frame first by Metric, then by Sample_Size
ordering_df <- ordering_df[order(ordering_df$Metric, ordering_df$Sample_Size), ]
# Rearrange the rows of merged_table based on the ordered row indices
merged_table <- merged_table[ordering_df$Row_Index, ]Section 3.4: Plotting the trend plot with respect to effect size for each metric
Section 3.4.0 Defining the reward graphing tool:
library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
#' graphing_tool - graphical analysis on different metrics like reward, power, or allocation probabilities
#'
#' Takes the below input
#' @param merged_table - merged tables on above sample sizes
#' @metric the metric we want to see grpahical interpretation
#' @policies the polcies we want to investigate on
#' @return the graph of the trend for selected policis
graphing_tool <- function(merged_table,
metric,
policies = c()) {
# Step 1: Filter the merged_table
filtered_data <- merged_table |> filter(Metric == metric)
# Step 2.1 Creating a helper function for splitting the data:
split_and_convert <- function(value) {
as.numeric(str_split(value, " \\| ")[[1]])
}
# Step 2.2: Split the values in the Value column into a numeric vector
metric_values_UR <- c(sapply(filtered_data$UR, split_and_convert))
metric_values_UR_multiarm <- c(sapply(filtered_data$UR_multiarm, split_and_convert))
metric_values_TS <- c(sapply(filtered_data$TS, split_and_convert))
metric_values_TS_multiarm <- c(sapply(filtered_data$TS_multiarm, split_and_convert))
metric_values_TS_epsilon <- c(sapply(filtered_data$TS_epsilon, split_and_convert))
metric_values_TSPDD <- c(sapply(filtered_data$TSPDD, split_and_convert))
# Step 3: Create a data frame for plotting
plot_data <- data.frame(
expand_grid(
Policy = c("UR", "UR_multiarm", "TS", "TS_multiarm", "TS_epsilon", "TSPDD"),
Sample_size = c(20, 50, 100, 200, 300, 500),
Effect_Size = effect_sizes
) %>% bind_cols(
Metric = c(
metric_values_UR,
metric_values_UR_multiarm,
metric_values_TS,
metric_values_TS_multiarm,
metric_values_TS_epsilon,
metric_values_TSPDD
)
)
)
# Step 4: Use ggplot2 to plot the trend
p <- ggplot(plot_data %>% filter(Policy %in% policies),
aes(x = Effect_Size, y = Metric, color = Policy)) +
geom_point(size = 0.5, alpha = 0.7) + geom_line(alpha = 0.7) +
facet_wrap( ~ Sample_size) +
labs(title = paste("Trend of ", metric ," across Effect Sizes and Sample Sizes"),
x = "Effect Size",
y = metric) +
theme_minimal()
return(p)
}Section 3.4.1 For Reward: Prior (0.05, 0.4, 0.4+effect size)
graphing_tool(merged_table,
'Average Reward',
c('UR_multiarm', "TS_multiarm"))Section 3.4.2 For Power: Prior (0.05, 0.4, 0.4+effect size)
graphing_tool(merged_table,
'Power',
c('UR_multiarm', "TS_multiarm"))# change it from 0.5, 0.5 +effect size
# the signs of the wald z test might be lower as well
# look into how to calculate the proportion or sample proportion to calculate the error bar for the power (do a bit of literature review )Section 3.5.1: For Reward: Prior (0.4, 0.4, 0.4+effect size)
priors <- c(0.4,0.4,0.4)
n<-50
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_50 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
n <- 100
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_100 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
n <- 200
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_200 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
n <- 300
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_300 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
n <- 20
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_20 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
n <- 500
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_500 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
rownames(n_20) <- paste(rownames(n_20), "(n=20)")
rownames(n_50) <- paste(rownames(n_50), "(n=50)")
rownames(n_100) <- paste(rownames(n_100), "(n=100)")
rownames(n_200) <- paste(rownames(n_200), "(n=200)")
rownames(n_300) <- paste(rownames(n_300), "(n=300)")
rownames(n_500) <- paste(rownames(n_500), "(n=500)")
merged_table <- rbind(n_20,n_50, n_100, n_200, n_300, n_500)
metrics <- gsub("\\(.*", "", rownames(merged_table))
sample_sizes <- gsub(".*\\(", "", gsub("\\).*", "", rownames(merged_table)))
# Create a data.frame to hold the metrics, sample sizes, and row indices
ordering_df <- data.frame(
Metric = metrics,
Sample_Size = sample_sizes,
Row_Index = 1:nrow(merged_table))
ordering_df$Sample_Size <- as.numeric(str_extract(sample_sizes, "\\d+"))
# Order the data.frame first by Metric, then by Sample_Size
ordering_df <- ordering_df[order(ordering_df$Metric, ordering_df$Sample_Size), ]
# Rearrange the rows of merged_table based on the ordered row indices
merged_table <- merged_table[ordering_df$Row_Index, ]graphing_tool(merged_table,
'Average Reward',
c('UR', "TS", "TS_epsilon", "TSPDD"))graphing_tool(merged_table,
'Power',
c('UR', "TS", "TS_epsilon", "TSPDD"))Section 4: Comparison of AP test statistics
Section 4.1 Comparing at z = 1,2 with 0 effect size, 2-arm:
Section 4.1.1 20 samples with batch_size = 2
set.seed(1) # for reproducibility
n <- 20
result_1 <- simulation_n_times(alpha = 0.05,
policy = TS,
effect_size = 0,
n = n,
B = 1000,
burnin = 1,
batch = 2)
AP_z1 <- result_1$APT
result_2 <- simulation_n_times(alpha = 0.05,
policy = TS,
effect_size = 0.2,
n = n,
B = 1000,
burnin = 1,
batch = 2)
AP_z2 <- result_2$APT
result_3 <- simulation_n_times(alpha = 0.05,
policy = TS,
effect_size = 0.3,
n = n,
B = 1000,
burnin = 1,
batch = 2)
AP_z3 <- result_3$APT
# Load the necessary library
library(ggplot2)
# Combine the data
AP_data <- data.frame(
value = c(AP_z1, AP_z2, AP_z3),
group = factor(rep(c("p1 = p2 = 0.5", "p1 = 0.5, p2 = 0.7", "p1 = 0.5, p2 = 0.8"), times = c(length(AP_z1), length(AP_z2), length(AP_z3))))
)
AP_data$group <- factor(AP_data$group, levels = c("p1 = p2 = 0.5", "p1 = 0.5, p2 = 0.7", "p1 = 0.5, p2 = 0.8"))
AP_data |>
ggplot(aes(x = value), fill = group) +
geom_histogram(binwidth = 1, aes(y = ..density..),color = "black", alpha = 0.5, position = 'identity')+
facet_wrap(~group)+
theme_bw()Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.
Finding FPR and Power:
cv <- empirical_cv(AP_z1, 0.05) - 2
FPR <- 1 - ecdf(AP_z1)(cv)
Power_02 <- 1 - ecdf(AP_z2)(cv)
Power_03 <- 1 - ecdf(AP_z3)(cv)
paste('reward over time for each effect size 0, 0.2, 0.3 is',
result_1$average_reward,
result_2$average_reward,
result_3$average_reward)[1] "reward over time for each effect size 0, 0.2, 0.3 is 0.405 0.52255 0.6183"
paste('Correct Allocation time for each effect size 0, 0.2, 0.3 is',
result_1$average_AP,
result_2$average_AP,
result_3$average_AP)[1] "Correct Allocation time for each effect size 0, 0.2, 0.3 is 0.5037 0.6366 0.70755"
paste('Average Wald Power for each effect size 0, 0.2, 0.3 is',
result_1$average_power,
result_2$average_power,
result_3$average_power)[1] "Average Wald Power for each effect size 0, 0.2, 0.3 is 0.154 0.313 0.44"
paste('Empirical CV is at', cv)[1] "Empirical CV is at 7"
paste('Empirical FPR is at', FPR)[1] "Empirical FPR is at 0.201"
paste('Empirical Power with 0.2 effect size is', Power_02)[1] "Empirical Power with 0.2 effect size is 0.407"
paste('Empirical Power with 0.3 effect size is', Power_03)[1] "Empirical Power with 0.3 effect size is 0.531"
=Section 4.1.1 100 samples with batch_size = 2
set.seed(1) # for reproducibility
n <- 100
result_1 <- simulation_n_times(alpha = 0.05,
policy = TS,
effect_size = 0,
n = n,
B = 1000,
burnin = 1,
batch = 2)
AP_z1 <- result_1$APT
result_2 <- simulation_n_times(alpha = 0.05,
policy = TS,
effect_size = 0.2,
n = n,
B = 1000,
burnin = 1,
batch = 2)
AP_z2 <- result_2$APT
result_3 <- simulation_n_times(alpha = 0.05,
policy = TS,
effect_size = 0.3,
n = n,
B = 1000,
burnin = 1,
batch = 2)
AP_z3 <- result_3$APT
# Load the necessary library
library(ggplot2)
# Combine the data
AP_data <- data.frame(
value = c(AP_z1, AP_z2, AP_z3),
group = factor(rep(c("p1 = p2 = 0.5", "p1 = 0.5, p2 = 0.7", "p1 = 0.5, p2 = 0.8"), times = c(length(AP_z1), length(AP_z2), length(AP_z3))))
)
AP_data$group <- factor(AP_data$group, levels = c("p1 = p2 = 0.5", "p1 = 0.5, p2 = 0.7", "p1 = 0.5, p2 = 0.8"))
AP_data |>
ggplot(aes(x = value), fill = group) +
geom_histogram(binwidth = 1, aes(y = ..density..),color = "black", alpha = 0.5, position = 'identity')+
facet_wrap(~group)+
theme_bw()cv <- empirical_cv(AP_z1, 0.05) -4
FPR <- 1 - ecdf(AP_z1)(cv)
Power_02 <- 1 - ecdf(AP_z2)(cv)
Power_03 <- 1 - ecdf(AP_z3)(cv)
paste('reward over time for each effect size 0, 0.2, 0.3 is',
result_1$average_reward,
result_2$average_reward,
result_3$average_reward)[1] "reward over time for each effect size 0, 0.2, 0.3 is 0.40017 0.56067 0.66447"
paste('Correct Allocation time for each effect size 0, 0.2, 0.3 is',
result_1$average_AP,
result_2$average_AP,
result_3$average_AP)[1] "Correct Allocation time for each effect size 0, 0.2, 0.3 is 0.49893 0.79324 0.87275"
paste('Average Wald Power for each effect size 0, 0.2, 0.3 is',
result_1$average_power,
result_2$average_power,
result_3$average_power)[1] "Average Wald Power for each effect size 0, 0.2, 0.3 is 0.12 0.565 0.744"
paste('Empirical CV is at', cv)[1] "Empirical CV is at 41"
paste('Empirical FPR is at', FPR)[1] "Empirical FPR is at 0.133"
paste('Empirical Power with 0.2 effect size is', Power_02)[1] "Empirical Power with 0.2 effect size is 0.536"
paste('Empirical Power with 0.3 effect size is', Power_03)[1] "Empirical Power with 0.3 effect size is 0.765"
Comparisons across different sample sizes and different policies:
Comparison of Reward
set.seed(1)
priors <- c(0.4,0.4,0.4)
n <- 20
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_20 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
n<-50
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_50 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
n <- 100
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
n_100 <- compare_effect_sizes(alpha,effect_sizes = effect_sizes, n, B,burnin,batch,z, multi_arm, priors, epsilon=0.1,c=0.1)
rownames(n_20) <- paste(rownames(n_20), "(n=20)")
rownames(n_50) <- paste(rownames(n_50), "(n=50)")
rownames(n_100) <- paste(rownames(n_100), "(n=100)")
merged_table <- rbind(n_20,n_50, n_100)
metrics <- gsub("\\(.*", "", rownames(merged_table))
sample_sizes <- gsub(".*\\(", "", gsub("\\).*", "", rownames(merged_table)))
# Create a data.frame to hold the metrics, sample sizes, and row indices
ordering_df <- data.frame(
Metric = metrics,
Sample_Size = sample_sizes,
Row_Index = 1:nrow(merged_table))
ordering_df$Sample_Size <- as.numeric(str_extract(sample_sizes, "\\d+"))
# Order the data.frame first by Metric, then by Sample_Size
ordering_df <- ordering_df[order(ordering_df$Metric, ordering_df$Sample_Size), ]
# Rearrange the rows of merged_table based on the ordered row indices
merged_table <- merged_table[ordering_df$Row_Index, ]graphing_tool <- function(merged_table,
metric,
policies = c()) {
# Step 1: Filter the merged_table
filtered_data <- merged_table |> filter(Metric == metric)
# Step 2.1 Creating a helper function for splitting the data:
split_and_convert <- function(value) {
as.numeric(str_split(value, " \\| ")[[1]])
}
# Step 2.2: Split the values in the Value column into a numeric vector
metric_values_UR <- c(sapply(filtered_data$UR, split_and_convert))
metric_values_UR_multiarm <- c(sapply(filtered_data$UR_multiarm, split_and_convert))
metric_values_TS <- c(sapply(filtered_data$TS, split_and_convert))
metric_values_TS_multiarm <- c(sapply(filtered_data$TS_multiarm, split_and_convert))
metric_values_TS_epsilon <- c(sapply(filtered_data$TS_epsilon, split_and_convert))
metric_values_TSPDD <- c(sapply(filtered_data$TSPDD, split_and_convert))
# Step 3: Create a data frame for plotting
plot_data <- data.frame(
expand_grid(
Policy = c("UR", "UR_multiarm", "TS", "TS_multiarm", "TS_epsilon", "TSPDD"),
Sample_size = c(20, 50, 100),
Effect_Size = effect_sizes
) %>% bind_cols(
Metric = c(
metric_values_UR,
metric_values_UR_multiarm,
metric_values_TS,
metric_values_TS_multiarm,
metric_values_TS_epsilon,
metric_values_TSPDD
)
)
)
# Step 4: Use ggplot2 to plot the trend
p <- ggplot(plot_data %>% filter(Policy %in% policies),
aes(x = Effect_Size, y = Metric, color = Policy)) +
geom_point(size = 0.5, alpha = 0.7) + geom_line(alpha = 0.7) +
facet_wrap( ~ Sample_size) +
labs(title = paste("Trend of ", metric ," across Effect Sizes and Sample Sizes"),
x = "Effect Size",
y = metric) +
theme_minimal()
return(p)
}graphing_tool(merged_table,
'Average Reward',
c('UR', "TS", "TS_epsilon", "TSPDD"))Comparison of Correct Allocation Rate
set.seed(1)
ns <- c(20,50,100)
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
policies <- c("UR", "TS", "TS_epsilon", "TSPDD")
result <- data.frame(
n = numeric(0),
effect_size = numeric(0),
policy = numeric(0),
AP = numeric(0),
stringsAsFactors = FALSE
)
for (n in ns){
for (effect_size in effect_sizes){
for (policy in policies){
temp <- data.frame(
n = n,
effect_size = effect_size,
policy = policy,
AP = simulation_n_times(alpha = 0.05,
policy = get(policy, envir = .GlobalEnv),
effect_size = effect_size,
n = n,
B = 1000,
burnin = 1,
batch = 2)$average_AP,
stringsAsFactors = FALSE
)
result <- rbind(result, temp)
}
}
}plot <- ggplot(result, aes(x = effect_size, y = AP, color = policy)) +
geom_point(size = 0.5, alpha = 0.7) + geom_line(alpha = 0.7) + # or geom_line() if you prefer lines
facet_wrap(~ n, scales = "free") + # Added scales = "free" to allow different scales for different facets
labs(
title = "Correct Allocation with respect to Effect Size",
x = "Effect Size",
y = "Correct Allocation Ratio"
) +
theme_minimal()
# Display the plot
print(plot)Comparison of Wald-Z Power
graphing_tool(merged_table,
'Power',
c('UR', "TS", "TS_epsilon", "TSPDD"))Comparison of AP Power
set.seed(1)
ns <- c(20,50,100)
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
policies <- c("UR", "TS", "TS_epsilon", "TSPDD")
result <- data.frame(
n = numeric(0),
effect_size = numeric(0),
policy = numeric(0),
power = numeric(0),
stringsAsFactors = FALSE
)
cv <- list('UR' = 0, 'TS' = 0, 'TS_epsilon' = 0, 'TSPDD' = 0)
for (n in ns){
for (effect_size in effect_sizes) {
for (policy in policies) {
APT <- simulation_n_times(
alpha = 0.05,
policy = get(policy, envir = .GlobalEnv),
effect_size = effect_size,
n = n,
B = 1000,
burnin = 1,
batch = 2
)$APT
if (effect_size == 0) {
cv[policy] <- empirical_cv(APT, 0.05) - 1
}
temp <- data.frame(
n = n,
effect_size = effect_size,
policy = policy,
power = 1 - ecdf(APT)(cv[policy]),
stringsAsFactors = FALSE
)
result <- rbind(result, temp)
}
}
}plot <- ggplot(result, aes(x = effect_size, y = power, color = policy)) +
geom_point(size = 0.5, alpha = 0.7) + geom_line(alpha = 0.7) + # or geom_line() if you prefer lines
facet_wrap(~ n) + # Added scales = "free" to allow different scales for different facets
labs(
title = "AP Power with respect to Effect Size",
x = "Effect Size",
y = "Power"
) +
theme_minimal()
# Display the plot
print(plot)Replication results for three arm setting (XPRIZE)
comparing reward
set.seed(1)
ns <- c(20,50,100)
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
policies <- c("UR_multiarm", "TS_multiarm")
priors <- array(c(0.05,0.4,0.4, 0.4,0.4,0.4), dim = c(2,3))
result <- data.frame(
n = numeric(0),
effect_size = numeric(0),
policy = numeric(0),
prior = numeric(0),
metric = numeric(0),
stringsAsFactors = FALSE
)
for (n in ns) {
for (effect_size in effect_sizes) {
for (policy in policies) {
for (i in 1:2) {
prior_label <- paste("Prior", i)
temp <- data.frame(
n = n,
effect_size = effect_size,
policy = policy,
prior = prior_label,
metric = simulation_n_times(
alpha = 0.05,
policy = get(policy, envir = .GlobalEnv),
effect_size = effect_size,
n = n,
B = 1000,
burnin = 1,
batch = 2,
multi_arm = 3,
priors = c(priors[i,1],priors[i,2], priors[i,3]+effect_size)
)$average_reward,
stringsAsFactors = FALSE
)
result <- rbind(result, temp)
}
}
}
}result$prior <- factor(result$prior, labels = c("p1 << p2, p3 = 0.4 +",
"p1 = p2, p3 = 0.4 +"))
result$n <- factor(result$n, labels = c('n = 20', 'n = 50', 'n = 100'))
# Create the plot
plot <- ggplot(result, aes(x = effect_size, y = metric, color = policy)) +
geom_point(size = 0.5, alpha = 0.7) +
geom_line(alpha = 0.7) +
facet_grid(rows = vars(prior), cols = vars(n), scales = "free") +
labs(
title = "Average Reward with respect to Effect Size",
x = "Effect Size",
y = "Average Reward"
) +
theme_minimal()
# Display the plot
print(plot)Comparing correct ratio:
set.seed(1)
ns <- c(20,50,100)
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
policies <- c("UR_multiarm", "TS_multiarm")
priors <- array(c(0.05,0.4,0.4, 0.4,0.4,0.4), dim = c(2,3))
result <- data.frame(
n = numeric(0),
effect_size = numeric(0),
policy = numeric(0),
prior = numeric(0),
metric = numeric(0),
stringsAsFactors = FALSE
)
for (n in ns) {
for (effect_size in effect_sizes) {
for (policy in policies) {
for (i in 1:2) {
prior_label <- paste("Prior", i)
temp <- data.frame(
n = n,
effect_size = effect_size,
policy = policy,
prior = prior_label,
metric = simulation_n_times(
alpha = 0.05,
policy = get(policy, envir = .GlobalEnv),
effect_size = effect_size,
n = n,
B = 1000,
burnin = 1,
batch = 2,
multi_arm = 3,
priors = c(priors[i,1],priors[i,2], priors[i,3]+effect_size)
)$average_AP ,
stringsAsFactors = FALSE
)
result <- rbind(result, temp)
}
}
}
}result$prior <- factor(result$prior, labels = c("p1 << p2, p3 = 0.4 +",
"p1 = p2, p3 = 0.4 +"))
result$n <- factor(result$n, labels = c('n = 20', 'n = 50', 'n = 100'))
# Create the plot
plot <- ggplot(result, aes(x = effect_size, y = metric, color = policy)) +
geom_point(size = 0.5, alpha = 0.7) +
geom_line(alpha = 0.7) +
facet_grid(rows = vars(prior), cols = vars(n), scales = "free") +
labs(
title = "Correct Allocation with respect to Effect Size",
x = "Effect Size",
y = "Correct Allocation Ratio"
) +
theme_minimal()
# Display the plot
print(plot)Comparing wald Power
set.seed(1)
ns <- c(20,50,100)
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
policies <- c("UR_multiarm", "TS_multiarm")
priors <- array(c(0.05,0.4,0.4, 0.4,0.4,0.4), dim = c(2,3))
result <- data.frame(
n = numeric(0),
effect_size = numeric(0),
policy = numeric(0),
prior = numeric(0),
metric = numeric(0),
stringsAsFactors = FALSE
)
for (n in ns) {
for (effect_size in effect_sizes) {
for (policy in policies) {
for (i in 1:2) {
prior_label <- paste("Prior", i)
temp <- data.frame(
n = n,
effect_size = effect_size,
policy = policy,
prior = prior_label,
metric = simulation_n_times(
alpha = 0.05,
policy = get(policy, envir = .GlobalEnv),
effect_size = effect_size,
n = n,
B = 1000,
burnin = 1,
batch = 2,
multi_arm = 3,
priors = c(priors[i,1],priors[i,2], priors[i,3]+effect_size)
)$average_power ,
stringsAsFactors = FALSE
)
result <- rbind(result, temp)
}
}
}
}result$prior <- factor(result$prior, labels = c("p1 << p2, p3 = 0.4 +",
"p1 = p2, p3 = 0.4 +"))
result$n <- factor(result$n, labels = c('n = 20', 'n = 50', 'n = 100'))
# Create the plot
plot <- ggplot(result, aes(x = effect_size, y = metric, color = policy)) +
geom_point(size = 0.5, alpha = 0.7) +
geom_line(alpha = 0.7) +
facet_grid(rows = vars(prior), cols = vars(n), scales = "free") +
labs(
title = "Trend of Wald-Z test Power with respect to Effect Size",
x = "Effect Size",
y = "Power"
) +
theme_minimal()
# Display the plot
print(plot)comparing AP power
ns <- c(20,50,100)
effect_sizes <-
c(0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5)
policies <- c("UR_multiarm", "TS_multiarm")
priors <- array(c(0.05,0.4,0.4, 0.4,0.4,0.4), dim = c(2,3))
result <- data.frame(
n = numeric(0),
effect_size = numeric(0),
policy = numeric(0),
prior = numeric(0),
power = numeric(0),
stringsAsFactors = FALSE
)
cv <- list('UR_multiarm' = c(0,0), 'TS_multiarm' = c(0,0))
for (n in ns) {
for (effect_size in effect_sizes) {
for (policy in policies) {
for (i in 1:2) {
prior_label <- paste("Prior", i)
APT <- simulation_n_times(
alpha = 0.05,
policy = get(policy, envir = .GlobalEnv),
effect_size = effect_size,
n = n,
B = 1000,
burnin = 1,
batch = 2,
multi_arm = 3,
priors = priors[i,]
)$APT
if (effect_size == 0) {
cv[[policy]][i] <- empirical_cv(APT, 0.05) - 1
print(cv)
}
temp <- data.frame(
n = n,
effect_size = effect_size,
policy = policy,
prior = prior_label,
power = 1 - ecdf(APT)(cv[[policy]][i]),
stringsAsFactors = FALSE
)
result <- rbind(result, temp)
}
}
}
}$UR_multiarm
[1] 5 0
$TS_multiarm
[1] 0 0
$UR_multiarm
[1] 5 5
$TS_multiarm
[1] 0 0
$UR_multiarm
[1] 5 5
$TS_multiarm
[1] 7 0
$UR_multiarm
[1] 5 5
$TS_multiarm
[1] 7 6
$UR_multiarm
[1] 11 5
$TS_multiarm
[1] 7 6
$UR_multiarm
[1] 11 11
$TS_multiarm
[1] 7 6
$UR_multiarm
[1] 11 11
$TS_multiarm
[1] 19 6
$UR_multiarm
[1] 11 11
$TS_multiarm
[1] 19 16
$UR_multiarm
[1] 21 11
$TS_multiarm
[1] 19 16
$UR_multiarm
[1] 21 21
$TS_multiarm
[1] 19 16
$UR_multiarm
[1] 21 21
$TS_multiarm
[1] 40 16
$UR_multiarm
[1] 21 21
$TS_multiarm
[1] 40 35
result$prior <- factor(result$prior, labels = c("p1 << p2, p3 = 0.4 +",
"p1 = p2, p3 = 0.4 +"))
result$n <- factor(result$n, labels = c('n = 20', 'n = 50', 'n = 100'))
# Create the plot
plot <- ggplot(result, aes(x = effect_size, y = power, color = policy)) +
geom_point(size = 0.5, alpha = 0.7) +
geom_line(alpha = 0.7) +
facet_grid(rows = vars(prior), cols = vars(n), scales = "free") +
labs(
title = "Trend of AP test Power with respect to Effect Size",
x = "Effect Size",
y = "Power"
) +
theme_minimal()
# Display the plot
print(plot)