This file contains the code for simulating a multi-armed bandit task with a minimum aspiration level that has to be reached by the end of a game. The goal of the simulation is to find out whether the strategy that maximizes the proportion of games in which the minimum aspiration level is reached is the same as the strategy that maximizes outcomes.
The agent selects options with a softmax rule whose temperature \(\theta\) depends on the distance to the goal,

\[\theta = \frac{1}{1 + e^{-\Delta goal \cdot \delta}},\]

with \(\Delta goal\) being the current state (the cumulative outcome so far) minus the minimum aspiration level; in the code below, this difference is additionally divided by the threshold.
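As a quick illustration of this temperature function, here is a minimal sketch (theta.fun is a helper defined only for this illustration, using the threshold-scaled \(\Delta goal\) from the code below): below the goal \(\theta\) is small and choices are near-random, above the goal \(\theta\) is large and choices exploit the learned expectations.

theta.fun <- function(delta.goal, delta) {
  # Goal-dependent softmax temperature
  1 / (1 + exp(-delta.goal * delta))
}
theta.fun(delta.goal = -1, delta = 5) # far below the goal: ~0.007 (near-random choice)
theta.fun(delta.goal = 0, delta = 5)  # exactly at the goal: 0.5
theta.fun(delta.goal = 1, delta = 5)  # far above the goal: ~0.993 (strong exploitation)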
Define the Rescorla-Wagner function
rw.fun <- function(exp.prior = c(5, 3, 7), # A vector of prior expectations
                   new.inf = c(NA, 2, NA), # A vector of new information (NAs except for the selected option)
                   alpha = .3) {           # Updating rate

  # Initialize the new expectations with the priors
  exp.new <- exp.prior

  # Determine which option was selected
  selection <- which(is.finite(new.inf))

  # Update the expectation of the selected option
  exp.new[selection] <- exp.prior[selection] + alpha * (new.inf[selection] - exp.prior[selection])

  return(exp.new)
}
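A quick usage example with the default values: only the selected option (here option 2, with observed outcome 2) is updated, moving a fraction \(\alpha\) of the prediction error towards the outcome.

rw.fun(exp.prior = c(5, 3, 7),
       new.inf = c(NA, 2, NA),
       alpha = .3)
# [1] 5.0 2.7 7.0 (option 2: 3 + .3 * (2 - 3) = 2.7)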
Define the standard softmax function (\(\theta\) itself is computed later, directly inside the simulation function).
softmax.fun <- function(exp.current = c(5, 3, 6), # Current expectations
                        theta = .5) {             # Inverse temperature

  # Convert expectations to selection probabilities
  output <- exp(exp.current * theta) / sum(exp(exp.current * theta))

  return(output)
}
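To see how \(\theta\) governs the exploration-exploitation trade-off, compare the selection probabilities at \(\theta = 0\) (fully random) with those at a higher \(\theta\) (mostly choosing the best option):

softmax.fun(exp.current = c(5, 3, 6), theta = 0) # 0.333 0.333 0.333: random choice
softmax.fun(exp.current = c(5, 3, 6), theta = 2) # ~0.119 0.002 0.879: mostly the best option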
Now define the main simulation function. Note that \(\theta\) is computed inside the trial loop as theta <- 1 / (1 + exp(-((sum(outcome.v, na.rm = TRUE) - threshold) / threshold) * delta)), right before it is fed into the softmax function, whose result is stored in selprob.i.
# Create main simulation function
rl.sim.fun <- function(n.trials = 100,            # Trials in game
                       option.mean = c(1, -1, 0), # Mean of each option
                       option.sd = c(2, 10, .01), # SD of each option
                       prior.exp.start = rep(0, 3),
                       delta = .5,
                       alpha = .2,
                       plot = TRUE,
                       int.values = FALSE,        # If TRUE, generated values are rounded to integers
                       allow.negative = TRUE,     # If FALSE, negative values are set to 0
                       n.exp = 0,                 # The last n.exp options are drawn from an exponential distribution
                       exp.option.rate = .2,      # Rate (lambda) of the exponential distribution
                       threshold = 100) {         # Minimum threshold to reach; a bonus is paid if reached

  # Get some game parameters from inputs
  n.options <- length(option.mean) + n.exp

  # Create outcome matrix giving the outcome on each trial for each option
  outcome.mtx <- matrix(NA, nrow = n.trials, ncol = n.options)

  # Normally distributed options
  for(option.i in seq_len(n.options - n.exp)) {
    outcome.mtx[, option.i] <- rnorm(n = n.trials,
                                     mean = option.mean[option.i],
                                     sd = option.sd[option.i])
    if(int.values == TRUE) outcome.mtx[, option.i] <- round(outcome.mtx[, option.i])
    if(allow.negative == FALSE) outcome.mtx[, option.i][outcome.mtx[, option.i] < 0] <- 0
  }

  # Exponentially distributed options (seq_len() ensures the loop is skipped when n.exp == 0)
  for(option.ii in seq_len(n.exp)) {
    col.ii <- option.ii + length(option.mean)
    outcome.mtx[, col.ii] <- rexp(n = n.trials,
                                  rate = exp.option.rate[option.ii])
    if(int.values == TRUE) outcome.mtx[, col.ii] <- round(outcome.mtx[, col.ii])
    if(allow.negative == FALSE) outcome.mtx[, col.ii][outcome.mtx[, col.ii] < 0] <- 0
  }
  # Create exp.prior and exp.new matrices
  # These hold the agent's expectations (prior and updated)
  # for each option on each trial
  exp.prior.mtx <- matrix(NA, nrow = n.trials, ncol = n.options)
  exp.prior.mtx[1, ] <- prior.exp.start
  exp.new.mtx <- matrix(NA, nrow = n.trials, ncol = n.options)

  # Create some objects to store values
  selection.v <- rep(NA, n.trials)  # Actual selections
  outcome.v <- rep(NA, n.trials)    # Actual outcomes
  selprob.mtx <- matrix(NA,         # Selection probabilities
                        nrow = n.trials,
                        ncol = n.options)

  # RUN SIMULATION!
  for(trial.i in 1:n.trials) {

    # STEP 0: Get prior expectations for the current trial
    exp.prior.i <- exp.prior.mtx[trial.i, ]

    # STEP 1: SELECT AN OPTION
    # Compute the goal-dependent temperature and the selection probabilities
    theta <- 1 / (1 + exp(-((sum(outcome.v, na.rm = TRUE) - threshold) / threshold) * delta))
    selprob.i <- softmax.fun(exp.current = exp.prior.i,
                             theta = theta)

    # Select an option
    selection.i <- sample(1:n.options, size = 1, prob = selprob.i)

    # STEP 2: GET THE OUTCOME of the selected option
    outcome.i <- outcome.mtx[trial.i, selection.i]

    # STEP 3: CREATE NEW EXPECTANCIES
    # Create a new.inf vector with NAs except for the outcome of the selected option
    new.inf <- rep(NA, n.options)
    new.inf[selection.i] <- outcome.i

    # Get new expectancies
    new.exp.i <- rw.fun(exp.prior = exp.prior.i,
                        new.inf = new.inf,
                        alpha = alpha)

    # Assign the new expectations to exp.new.mtx[trial.i, ]
    # and exp.prior.mtx[trial.i + 1, ]
    exp.new.mtx[trial.i, ] <- new.exp.i
    if(trial.i < n.trials) {
      exp.prior.mtx[trial.i + 1, ] <- new.exp.i
    }

    # Save trial values
    selprob.mtx[trial.i, ] <- selprob.i  # Selection probabilities
    selection.v[trial.i] <- selection.i  # Actual selection
    outcome.v[trial.i] <- outcome.i      # Actual outcome
  }
  # Put the main results in a single data frame called sim.result.df
  sim.result.df <- data.frame("selection" = selection.v,
                              "outcome" = outcome.v,
                              "outcome.cum" = cumsum(outcome.v),
                              stringsAsFactors = FALSE)

  # Should the simulation be plotted?
  if(plot == TRUE) {
    plot(1,
         xlim = c(1, n.trials),
         ylim = c(min(option.mean),             # Set limits to worst and best expected performance
                  max(option.mean) * n.trials),
         type = "n",
         ylab = "Cumulative Rewards",
         xlab = "Trial",
         main = paste0("alpha = ", alpha, ", delta = ", delta))
    rect(-1e3, -1e3, 1e3, 1e3, col = gray(.96))
    abline(h = seq(-1e3, 1e3, by = 10),
           v = seq(-1e3, 1e3, by = 10),
           lwd = c(2, 1), col = "white")
    abline(h = threshold, lty = 2)
    points(x = 1:n.trials,
           y = sim.result.df$outcome.cum,
           type = "b")
    text(x = 1:n.trials,
         y = sim.result.df$outcome.cum,
         sim.result.df$selection,
         pos = 3)
  }

  # Return the main simulation data frame
  return(sim.result.df)
}
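As a quick sanity check, here is a minimal run with the default three normal options (no exponential option) and plotting disabled; the exact values will differ across seeds.

set.seed(1) # For replicability
sim <- rl.sim.fun(n.trials = 50, plot = FALSE)
head(sim, 3) # One row per trial: selection, outcome, and cumulative outcome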
Now define the last function, which runs many simulations (i.e., many agents) and returns each agent's final cumulative reward.
# Simulate many agents with the same delta value
many.agent.fun <- function(n.agent, # Number of agents
                           delta,   # Agents' delta (goal sensitivity)
                           ...) {   # Further arguments passed on to rl.sim.fun

  sapply(1:n.agent, FUN = function(x) {
    sim.result.i <- rl.sim.fun(delta = delta,
                               plot = FALSE,
                               ...)

    # Return the agent's final cumulative reward
    final.reward.i <- sim.result.i$outcome.cum[nrow(sim.result.i)]
    return(final.reward.i)
  })
}
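For example, the following call (with parameter values taken from environment 1 below) simulates 100 agents with \(\delta = 1\) and computes the proportion that reach the aspiration level:

final.rewards <- many.agent.fun(n.agent = 100,
                                delta = 1,
                                alpha = .3,
                                n.trials = 30,
                                option.mean = c(10, 8),
                                option.sd = c(2, 2),
                                n.exp = 1,
                                exp.option.rate = .15,
                                int.values = TRUE,
                                allow.negative = FALSE,
                                threshold = 250)
mean(final.rewards >= 250) # Proportion of agents that reached the threshold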
Now that all functions are defined we can run the simulations (I didn’t actually run that part of the code here since it takes some time, but that’s how I did it on the server). I ran 1000 agents per \(\delta\) value. The minimum aspiration level (threshold) is set to 250, \(\alpha\) is set to \(0.3\), and the \(\delta\) values range from \(-5\) to \(5\) in steps of \(0.1\). Each game consists of \(30\) trials.
# Number of agents
n.agent <- 1000
threshold <- 250
#library(yarrr)
# different parameter combinations:
alphas <- .3
deltas <- seq(-5, 5, .1)
par.combin <- expand.grid(alphas, deltas)
# create a matrix to store the outcome values for the different parameter combinations
outc.mat <- matrix(NA, nrow = n.agent, ncol = nrow(par.combin))
# create matrices with the different distributions (means and sds) defining the environments
option.means.mat <- matrix(c(10, 8,
                              9, 7,
                              8, 6),
                           ncol = 2, byrow = TRUE)
option.sd.mat <- matrix(c(2, 2,
                          2, 2,
                          2, 2),
                        ncol = 2, byrow = TRUE)
n.exp.mat <- matrix(c(1,
                      1,
                      1),
                    ncol = 1, byrow = TRUE)
exp.option.rate.mat <- matrix(c(.15,
                                .15,
                                .15),
                              ncol = 1, byrow = TRUE)
# set the number of trials per game
ntrials <- 30
# loop through the environments (i.e., sets of distributions)
for (dist in 1:nrow(option.means.mat)){

  # prepare the values of the current environment
  option.means <- option.means.mat[dist, ]
  option.sd <- option.sd.mat[dist, ]
  n.exp <- n.exp.mat[dist, ]
  exp.option.rate <- exp.option.rate.mat[dist, ]

  # prepare an empty variable to store the random chooser outcomes in
  mean.outc <- NULL

  # simulate 1000 games of a random chooser as a baseline
  for(xx in 1:1000){

    # prepare variable
    outc <- NULL

    # loop through the trials of a game
    for(yy in 1:ntrials){

      # sample an arm at random and draw an outcome from its distribution
      samp <- sample(1:(length(option.means) + n.exp), 1)
      if(samp <= length(option.means)){
        outc <- c(outc, round(rnorm(1, option.means[samp], option.sd[samp])))
      }
      if(samp > length(option.means)){
        # append the rounded exponential draw (rounding matches int.values = TRUE below)
        outc <- c(outc, round(rexp(1, exp.option.rate[samp - length(option.means)])))
      }
    }

    # cut the distribution at 0 (no losses allowed)
    outc[outc < 0] <- 0

    # save the sum of the drawn values
    mean.outc[xx] <- sum(outc)
  }

  # baseline: proportion of random choosers that reach the threshold, and their mean outcome
  prop.rand.surv <- mean(mean.outc >= threshold)
  random.mean <- mean(mean.outc)
  # loop through all parameter combinations
  for(pars in 1:nrow(par.combin)){

    # for each parameter combination, run the simulation n.agent times
    outc.mat[, pars] <- many.agent.fun(n.agent, delta = par.combin[pars, 2],
                                       alpha = par.combin[pars, 1], option.mean = option.means,
                                       option.sd = option.sd, int.values = TRUE,
                                       allow.negative = FALSE, n.trials = ntrials,
                                       threshold = threshold, n.exp = n.exp,
                                       exp.option.rate = exp.option.rate)
  }

  # compute the proportion of survivors (i.e., agents that reached the threshold)
  prop.surv <- colMeans(outc.mat >= threshold)

  # compute the mean outcome for each parameter combination
  mean.outc.value <- colMeans(outc.mat)

  save(outc.mat, file = paste("data/outcDepTheta", dist,
                              ".RData", sep = ''))
  # plot the proportion of threshold reached and the mean outcomes against the delta values
  pdf(paste('plots/compOutcDepThetadist', dist, '.pdf', sep = ''))
  par(mar = c(4.5, 4.5, 4.5, 4.5))
  plot(1, xlim = c(-5, 5), ylim = c(0, 1), type = 'n',
       xlab = 'delta values', ylab = 'prop survived',
       main = 'prop surv. and mean outc for delta values')
  mtext('theta: 1 / (1 + exp(-dfg * delta))')
  grid()
  lines(par.combin[, 2], prop.surv, col = 'red',
        lwd = 1.5)
  abline(h = prop.rand.surv, col = 'red', lty = 2)
  par(new = TRUE)
  plot(par.combin[, 2], mean.outc.value, col = 'blue', lwd = 1.5,
       xlab = NA, ylab = NA, type = 'l', axes = FALSE)
  axis(side = 4)
  mtext(side = 4, line = 3, 'mean outcome')
  abline(h = random.mean, col = 'blue', lty = 2)
  legend('topleft', c('prop threshold reached', 'mean outcomes',
                      'prop threshold reached random',
                      'mean outcomes random'),
         col = rep(c('red', 'blue'), 2), lty = c(1, 1, 2, 2))
  dev.off()
}
For each environment (i.e., set of distributions), we plot the \(\delta\) values on the x axis against two y axes: one for the proportion of games in which the aspiration level was reached and one for the mean outcome.
First, because the code chunk above was not executed here, we need to recreate some variables for the plot, namely the proportion of threshold reached and the mean outcome of a random chooser. The saved simulation values are then read in and the plot is created. The procedure is the same for all environments, which is why the code is only shown once.
To test whether low \(\delta\) values actually lead to more exploration near the goal, run simulations for single agents and plot their behavior. Compared to lower \(\delta\) values, higher \(\delta\) values lead to more exploration while the goal has not yet been reached and to more exploitation once it has. Using negative \(\delta\) values reverses this pattern. Note that the code for the simulations with negative \(\delta\) values is not shown, as it is identical except for the signs of \(\delta\).
Plots for the positive \(\delta\)s.
par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))
set.seed(100) # For replicability
sim1 <- rl.sim.fun(n.trials = 30,          # Trials in game
                   option.mean = c(7, 9),  # Mean of each option
                   option.sd = c(2, 2),    # SD of each option
                   prior.exp.start = rep(0, 3),
                   delta = .2,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,      # If TRUE, generated values are rounded to integers
                   allow.negative = FALSE, # If FALSE, negative values are set to 0
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
set.seed(100) # For replicability
sim2 <- rl.sim.fun(n.trials = 30,
                   option.mean = c(7, 9),
                   option.sd = c(2, 2),
                   prior.exp.start = rep(0, 3),
                   delta = .8,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,
                   allow.negative = FALSE,
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
set.seed(100) # For replicability
sim3 <- rl.sim.fun(n.trials = 30,
                   option.mean = c(7, 9),
                   option.sd = c(2, 2),
                   prior.exp.start = rep(0, 3),
                   delta = 1.5,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,
                   allow.negative = FALSE,
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
set.seed(100) # For replicability
sim4 <- rl.sim.fun(n.trials = 30,
                   option.mean = c(7, 9),
                   option.sd = c(2, 2),
                   prior.exp.start = rep(0, 3),
                   delta = 5,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,
                   allow.negative = FALSE,
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
mtext("Environment: M: 7, 9; sds 2, exp rate .15", outer = TRUE, cex = 1.5)
Figure 1: Single agent simulations for positive delta values. Environment: means 7, 9; sds 2; exp lambda .15.
Figure 2: Single agent simulations for negative delta values. Environment: means 7, 9; sds 2; exp lambda .15.
# Number of agents
n.agent <- 1000
threshold <- 250
library(yarrr) # Provides the transparent() function used in the plot below
# different parameter combinations:
alphas <- .3
deltas <- seq(-5, 5, .1)
par.combin <- expand.grid(alphas, deltas)
# create a matrix to store the outcome values
outc.mat <- matrix(NA, nrow = n.agent, ncol = nrow(par.combin))
# set the number of trials per game
ntrials <- 30

# environment 1: means, sds, and exponential option
option.means <- c(10, 8)
option.sd <- c(2, 2)
n.exp <- 1
exp.option.rate <- .15

# prepare an empty variable to store the random chooser outcomes in
mean.outc <- NULL

# simulate 1000 games of a random chooser as a baseline
for(xx in 1:1000){

  # prepare variable
  outc <- NULL

  # loop through the trials of a game
  for(yy in 1:ntrials){

    # sample an arm at random and draw an outcome from its distribution
    samp <- sample(1:(length(option.means) + n.exp), 1)
    if(samp <= length(option.means)){
      outc <- c(outc, round(rnorm(1, option.means[samp], option.sd[samp])))
    }
    if(samp > length(option.means)){
      # append the rounded exponential draw (rounding matches int.values = TRUE)
      outc <- c(outc, round(rexp(1, exp.option.rate[samp - length(option.means)])))
    }
  }

  # cut the distribution at 0 (no losses allowed)
  outc[outc < 0] <- 0

  # save the sum of the drawn values
  mean.outc[xx] <- sum(outc)
}

# baseline: proportion of random choosers that reach the threshold, and their mean outcome
prop.rand.surv <- mean(mean.outc >= threshold)
random.mean <- mean(mean.outc)
load('data/outcDepTheta1.RData')
# compute the proportion of survivors (i.e., agents that reached the threshold)
prop.surv <- colMeans(outc.mat >= threshold)

# compute the mean outcome for each parameter combination
mean.outc.value <- colMeans(outc.mat)
# layout: large panel for the plot, small panel for the legend
m <- rbind(1, 1, 1, 1, 1, 2, 2)
layout(m)
par(mar = c(4, 4, 4.5, 4))
plot(1, xlim = c(-5, 5),
     ylim = c(min(prop.surv) - .05,
              ifelse(max(prop.surv) <= .95, max(prop.surv) + .05, 1)),
     type = 'n', xlab = 'delta values', ylab = 'prop survived',
     main = 'prop surv. and mean outc for delta values', cex.lab = 1.5)
mtext('Environment: M: 10, 8; sd: 2; exp dist rate .15, i.e. M: 6.7')
grid()
lines(par.combin[, 2], prop.surv, col = transparent('red', .5),
      lwd = 1.5)
abline(h = prop.rand.surv, col = transparent('red', .5), lty = 2)
par(new = TRUE)
plot(par.combin[, 2], mean.outc.value, col = transparent('blue', .5), lwd = 1.5,
     xlab = NA, ylab = NA,
     ylim = c(min(mean.outc.value) - 1, max(mean.outc.value)),
     type = 'l', axes = FALSE)
axis(side = 4)
mtext(side = 4, line = 3, 'mean outcome')
abline(h = random.mean, col = transparent('blue', .5), lty = 2)
par(mar = c(0, 0, 0, 0))
plot(1, type = 'n', axes = FALSE, xlab = NA, ylab = NA)
legend('center', c('prop threshold reached', 'mean outcomes',
                   'prop threshold reached random',
                   'mean outcomes random'),
       col = rep(c(transparent('red', .5), transparent('blue', .5)), 2),
       lty = c(1, 1, 2, 2), cex = 1.5)
Figure 3: Proportion of threshold reached and mean outcomes for different delta values. Environment 1: means 10, 8; sds 2; exponential distribution with lambda = .15.
Figure 4: Proportion of threshold reached and mean outcomes for different delta values. Environment 2: means 9, 7; sds 2; exponential distribution with lambda = .15.
Figure 5: Proportion of threshold reached and mean outcomes for different delta values. Environment 3: means 8, 6; sds 2; exponential distribution with lambda = .15.