This file contains the code for simulating a multi-armed bandit task with a minimum aspiration level that has to be reached by the end of a game. The goal of the simulation is to find out whether the strategy that maximizes the proportion of games in which the minimum aspiration level is reached is the same as the strategy that maximizes outcomes.
The agent selects options with a softmax rule whose temperature \(\theta\) depends on the distance to the goal,

\[\theta = \frac{1}{1 + e^{-\Delta goal \cdot \delta}},\]

with \(\Delta goal\) being the current state (the cumulative outcome so far) minus the minimum aspiration level; in the code below, this difference is additionally divided by the threshold.
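As a quick illustration of this temperature function, here is a minimal sketch (theta.fun is a helper defined only for this illustration, using the threshold-scaled \(\Delta goal\) from the code below): below the goal \(\theta\) is small and choices are near-random, above the goal \(\theta\) is large and choices exploit the learned expectations.

theta.fun <- function(delta.goal, delta) {
  # Goal-dependent softmax temperature
  1 / (1 + exp(-delta.goal * delta))
}
theta.fun(delta.goal = -1, delta = 5) # far below the goal: ~0.007 (near-random choice)
theta.fun(delta.goal = 0, delta = 5)  # exactly at the goal: 0.5
theta.fun(delta.goal = 1, delta = 5)  # far above the goal: ~0.993 (strong exploitation)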
Define the Rescorla-Wagner function
rw.fun <- function(exp.prior = c(5, 3, 7), # A vector of prior expectations
                   new.inf = c(NA, 2, NA), # A vector of new information (NAs except for the selected option)
                   alpha = .3) {           # Updating rate

  # Initialize the new expectations with the priors
  exp.new <- exp.prior

  # Determine which option was selected
  selection <- which(is.finite(new.inf))

  # Update the expectation of the selected option
  exp.new[selection] <- exp.prior[selection] + alpha * (new.inf[selection] - exp.prior[selection])

  return(exp.new)
}
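A quick usage example with the default values: only the selected option (here option 2, with observed outcome 2) is updated, moving a fraction \(\alpha\) of the prediction error towards the outcome.

rw.fun(exp.prior = c(5, 3, 7),
       new.inf = c(NA, 2, NA),
       alpha = .3)
# [1] 5.0 2.7 7.0 (option 2: 3 + .3 * (2 - 3) = 2.7)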
Define the standard softmax function (\(\theta\) itself is computed later, directly inside the simulation function).
softmax.fun <- function(exp.current = c(5, 3, 6), # Current expectations
                        theta = .5) {             # Inverse temperature

  # Convert expectations to selection probabilities
  output <- exp(exp.current * theta) / sum(exp(exp.current * theta))

  return(output)
}
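To see how \(\theta\) governs the exploration-exploitation trade-off, compare the selection probabilities at \(\theta = 0\) (fully random) with those at a higher \(\theta\) (mostly choosing the best option):

softmax.fun(exp.current = c(5, 3, 6), theta = 0) # 0.333 0.333 0.333: random choice
softmax.fun(exp.current = c(5, 3, 6), theta = 2) # ~0.119 0.002 0.879: mostly the best option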
Now define the main simulation function. Note that \(\theta\) is computed inside the trial loop as theta <- 1 / (1 + exp(-((sum(outcome.v, na.rm = TRUE) - threshold) / threshold) * delta)), right before it is fed into the softmax function, whose result is stored in selprob.i.
# Create main simulation function
rl.sim.fun <- function(n.trials = 100,            # Trials in game
                       option.mean = c(1, -1, 0), # Mean of each option
                       option.sd = c(2, 10, .01), # SD of each option
                       prior.exp.start = rep(0, 3),
                       delta = .5,
                       alpha = .2,
                       plot = TRUE,
                       int.values = FALSE,        # If TRUE, generated values are rounded to integers
                       allow.negative = TRUE,     # If FALSE, negative values are set to 0
                       n.exp = 0,                 # The last n.exp options are drawn from an exponential distribution
                       exp.option.rate = .2,      # Rate (lambda) of the exponential distribution
                       threshold = 100) {         # Minimum threshold to reach; a bonus is paid if reached

  # Get some game parameters from inputs
  n.options <- length(option.mean) + n.exp

  # Create outcome matrix giving the outcome on each trial for each option
  outcome.mtx <- matrix(NA, nrow = n.trials, ncol = n.options)

  # Normally distributed options
  for(option.i in seq_len(n.options - n.exp)) {
    outcome.mtx[, option.i] <- rnorm(n = n.trials,
                                     mean = option.mean[option.i],
                                     sd = option.sd[option.i])
    if(int.values == TRUE) outcome.mtx[, option.i] <- round(outcome.mtx[, option.i])
    if(allow.negative == FALSE) outcome.mtx[, option.i][outcome.mtx[, option.i] < 0] <- 0
  }

  # Exponentially distributed options (seq_len() ensures the loop is skipped when n.exp == 0)
  for(option.ii in seq_len(n.exp)) {
    col.ii <- option.ii + length(option.mean)
    outcome.mtx[, col.ii] <- rexp(n = n.trials,
                                  rate = exp.option.rate[option.ii])
    if(int.values == TRUE) outcome.mtx[, col.ii] <- round(outcome.mtx[, col.ii])
    if(allow.negative == FALSE) outcome.mtx[, col.ii][outcome.mtx[, col.ii] < 0] <- 0
  }
  # Create exp.prior and exp.new matrices
  # These hold the agent's expectations (prior and updated)
  # for each option on each trial
  exp.prior.mtx <- matrix(NA, nrow = n.trials, ncol = n.options)
  exp.prior.mtx[1, ] <- prior.exp.start
  exp.new.mtx <- matrix(NA, nrow = n.trials, ncol = n.options)

  # Create some objects to store values
  selection.v <- rep(NA, n.trials)  # Actual selections
  outcome.v <- rep(NA, n.trials)    # Actual outcomes
  selprob.mtx <- matrix(NA,         # Selection probabilities
                        nrow = n.trials,
                        ncol = n.options)

  # RUN SIMULATION!
  for(trial.i in 1:n.trials) {

    # STEP 0: Get prior expectations for the current trial
    exp.prior.i <- exp.prior.mtx[trial.i, ]

    # STEP 1: SELECT AN OPTION
    # Compute the goal-dependent temperature and the selection probabilities
    theta <- 1 / (1 + exp(-((sum(outcome.v, na.rm = TRUE) - threshold) / threshold) * delta))
    selprob.i <- softmax.fun(exp.current = exp.prior.i,
                             theta = theta)

    # Select an option
    selection.i <- sample(1:n.options, size = 1, prob = selprob.i)

    # STEP 2: GET THE OUTCOME of the selected option
    outcome.i <- outcome.mtx[trial.i, selection.i]

    # STEP 3: CREATE NEW EXPECTANCIES
    # Create a new.inf vector with NAs except for the outcome of the selected option
    new.inf <- rep(NA, n.options)
    new.inf[selection.i] <- outcome.i

    # Get new expectancies
    new.exp.i <- rw.fun(exp.prior = exp.prior.i,
                        new.inf = new.inf,
                        alpha = alpha)

    # Assign the new expectations to exp.new.mtx[trial.i, ]
    # and exp.prior.mtx[trial.i + 1, ]
    exp.new.mtx[trial.i, ] <- new.exp.i
    if(trial.i < n.trials) {
      exp.prior.mtx[trial.i + 1, ] <- new.exp.i
    }

    # Save trial values
    selprob.mtx[trial.i, ] <- selprob.i  # Selection probabilities
    selection.v[trial.i] <- selection.i  # Actual selection
    outcome.v[trial.i] <- outcome.i      # Actual outcome
  }
  # Put the main results in a single data frame called sim.result.df
  sim.result.df <- data.frame("selection" = selection.v,
                              "outcome" = outcome.v,
                              "outcome.cum" = cumsum(outcome.v),
                              stringsAsFactors = FALSE)

  # Should the simulation be plotted?
  if(plot == TRUE) {
    plot(1,
         xlim = c(1, n.trials),
         ylim = c(min(option.mean),             # Set limits to worst and best expected performance
                  max(option.mean) * n.trials),
         type = "n",
         ylab = "Cumulative Rewards",
         xlab = "Trial",
         main = paste0("alpha = ", alpha, ", delta = ", delta))
    rect(-1e3, -1e3, 1e3, 1e3, col = gray(.96))
    abline(h = seq(-1e3, 1e3, by = 10),
           v = seq(-1e3, 1e3, by = 10),
           lwd = c(2, 1), col = "white")
    abline(h = threshold, lty = 2)
    points(x = 1:n.trials,
           y = sim.result.df$outcome.cum,
           type = "b")
    text(x = 1:n.trials,
         y = sim.result.df$outcome.cum,
         sim.result.df$selection,
         pos = 3)
  }

  # Return the main simulation data frame
  return(sim.result.df)
}
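As a quick sanity check, here is a minimal run with the default three normal options (no exponential option) and plotting disabled; the exact values will differ across seeds.

set.seed(1) # For replicability
sim <- rl.sim.fun(n.trials = 50, plot = FALSE)
head(sim, 3) # One row per trial: selection, outcome, and cumulative outcome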
Now define the last function, which runs many simulations (i.e., many agents) and returns each agent's final cumulative reward.
# Simulate many agents with the same delta value
many.agent.fun <- function(n.agent, # Number of agents
                           delta,   # Agents' delta (goal sensitivity)
                           ...) {   # Further arguments passed on to rl.sim.fun

  sapply(1:n.agent, FUN = function(x) {
    sim.result.i <- rl.sim.fun(delta = delta,
                               plot = FALSE,
                               ...)

    # Return the agent's final cumulative reward
    final.reward.i <- sim.result.i$outcome.cum[nrow(sim.result.i)]
    return(final.reward.i)
  })
}
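For example, the following call (with parameter values taken from environment 1 below) simulates 100 agents with \(\delta = 1\) and computes the proportion that reach the aspiration level:

final.rewards <- many.agent.fun(n.agent = 100,
                                delta = 1,
                                alpha = .3,
                                n.trials = 30,
                                option.mean = c(10, 8),
                                option.sd = c(2, 2),
                                n.exp = 1,
                                exp.option.rate = .15,
                                int.values = TRUE,
                                allow.negative = FALSE,
                                threshold = 250)
mean(final.rewards >= 250) # Proportion of agents that reached the threshold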
Now that all functions are defined we can run the simulations (I didn’t actually run that part of the code here since it takes some time, but that’s how I did it on the server). I ran 1000 agents per \(\delta\) value. The minimum aspiration level (threshold) is set to 250, \(\alpha\) is set to \(0.3\), and the \(\delta\) values range from \(-5\) to \(5\) in steps of \(0.1\). Each game consists of \(30\) trials.
# Number of agents
n.agent <- 1000
threshold <- 250
#library(yarrr)
# different parameter combinations:
alphas <- .3
deltas <- seq(-5, 5, .1)
par.combin <- expand.grid(alphas, deltas)
# create a matrix to store the outcome values for the different parameter combinations
outc.mat <- matrix(NA, nrow = n.agent, ncol = nrow(par.combin))
# create matrices with the different distributions (means and sds) defining the environments
option.means.mat <- matrix(c(10, 8,
                              9, 7,
                              8, 6),
                           ncol = 2, byrow = TRUE)
option.sd.mat <- matrix(c(2, 2,
                          2, 2,
                          2, 2),
                        ncol = 2, byrow = TRUE)
n.exp.mat <- matrix(c(1,
                      1,
                      1),
                    ncol = 1, byrow = TRUE)
exp.option.rate.mat <- matrix(c(.15,
                                .15,
                                .15),
                              ncol = 1, byrow = TRUE)
# set the number of trials per game
ntrials <- 30
# loop through the environments (i.e., sets of distributions)
for (dist in 1:nrow(option.means.mat)){

  # prepare the values of the current environment
  option.means <- option.means.mat[dist, ]
  option.sd <- option.sd.mat[dist, ]
  n.exp <- n.exp.mat[dist, ]
  exp.option.rate <- exp.option.rate.mat[dist, ]

  # prepare an empty variable to store the random chooser outcomes in
  mean.outc <- NULL

  # simulate 1000 games of a random chooser as a baseline
  for(xx in 1:1000){

    # prepare variable
    outc <- NULL

    # loop through the trials of a game
    for(yy in 1:ntrials){

      # sample an arm at random and draw an outcome from its distribution
      samp <- sample(1:(length(option.means) + n.exp), 1)
      if(samp <= length(option.means)){
        outc <- c(outc, round(rnorm(1, option.means[samp], option.sd[samp])))
      }
      if(samp > length(option.means)){
        # append the rounded exponential draw (rounding matches int.values = TRUE below)
        outc <- c(outc, round(rexp(1, exp.option.rate[samp - length(option.means)])))
      }
    }

    # cut the distribution at 0 (no losses allowed)
    outc[outc < 0] <- 0

    # save the sum of the drawn values
    mean.outc[xx] <- sum(outc)
  }

  # baseline: proportion of random choosers that reach the threshold, and their mean outcome
  prop.rand.surv <- mean(mean.outc >= threshold)
  random.mean <- mean(mean.outc)
  # loop through all parameter combinations
  for(pars in 1:nrow(par.combin)){

    # for each parameter combination, run the simulation n.agent times
    outc.mat[, pars] <- many.agent.fun(n.agent, delta = par.combin[pars, 2],
                                       alpha = par.combin[pars, 1], option.mean = option.means,
                                       option.sd = option.sd, int.values = TRUE,
                                       allow.negative = FALSE, n.trials = ntrials,
                                       threshold = threshold, n.exp = n.exp,
                                       exp.option.rate = exp.option.rate)
  }

  # compute the proportion of survivors (i.e., agents that reached the threshold)
  prop.surv <- colMeans(outc.mat >= threshold)

  # compute the mean outcome for each parameter combination
  mean.outc.value <- colMeans(outc.mat)

  save(outc.mat, file = paste("data/outcDepTheta", dist,
                              ".RData", sep = ''))
  # plot the proportion of threshold reached and the mean outcomes against the delta values
  pdf(paste('plots/compOutcDepThetadist', dist, '.pdf', sep = ''))
  par(mar = c(4.5, 4.5, 4.5, 4.5))
  plot(1, xlim = c(-5, 5), ylim = c(0, 1), type = 'n',
       xlab = 'delta values', ylab = 'prop survived',
       main = 'prop surv. and mean outc for delta values')
  mtext('theta: 1 / (1 + exp(-dfg * delta))')
  grid()
  lines(par.combin[, 2], prop.surv, col = 'red',
        lwd = 1.5)
  abline(h = prop.rand.surv, col = 'red', lty = 2)
  par(new = TRUE)
  plot(par.combin[, 2], mean.outc.value, col = 'blue', lwd = 1.5,
       xlab = NA, ylab = NA, type = 'l', axes = FALSE)
  axis(side = 4)
  mtext(side = 4, line = 3, 'mean outcome')
  abline(h = random.mean, col = 'blue', lty = 2)
  legend('topleft', c('prop threshold reached', 'mean outcomes',
                      'prop threshold reached random',
                      'mean outcomes random'),
         col = rep(c('red', 'blue'), 2), lty = c(1, 1, 2, 2))
  dev.off()
}
For each environment (i.e., set of distributions), we plot the \(\delta\) values on the x axis against two y axes: one for the proportion of games in which the aspiration level was reached and one for the mean outcome.
First, because the code chunk above was not executed here, we need to recreate some variables for the plot, namely the proportion of threshold reached and the mean outcome of a random chooser. The saved simulation values are then read in and the plot is created. The procedure is the same for all environments, which is why the code is only shown once.
To test whether low \(\delta\) values actually lead to more exploration near the goal, run simulations for single agents and plot their behavior. Compared to lower \(\delta\) values, higher \(\delta\) values lead to more exploration while the goal has not yet been reached and to more exploitation once it has. Using negative \(\delta\) values reverses this pattern. Note that the code for the simulations with negative \(\delta\) values is not shown, as it is identical except for the signs of \(\delta\).
Plots for the positive \(\delta\)s.
par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))
set.seed(100) # For replicability
sim1 <- rl.sim.fun(n.trials = 30,          # Trials in game
                   option.mean = c(7, 9),  # Mean of each option
                   option.sd = c(2, 2),    # SD of each option
                   prior.exp.start = rep(0, 3),
                   delta = .2,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,      # If TRUE, generated values are rounded to integers
                   allow.negative = FALSE, # If FALSE, negative values are set to 0
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
set.seed(100) # For replicability
sim2 <- rl.sim.fun(n.trials = 30,
                   option.mean = c(7, 9),
                   option.sd = c(2, 2),
                   prior.exp.start = rep(0, 3),
                   delta = .8,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,
                   allow.negative = FALSE,
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
set.seed(100) # For replicability
sim3 <- rl.sim.fun(n.trials = 30,
                   option.mean = c(7, 9),
                   option.sd = c(2, 2),
                   prior.exp.start = rep(0, 3),
                   delta = 1.5,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,
                   allow.negative = FALSE,
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
set.seed(100) # For replicability
sim4 <- rl.sim.fun(n.trials = 30,
                   option.mean = c(7, 9),
                   option.sd = c(2, 2),
                   prior.exp.start = rep(0, 3),
                   delta = 5,
                   alpha = .3,
                   plot = TRUE,
                   int.values = TRUE,
                   allow.negative = FALSE,
                   n.exp = 1,
                   exp.option.rate = .15,
                   threshold = 250)
mtext("Environment: M: 7, 9; sds 2, exp rate .15", outer = TRUE, cex = 1.5)
Figure 1: Single agent simulations for positive delta values. Environment: means 7, 9; sds 2; exp lambda .15.
Figure 2: Single agent simulations for negative delta values. Environment: means 7, 9; sds 2; exp lambda .15.
# Number of agents
n.agent <- 1000
threshold <- 250
library(yarrr) # Provides the transparent() function used in the plot below
# different parameter combinations:
alphas <- .3
deltas <- seq(-5, 5, .1)
par.combin <- expand.grid(alphas, deltas)
# create a matrix to store the outcome values
outc.mat <- matrix(NA, nrow = n.agent, ncol = nrow(par.combin))
# set the number of trials per game
ntrials <- 30

# environment 1: means, sds, and exponential option
option.means <- c(10, 8)
option.sd <- c(2, 2)
n.exp <- 1
exp.option.rate <- .15

# prepare an empty variable to store the random chooser outcomes in
mean.outc <- NULL

# simulate 1000 games of a random chooser as a baseline
for(xx in 1:1000){

  # prepare variable
  outc <- NULL

  # loop through the trials of a game
  for(yy in 1:ntrials){

    # sample an arm at random and draw an outcome from its distribution
    samp <- sample(1:(length(option.means) + n.exp), 1)
    if(samp <= length(option.means)){
      outc <- c(outc, round(rnorm(1, option.means[samp], option.sd[samp])))
    }
    if(samp > length(option.means)){
      # append the rounded exponential draw (rounding matches int.values = TRUE)
      outc <- c(outc, round(rexp(1, exp.option.rate[samp - length(option.means)])))
    }
  }

  # cut the distribution at 0 (no losses allowed)
  outc[outc < 0] <- 0

  # save the sum of the drawn values
  mean.outc[xx] <- sum(outc)
}

# baseline: proportion of random choosers that reach the threshold, and their mean outcome
prop.rand.surv <- mean(mean.outc >= threshold)
random.mean <- mean(mean.outc)
load('data/outcDepTheta1.RData')
# compute the proportion of survivors (i.e., agents that reached the threshold)
prop.surv <- colMeans(outc.mat >= threshold)

# compute the mean outcome for each parameter combination
mean.outc.value <- colMeans(outc.mat)
# layout: large panel for the plot, small panel for the legend
m <- rbind(1, 1, 1, 1, 1, 2, 2)
layout(m)
par(mar = c(4, 4, 4.5, 4))
plot(1, xlim = c(-5, 5),
     ylim = c(min(prop.surv) - .05,
              ifelse(max(prop.surv) <= .95, max(prop.surv) + .05, 1)),
     type = 'n', xlab = 'delta values', ylab = 'prop survived',
     main = 'prop surv. and mean outc for delta values', cex.lab = 1.5)
mtext('Environment: M: 10, 8; sd: 2; exp dist rate .15, i.e. M: 6.7')
grid()
lines(par.combin[, 2], prop.surv, col = transparent('red', .5),
      lwd = 1.5)
abline(h = prop.rand.surv, col = transparent('red', .5), lty = 2)
par(new = TRUE)
plot(par.combin[, 2], mean.outc.value, col = transparent('blue', .5), lwd = 1.5,
     xlab = NA, ylab = NA,
     ylim = c(min(mean.outc.value) - 1, max(mean.outc.value)),
     type = 'l', axes = FALSE)
axis(side = 4)
mtext(side = 4, line = 3, 'mean outcome')
abline(h = random.mean, col = transparent('blue', .5), lty = 2)
par(mar = c(0, 0, 0, 0))
plot(1, type = 'n', axes = FALSE, xlab = NA, ylab = NA)
legend('center', c('prop threshold reached', 'mean outcomes',
                   'prop threshold reached random',
                   'mean outcomes random'),
       col = rep(c(transparent('red', .5), transparent('blue', .5)), 2),
       lty = c(1, 1, 2, 2), cex = 1.5)
Figure 3: Proportion of threshold reached and mean outcomes for different delta values. Environment 1: means 10, 8; sds 2; exponential distribution with lambda = .15.
Figure 4: Proportion of threshold reached and mean outcomes for different delta values. Environment 2: means 9, 7; sds 2; exponential distribution with lambda = .15.
Figure 5: Proportion of threshold reached and mean outcomes for different delta values. Environment 3: means 8, 6; sds 2; exponential distribution with lambda = .15.