Authors@R: person(“Annice”, “Najafi”, email = “annicenajafi27@gmail.com”)

options(warn=-1)

Finite Stage Models

Gambling Model

References:

Ross, S. M. (2014). Introduction to stochastic dynamic programming. Academic press. Summer 2024

This is an R implementation of the gambling model from Chapter 1, section 2 (“A Gambling Model”) of the “Introduction to Stochastic Dynamic Programming” book by Sheldon Ross. The following text is taken directly from the textbook.

“At each play of the game, a gambler can bet any nonnegative amount up to his present fortune and will either win or lose that amount with probabilities $p$ and $q = 1 - p$, respectively. The gambler is allowed to make $n$ bets, and his objective is to maximize the expectation of the logarithm of his final fortune. What strategy achieves this end?”

# Load libraries
library(ggplot2)
library(shiny)

Define theory and simulation functions

#Define V_n
#' Recursive function returns the maximal expected return
#'
#' @param n number of bets
#' @param x present fortune
#' @param p is the probability of winning, q is the probability of losing
#' hence p = 1-q
#' @param alpha is the percentage of fortune we are betting
#'
#' @return expected return value
#' @export
V_n <- function(n, x, p, alpha) {
  if (n == 0) {
    return(log(x))
  }
  
  
  V_previous <- function(x) {
    V_n(n - 1, x, p, alpha)
  }
  
  expected_reward <- p * V_previous(x * (1 + alpha)) + (1-p) * V_previous(x * (1 - alpha))
  
  return(expected_reward)
}


#Simulate problem
#' Simulate the case where a gambler is betting a percentage of his fortune
#' and 
#' @param n number of bets
#' @param initial_x initial fortune of the gambler
#' @param p probability of winning in each stage
#' @param alpha percentage the gambler bets 
#'
#' @return log of fortune
#' @export
simulate_problem <- function(n, initial_x, p, alpha) {
  x <- initial_x
  
  for (i in 1:n) {
    if (rbinom(1, 1, p) == 0) { #it's similar to flipping a coin with p prob
      #of getting heads
      x <- x * (1 - alpha) #you either get tails and lose
    } else {
      x <- x * (1 + alpha) #or get heads and win
    }
  }
  
  return(log(x))  # return log of fortune
}

Case I) Let’s try for when p is actually more than q

# Params

initial_fortune <- 1000  # Initial fortune
p <- 0.6  # Probability of winning
q <- 1- p #probability of losing
n <- 10  # Number of bets
alpha <- 0.3 #What fraction of his fortune is he betting

# Calculate V_n(x)

res_vector <- c() #store results

#Go over all values
for(alpha in seq(0, p, 0.01)){
  
  result <- V_n(n, initial_fortune, p, alpha)
  
  res_vector <- c(res_vector, result)
  
}

reward.df <- data.frame(alpha = seq(0, p, 0.01), reward = res_vector)

Simulate the first case

# Simulate
num_simulations <- 10000
alpha_values <- seq(0, p, 0.01)

expected_rewards_sim <- numeric(length(alpha_values))


for (j in seq_along(alpha_values)) {
  alpha <- alpha_values[j]
  fortunes <- numeric(num_simulations)
  
  for (i in 1:num_simulations) {
    fortunes[i] <- simulate_problem(n, initial_fortune, p, alpha)
  }
  
  expected_rewards_sim[j] <- mean(fortunes)
}

reward.df_sim <- data.frame(alpha = alpha_values, reward = expected_rewards_sim)

ggplot() +
  geom_point(data = reward.df_sim, aes(x = alpha, y = reward), stroke = 2, color = "#D96098", shape = 8) +
  geom_line(data = reward.df, aes(x = alpha, y = reward), size = 2, color = "#F7DBF0") +
  labs(x = "Alpha", y = "Expected Reward", title = "Gambling model") +
  geom_vline(xintercept = (p - q), linetype="dashed")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")

Case II) Let’s try p<1/2

p <- 0.3  # Probability of winning
q <- 1- p #probability of losing


# Calculate V_n(x)

res_vector <- c() #store results

#Go over all values
for(alpha in seq(0, p, 0.01)){
  
  result <- V_n(n, initial_fortune, p, alpha)
  
  res_vector <- c(res_vector, result)
  
}

reward.df <- data.frame(alpha = seq(0, p, 0.01), reward = res_vector)

Simulate the second case

# Simulate
num_simulations <- 10000
alpha_values <- seq(0, p, 0.01)

expected_rewards_sim <- numeric(length(alpha_values))


for (j in seq_along(alpha_values)) {
  alpha <- alpha_values[j]
  fortunes <- numeric(num_simulations)
  
  for (i in 1:num_simulations) {
    fortunes[i] <- simulate_problem(n, initial_fortune, p, alpha)
  }
  
  expected_rewards_sim[j] <- mean(fortunes)
}

reward.df_sim <- data.frame(alpha = alpha_values, reward = expected_rewards_sim)


ggplot() +
  geom_point(data = reward.df_sim, aes(x = alpha, y = reward), stroke = 2, color = "#D96098", shape = 8) +
  geom_line(data = reward.df, aes(x = alpha, y = reward), size = 2, color = "#F7DBF0") +
  labs(x = "Alpha", y = "Expected Reward", title = "Gambling model") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "B")

To try an interactive version of this problem please visit this link here!

Gambling model with changing win probabilities

Now let’s make the problem more advanced:

This is an R implementation of the gambling model with changing win probabilities from Chapter 1, section 4 (“A Gambling Model With Changing Win Probabilities”) of the “Introduction to Stochastic Dynamic Programming” book by Sheldon Ross.

A gambler that has x fortune, makes n bets wins with probability p At each stage the gambler has p probability of winning, with alpha probability of winning for previous stages taken from a distribution F. The gambler then decides to bet y from 0 to his current fortune (x). Goal is to maximize the expectation of the log of his fortune.

NOTE: utility function taken to be u(x) = log(x) like before, adding 1 to prevent errors.

NOTE: alpha is not the percentage we are betting but rather the probability of winning at the previous stage, don’t confuse this with the alpha in the gambling model in section 2.

Let’s start by loading relevant libraries

#Load relevant libraries
library(ggplot2)
library(truncnorm)

Define relevant functions

# Define the probability density function, going with the truncated normal dist 
#' probability density function for the truncated normal dist
#'
#' @param alpha the winning probability
#' @return the pdf
#' @export
f <- function(alpha) {
  
  return(dtruncnorm(alpha, a = 0, b = 1, mean = 0.5, sd = 0.1))
  
}

#Great now lets define the utility function
#' Utility function
#'
#' @param x 
#' @return log(x+1), adding 1 because if the gambler bets all of his fortune
#' then we will get an error because log(0) is undefined
#' @export
u <- function(x){
  
  return(log(x+1))
  
}

#' Recursive function that returns the maximum expected reward
#'
#' @param n how many times we are betting
#' @param x the fortune
#' @param p the probability of winning
#'
#' @return the maximal expected reward
#' @export
V <- function(n, x, p) {
  
  # Obviously first we define the base case
  if (n == 0) {
    return(u(x))
  }
  
  #alpha <- rtruncnorm(1, a = 0, b = 1, mean = 0.5, sd = 0.1) # This is how you would have taken alpha randomly from a truncated normal distribution. 
  
  hold.integrand_win  <- function(alpha) {
    V(n - 1, (x + y), alpha) * f(alpha)
  }
  hold.integrand_lose <- function(alpha) {
    V(n - 1, (x - y), alpha) * f(alpha)
  }
  
  max_y <- 0  # Initialize max_y
  
  #Find the best y:
  
  for (y in 0:x) {
  
    
    val <- p * integrate(hold.integrand_win, 0, 1)$value +
      (1 - p) * integrate(hold.integrand_lose, 0, 1)$value
    

    # Use max to find the maximum value
    max_y <- max(max_y, val)
  }

  return(max_y) #Return the maximal reward found through searching all y values
}

Calculate the maximal expected reward

# Initial fortune
x_initial <- 10
# Initial win probability
p_initial <- 0.5
# Number of plays
n <- 3

c()->return_vect

for(p_initial in seq(0.5, 1, 0.05)){
  
  # Calculate maximal expected final utility
  max_expected_utility <- V(n, x_initial, p_initial)
  return_vect <- c(return_vect, max_expected_utility)
}

df <- data.frame(p= seq(0.5, 1, 0.05), return=return_vect)

ggplot(df, aes(x=p, y=return_vect))+geom_line(size=2, color = "#538392")+
  geom_point(color="#538392", shape=8, stroke=3)+
  labs(x="Probability of winning", y="Return")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A")

hold.vect <- c()
x<-10
p<-0.8
n<-3
for (y in seq(0, x, 1)){
  
  hold.val <- V(n - 1, x + y, p) - V(n - 1, x - y, p)
  hold.vect <- c(hold.vect, hold.val)
}


y_increasing <- data.frame(y = seq(0, x, 1), equation = hold.vect)

ggplot(y_increasing, aes(x=y, y=equation))+geom_line(size=2, color = "#538392")+
  geom_point(color="#538392", shape=8, stroke=3)+
  labs(x="y", y="Return")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="B")

Stock option model

This is an R implementation of the stock option model from Chapter 1, section 3 (“A Stock-Option Model”) of the “Introduction to Stochastic Dynamic Programming” book by Sheldon Ross.

An option for a stock is the ability to buy a stock for a pre-determined price. That means you can pay some money to have the right to buy a specific share of a company for a pre-determined price even if the stock price increases later on.

First let’s start by loading relevant libraries

#Load libraries
library(stats)
library(latex2exp)
library(ggplot2)

Define the maximum expected return function

# Define the probability density function
f <- function(x) {
  dnorm(x, mean = 0, sd = 1)
}

#Define V_n
#' Recursive function returns the maximal expected return
#'
#' @param n number of days
#' @param s current price of the stock
#' @param c past price that we buy the stock at
#' @return maximal expected return value
#' @export
V <- function(n, s, c) {
  
  if (n == 0) {
    return(max((s-c), 0))
  }
  
  
  #Remember F(x) is f(x)dx so we can multiply by the pdf and integrate with
  #respect to x.
  hold.integrand <- function(x) {V(n - 1, s + x, c) * f(x)}
  
  expected_future_value <- integrate(hold.integrand, -Inf, Inf)$value
  
  # Two possible scenarios here
  # Case I) immediately sell the option
  # Case II) wait and see what happens, hoping for better future value
  return(max(s - c, expected_future_value))
}

Let’s test

# Parameters
initial_stock_price <- 10  # Initial stock price
c <- 8  # stock purchase price
n <- 3  # Number of days

result <- V(n, initial_stock_price, c)

Proving Lemma 3.1

s_values <- seq(0, 100, by = 5)

V_n_s <- c()
for(i in 1:length(s_values)) {
  s <- s_values[i]
  V_n_s_value <- V(n, s, c)
  V_n_s <- c(V_n_s , V_n_s_value)
}

hold.df <- data.frame(s = s_values, V_n_s = (V_n_s_value - s_values))

ggplot(hold.df, aes(s, V_n_s))+
  geom_line(size=2, color = "#538392")+
  labs(x="s", y=TeX("$V_n(s) - s$"))+
  ggtitle(TeX("$V_n(s) - s$ is decreasing in s"))+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A")

We just showed Lemma 3.1 using R!

Optimal allocation problems

The following R code is related to the special case of the problem - “Optimal Allocations In the Construction of k-out-of-n Reliability Systems” - by Derman et al, 1974.

References:

Derman, C., Lieberman, G. J., & Ross, S. M. (1974). Optimal allocations in the construction of k-out-of-n reliability systems. Management Science, 21(3), 241-250.

I am going to start this section by considering the most basic problem which is the situation where we are considering money that gets allocated for an n component system that is getting built. Assume the case where we need to have at least one successful component for the system to work.

Case k=1

Let’s think about this, if we are allocating money simultaneously then we are essentially trying to maximize the probability of one component being successful. That would be $1-(1-p(x_1))(1-p(x_2))...(1-p(x_n))$ ($1 - \prod_1^n(1-p(x_i))$) with $x_i$ being the money allocated at the ith stage and $i$ from 1 to n (total number of stages).

The probability of success of the component depends on how much we invest. If we invest 0 dollars, the probability of success is 0 meaning p(0)=0.

Now consider the case where we are unaware of whether a component will be successful or not until it is built. In that case we start with the first component and the probability of success of the component would be p(x_1). If it is unsuccessful then we move on to the next component so its probability would be conditioned on the first component being unsuccessful which is $p(x_1)(1-p(x_2))$ and so on ($p(x_1) + \sum_{i=2}^n \prod_{j=1}^{i-1} (1 - p(x_j)) p(x_i)$).

Below we are going to answer the question of whether investing all of our money into the first task is optimal or dividing our money into all tasks equally and invest in each of them. We take two cases where p(x) is concave and convex and then proceed by increasing the number of components we would need to be successful (k).

Load relevant libraries

library(Deriv)
library(ggplot2)

Concave p(x) function

# Define the function for probability of success
P <- function(x) {
  alpha <- 0.1 
  return((exp(alpha * x) - 1) / (exp(alpha * x) + 1))
}

#If you take the log of the equation we talked about above for the 
#non-sequential case
alpha <- 0.1 
log_expr <- expression(log(1 - (1 - (exp(alpha * x) - 1) / (exp(alpha * x) + 1))))
# and find the first derivative of log(1 - P(x)) with respect to x
first_derivative <- Deriv(log_expr, "x")

# and then the second derivative of log(1 - P(x)) with respect to x
# to check if it is concave or convex - remember that a function is 
# concave if its second derivative is negative and convex if positive
second_derivative <- Deriv(first_derivative, "x")

x <- seq(0, 100, 1) #assume the amount we are investing is between 0 and 100
evaluated_second_deriv <- eval(second_derivative)

#Which we see here that the function is actually concave
ggplot(data.frame(x=x, y=P(x)), aes(x=x, y=y))+geom_point(stroke=2,shape=8, color = "#D96098")+
  geom_line(size=2, color = "#D96098")+
  labs(x="x", y="p(x)")+
  ggtitle("p(x) is non-decreasing in x")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A")->plt.A

plt.A

ggplot(data.frame(x=x, y=evaluated_second_deriv), aes(x=x, y=y))+
  geom_point(stroke=2,shape=8, color = "#D96098")+
  geom_line(size=2, color = "#D96098")+
  geom_hline(yintercept=0, linetype="dashed", size=2)+
  labs(x="x", y="p\"(x)")+
  ggtitle("p(x) is a concave function")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="B")->plt.B

plt.B

$Strategy comparison$

Strategy comparison

Define our strategies

Again, simulating the success of the outcome is like flipping a coin that has a p(x) probability of getting heads, if p(x) is higher then the probability of getting heads is higher.

#Let's say I am taking the strategy of allocating A/N for each component, 
#essentially I am dividing my total money (A) over all N component

#' Simulate problem under strategy 1
#'
#' @param A all of the money we are investing
#' @param n the number of stages
#'
#' @return whether that one task is completed successfully or not
#' @export
simulate_problem_strategy_1 <- function(A, n){
  
  count.success <- 0
  for(i in 1:n){
    if(rbinom(1, 1, P(A/n)) == 1){
      count.success <- count.success + 1
    }
  }
    
  if(count.success > 1){
    return(1)
  }else{
    return(0)
  }
  
}

#Second strategy is to put all of money down for the first task
#' Simulate problem under strategy 2
#' 
#' @param A all of the money we are investing
#' @param n the number of stages
#'
#' @return whether that one task is completed successfully or not
#' @export
simulate_problem_strategy_2 <- function(A, n){
  
  if(rbinom(1, 1, P(A)) == 0){
    return(0)
  }else{
    return(1)
  }
}

Optimal strategy for a concave p(x)

A <- 10
n <- 100
num_simulations <- 1000

numeric(num_simulations) -> strategy_1.rewards
numeric(num_simulations) -> strategy_2.rewards

for(k in 1:num_simulations){
  simulate_problem_strategy_1(A, n) -> strategy_1.rewards[k]
  simulate_problem_strategy_2(A, n) -> strategy_2.rewards[k]
}

ggplot(data.frame(name=c("Equally distributed", "All in 1"),
                  success_rate = c(sum(strategy_1.rewards)/num_simulations, sum(strategy_2.rewards)/num_simulations)),
       aes(x = name, y = success_rate))+
  geom_bar(stat = "identity", width=0.2, fill= "#D96098")+
  labs(x="Strategy", y="Success Rate")+
  ggtitle("Optimal solution")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))

Let’s try a case where p(x) is a convex function

Convex p(x) function

# Define the function for probability of success
P <- function(x) {

  return(0.01 * sqrt(x))
}
log_expr <- expression(log(1 - 0.01 * sqrt(x)))
# and find the first derivative of log(1 - P(x)) with respect to x
first_derivative <- Deriv(log_expr, "x")

# and then the second derivative of log(1 - P(x)) with respect to x
# to check if it is concave or convex
second_derivative <- Deriv(first_derivative, "x")

x <- seq(0, 100, 1)
evaluated_second_deriv <- eval(second_derivative)

#Which we see here that the function is actually concave
ggplot(data.frame(x=x, y=P(x)), aes(x=x, y=y))+geom_point(stroke=2,shape=8, color = "#D96098")+
  geom_line(size=2, color = "#D96098")+
  labs(x="x", y="p(x)")+
  ggtitle("p(x) is non-decreasing in x")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A")->plt.A

plt.A

ggplot(data.frame(x=x, y=evaluated_second_deriv), aes(x=x, y=y))+
  geom_point(stroke=2,shape=8, color = "#D96098")+
  geom_line(size=2, color = "#D96098")+
  geom_hline(yintercept=0, linetype="dashed", size=2)+
  labs(x="x", y="p\"(x)")+
  ggtitle("p(x) is a convex function")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="B")->plt.B

plt.B

Optimal strategy for a convex p(x)

A <- 100
n <- 100
num_simulations <- 1000

numeric(num_simulations) -> strategy_1.rewards
numeric(num_simulations) -> strategy_2.rewards

for(k in 1:num_simulations){
  simulate_problem_strategy_1(A, n) -> strategy_1.rewards[k]
  simulate_problem_strategy_2(A, n) -> strategy_2.rewards[k]
}


ggplot(data.frame(name=c("Equally distributed", "All in 1"),
                  success_rate = c(sum(strategy_1.rewards)/num_simulations, sum(strategy_2.rewards)/num_simulations)),
       aes(x = name, y = success_rate))+
  geom_bar(stat = "identity", width=0.2, fill= "#D96098")+
  labs(x="Strategy", y="Success Rate")+
  ggtitle("Optimal solution")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))

k>1

Now let’s increase the number of successful components needed for the system to work to k…

#' Simulate problem under strategy 1
#'
#' @param A all of the money we are investing
#' @param n stages (total components)
#' @param k components we would like to have successfully built
#'
#' @return whether k successful components are built or not
#' @export
simulate_problem_strategy_1 <- function(A, n, k){
  
  count.success <- 0
  for(i in 1:n){
    if(rbinom(1, 1, P(A/n)) == 1){ 
      count.success <- count.success + 1
    }
  }
    
  if(count.success >= k){
    return(1)
  }else{
    return(0)
  }
  
}

#Second strategy is to put all of money down for the first k tasks
#' Simulate problem under strategy 2
#'
#' @param A all of the money we are investing
#' @param n stages (total components)
#' @param k components we would like to have successfully built
#'
#' @return whether k successful components are built or not
#' @export
simulate_problem_strategy_2 <- function(A, n, k){
  count.success <- 0
  for(i in 1:k){
    
    if(rbinom(1, 1, P(A/k)) == 1){
      count.success <- count.success + 1
      
    }
    
  }
  if(count.success >= k){
    return(1)
  }else{
    return(0)
  }
  
  
}

Simulate problem for k>1 successful components needed

A <- 10000
n <- 50
num_simulations <- 1000

numeric(num_simulations) -> strategy_1.rewards
numeric(num_simulations) -> strategy_2.rewards

storing.strategy.1.sums <- numeric(10)
storing.strategy.2.sums <- numeric(10)

k.index <- 1

for(k in seq(1, n, 2)){
  
  for(p in 1:num_simulations){
  simulate_problem_strategy_1(A, n, k) -> strategy_1.rewards[p]
  simulate_problem_strategy_2(A, n, k) -> strategy_2.rewards[p]
  }
  sum(strategy_1.rewards)->storing.strategy.1.sums[k.index]
  sum(strategy_2.rewards)->storing.strategy.2.sums[k.index]

  k.index <- k.index + 1
}

Comparison of strategies

df.k <- data.frame(k = seq(1, 50, 2), strategy_1 = storing.strategy.1.sums/num_simulations, strategy_2 = storing.strategy.2.sums/num_simulations)

ggplot() +
  geom_line(data = df.k, aes(x = k, y = strategy_1, colour = "Equally distributed"), size = 2) +
  geom_point(data = df.k, aes(x = k, y = strategy_1, colour = "Equally distributed"), shape = 8, stroke = 2) +
  geom_line(data = df.k, aes(x = k, y = strategy_2, colour = "All in k"), size = 2) +
  geom_point(data = df.k, aes(x = k, y = strategy_2, colour = "All in k"), shape = 8, stroke = 2) +
  
  labs(x = "k (n=50)", y = "Success Rate", colour = "Strategy") +
  ggtitle("Optimal solution") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  scale_color_manual(values = c("Equally distributed" = "#D96098", "All in k" = "#BEAEE2"))

Interesting! We see that the first strategy (plotted in pink) is optimal even if we increase the number of successful components k when p(x) is convex.

On special case: p(x) = x

Problem is taken from Derman et al, 1974.

Let’s consider the special case when P(x)=x and we want to maximize the probability of getting two components successfully built. Below, we simulate the case and compare to the theory mentioned in the paper.

The optimal policy $\pi$ is said to allocate $y/r$ - with $y$ being the amount of our present money and $r$ the number of stages left - when we have not yet built a successful component yet and allocate $y$ - our entire money - once we have one component successfully built.

#' Probability of successfuly constructing a component with
#' investment x
#'
#' @param x the amount of investment
#'
#' @return either the probability of succeeding or 1 in case we invest more money than needed
#' @export
P <- function(x){
  
  return(min(x, 1)) 
  
}


#Let's simulate the problem 

#' Simulate the special case problem
#'
#' @param A the total amount of money we can invest 
#' @param n the number of stages
#'
#' @return whether two components will be successfully built or not
simulate_problem <- function(A, n){
  
  successful.components <- 0 #Track successful components
  y <- A #How much money we are starting with 
  
  for(stage in 1:n){
    
    if(successful.components==0){ #we allocate y/r if nothing has been built yet
      x <- y/(n-stage+1)
    }
    if(successful.components==1){ #allocate all of our money once we have one component built
      x<-y
    }
    
    if(y==0){ #if we don't have any money then we can't invest
      x<-0
    }
    #at each stage simulate whether the component will be successfully built or not
    p <- P(x)
    if(rbinom(1, 1, p)==1){ 
      successful.components <- successful.components +1 
      
    }
    if(successful.components==2){
      return(TRUE)
    }
    y <- y - x #of course we lose the money we invest at each stage 
 
  }
  
  return(FALSE)
    
}

# #Equation is taken from Derman et al, 1974
# 
# #' Theory equation of the probability of getting 2 components successfully built under that policy
# #'
# #' @param n the number of stages
# #' @param y the total amount of money we invest
# #'
# #' @return the probability of successfully building two components
# #' @export
# U <- function(n, y){
#   
#   return((1-y/n)^n + y -1)
#   
# }

Compare simulations and theory

We are comparing simulations and theory for different investment amounts. Obviously we expect to see the probability increase for higher investment amounts.

#Do a 1000 simulations
sim.vec.A <- c()

for(A in seq(0, 1, 0.01)){
  
  sim.vec <- c()
  
  for(k in 1:1000){
    sim.vec <- c(sim.vec, simulate_problem(A, 10))
  }
  
  sim.vec.A<-c(sim.vec.A,sum(sim.vec)/1000)
  
}

# #Plug into the theoretical equation
# theory.vec <- c()
# for(y in seq(0, 1, 0.01)){
#   
#   theory.vec <- c(theory.vec, U(10, y))
#   
# }

Writing the optimality equation

I would suggest you pause at this point and try to write the optimality equation before seeing the solution below.

#' Optimality equation function
#'
#' @param y the amount of money we are investing
#' @param n the number of stages
#'
#' @return probability of building two successful components
#' @export
V <- function(y, n){
  
  #As always we write the boundary conditions first
  #We know that we have not defined the boundary conditions correctly if we get a C stack limit error. 
  if(y==0){
    return(0)
  }
  if(n==0){
    return(0)
  }
  
  #Check which x will maximize the probability of getting two components successfully built.
  max.x <- 0
    
  for(x in seq(0, y, 0.1)){
    
    p <- P(x)
    
    #Either we successfully build a component and invest our entire money in the next task and the process basically stops or we fail and we continue the same process with one less stage and less money (the money that we invested is now lost). 
    hold.reward <- max(p*(y-x) + (1-p)*V((y-x), n-1))
    
    max.x <- max(max.x, hold.reward)
    
  }

  return(max.x)
  
}


#Plot theory vs simulations
theory.vec <- c()
for(y in seq(0, 1, 0.01)){
  
  theory.vec <- c(theory.vec, V(y, 10))
  
}



res.df <- data.frame(y=seq(0, 1, 0.01), simulations = sim.vec.A, theory = theory.vec)

ggplot(res.df)+geom_point(aes(x=y, y=simulations), stroke=2
                              , color = "#D96098", shape=8)+
  geom_line(aes(x=y, y=theory), size = 1,color =  "#F7DBF0")+
  labs(x = "Investment money (y)", y = "Prob of building \n2 successful components", title = "special case p(x)=x") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")

Coming soon…

Part II of these series for more advanced problems on optimal allocation.

Sequential allocation model

Sequential allocation model with certainty

We are going to start by considering a simple case, let’s say we have $N$ periods to invest and at each period we invest a certain amount of money, $y$, (we invest with certainty at each period, our total money is $A$). The return that we get is $R(y)$. The return is not going to be available for investment in the future periods and we get it at the end of the $N$ periods. How much do we invest at each stage to maximize our total return.

Okay, first let’s define the $R(x)$ function. This is quite similar to $p(x)$ in the problems we covered before. I am going to use a simple convex function, $R(x) = \sqrt{x}$.

#' Return function
#'
#' @param x investment amount
#'
#' @return the return from investment
#' @export
R <- function(x){
  
  return(sqrt(x))
  
}

Let’s think of how we can write the optimality equation. The base case is when we are at the last period, we invest and we get R(x) back. Overall, we try to maximize the expected return we get from the current period and future periods. So technically we are thinking $V_n(A) = \max_{0 \le y \le A}[R(y) + V_{n-1}(A-y)]$. Let’s write a function for theory:

#' optimality equation function
#'
#' @param n number of stages
#' @param A total amount of money we can invest
#'
#' @return the expected return
#' @export
V <- function(n, A){
  
  if(n==1 || A==0){ #define boundary conditions
    return(R(A))
  }
  
  #Find the y that maximizes the function
  hold.max <- 0
    
  for(y in seq(0, A, 0.2)){
    
    expected_reward <- max(R(y)+V(n-1, A-y))
    max(expected_reward, hold.max)->hold.max
    
  }
  
  return(hold.max)
  
}

Let’s think about the optimality equation a little bit more. If I am at the boundary, I invest $y$ amount of money and get $R(y)$ as a return (and that is because we only have one action, we invest with certainty). Now if I had two periods to go then I know $V_2(A) = \max_{0 \le y \le A}[R(y) + V_{1}(A-y)]$. $V_{1}(A-y)$ is essentially $R(A-y)$. So basically the question is how do I pick y such that $R(y)+R(A-y)$ is maximized which would be $\sqrt(y)+\sqrt(A-y)$. To find out where the maximum happens we can see where the derivative is 0. $\frac{1}{2} y^{-1/2} + \frac{1}{2} (A-y)^{-1/2} = 0$. If we cancel out the $\frac{1}{2}$ then we get $A-y=y$ which gives us $y=A/2$.

Let’s move to the next stage. we are trying to find the $y$ that maximizes the optimality equation.

$V_3(A) = R(y)+V_2(A-y)$. We know that $V_2(A-y) = \sqrt{2}\sqrt{A-y}$. To find the $y$ that maximizes $V_3(A)$, we set its derivative to zero and we end up with: $\sqrt{2(A-y)} = \sqrt{2y}$. Then we get $y=A/3$. By induction we see that the optimal strategy $\pi$ is to equally distribute our money over stages ($y=A/N$).

So essentially, the total return would be $\sqrt{n*A}$ with $n$ being the number of stages and A the total amount of money we are investing.

theory_vec <- c()
formula_vec <- c()

stages <- 5

for(A in seq(0, 10, 1)){
  
  theory_vec <- c(theory_vec, V(stages, A))
  formula_vec <- c(formula_vec, stages*sqrt(A/stages))
}

res.df <- data.frame(y = seq(0, 10, 1), theory = theory_vec, formula = formula_vec)
ggplot(res.df)+geom_point(aes(y, theory), stroke=2
                          , color = "#D96098", shape=8)+ 
  geom_line(aes(y, formula), size = 1,color =  "#F7DBF0")+
  labs(x = "Investment money (y)", y = "Return", title = "Deterministic case") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")

Sequential allocation model with uncertainty

Let’s add some uncertainty to the situation. What if at each stage, there is an opportunity to invest that comes up. If the opportunity to invest does not come up then we cannot invest. That probability is $p$. Let’s revise the optimality equation.

We can consider the occurrence of the opportunity to invest with an indicator function ($I$). The indicator function returns 1 when the opportunity to invest comes up else it returns 0.

$V_n(A, 1) = R(y) + p V_{n-1}(A-y, 1) + (1-p) V_{n-1}(A-y, 0)$,

$V_n(A, 0) = V_{n-1}(A, 0)(1-p) + V_{n-1}(A, 1)p$

#' Optimality equation for the sequential allocation model with uncertainty
#'
#' @param n number of stages
#' @param A total amount of money for investment
#' @param I the indicator function, input can be either 1 or 0
#' @param p the probability of the investment opportunity arising
#'
#' @return the expected reward
#' @export
V <- function(n, A, I, p){
  
  #Define boundary conditions
  if((n==1 || A==0)&&I==0){
    return(0)
  }
  if((n==1 || A==0)&&I==1){
    return(R(A))
  }
  
  #Search for the y that maximizes the bellman equation
  hold.max <- 0
  
  for(y in seq(0, A, 0.2)){
    
  if(I == 0){
    expected_reward <- V(n-1, A, 0, p)*(1-p) + p*V(n-1, A, 1, p)
  }
  else{
    expected_reward <- R(y) + p*V(n-1, A-y, 1, p) + (1-p)*V(n-1, A-y, 0, p)
  }

  max(expected_reward, hold.max)->hold.max
  }
  
  return(hold.max)
  
}

Coming soon, more on sequential allocation problems…

Optimal stopping problems

Finding the best offer

Imagine you are presented with n offers in sequential order. You don’t know the value of the offer $i$ until you are at stage $i$ and the offer is presented to you. Therefore, your only option to realize how good or bad the offer is, is to compare it with the previous offers. You have the option to either reject the offer or accept the offer. If you accept it the game will stop and you have either selected the best offer and win or not selected the offer and lose. If you reject it then you move on to the next offer. This is a famous problem known as the “Optimal stopping problem” or the “Secretary problem”. There is a great video that explains the mathematics behind this problem by Numberphile. It explains how we can select the best toilet at a music festival

Find the YouTube video here.

The overall strategy of such problems is to keep rejecting the offers up to stage $k$ and then start comparing the offers to anything that came before stage $k+1$. If any offer is superior then we accept it, otherwise we keep rejecting and moving forward.

With that let’s simulate the situation.

#' function for simulating the optimal stopping problem
#'
#' @param offers the sequence of offers presented
#' @param i the stage of the offer
#' @param n the total number of stages
#'
#' @return the offer selected
#' @export
select_offer <- function(offers, i, n){
  
  for(j in (i+1):n){
    
    if(offers[j]>max(offers[1:i])){

      return(j)
    }
  
  }
  return(n)

}

Simulate the situation

n <- 1000 #The number of offers

hold.diff.i <- c()

for(i in seq(100, 500, 10)){ #This is the point where we stop rejecting
  
hold.res <- c()

for(k in 1:5000){ #Repeat 5k times for each i
  
offers <- rnorm(1000) #Generate random offer scores


hold.vec <- c()
selected.offer <- c()
    
select_offer(offers, i, n) -> hold.offer.idx
  
hold.vec <- c(hold.vec, hold.offer.idx)
selected.offer <- c(selected.offer, offers[hold.offer.idx])

#Find out if our strategy was successful in finding the best offer
#or not
hold.res<-c(hold.res, (selected.offer==max(offers)))

}

#Find the success rate
hold.diff.i <- c(hold.diff.i, sum(hold.res)/5000) 

}

Find the success rates for different stage number ($i$) where you stop rejecting the offer and start comparing.

results.df <-  data.frame(i =  seq(100, 500, 10), success_rate = hold.diff.i)


ggplot(results.df)+geom_point(aes(x=i, y=success_rate), stroke=2
                              , color = "#D96098", shape = 8)+
  labs(x = "i", y = "Success rate", title = "Best offer") +
  geom_vline(xintercept = floor(n * 1 / exp(1)), linetype="dashed")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")

Interesting, you see that even in simulations it is clear that the success rate is highest when we are stage $n/e$.

Secretary problem with the possibility of rejection

Reference: The theoretical solution to this problem is taken from Smith, 1975.

Consider the optimal stopping problem with the possibility of rejection. Imagine that we are trying to hire a secretary. There are a total of $N$ secretaries and at the end of each interview, we can decide to either reject or accept the job applicant for the position. Either we select the applicant and the process is over or we reject the applicant and move on to the next one. We can’t go back to an applicant we had rejected in the past. In this particular problem, we are considering the case where the secretary has a choice of either accepting or rejecting the opportunity if given to him/her. The probability of the secretary accepting the offer is $p$. Question is when should be stop rejecting the secretaries and start accepting to increase our chances of finding the best secretary?

Let’s start by simulating the problem

#' Simulate the secretary problem with the possibility of rejection. 
#'
#' @param offers the set of offers, in this case we can consider the scores of the secretaries with 100 being the best and 1 the worst secretary. 
#' @param i denotes at what stage we stop rejecting and start comparing to the previous candidates
#' @param p the probability of the secretary accepting the offer
#'
#' @return the applicant that accepts the offer
#' @export
simulate_problem <- function(offers, i, p){
  
  for(j in (i+1):length(offers)){
    if(offers[j] > max(offers[1:i])){
      prob <- rbinom(1, 1, p)
      if(prob==1){
        return(j)
      }
    }
  }
  return(length(offers))
  
}

#Simulate the problem 0.5k times

num_simulations <- 500

matrix(NA, nrow = length(seq(2, 1000, 10)), ncol = 3)->store.mat

count <- 1

for(p in c(0.5, 0.7, 0.9)){

success.rate.vec <- c()

for(N in seq(2, 1000, 10)){ #try different stages at which we stop rejecting
 
success.rate <- 0

for(simulations in 1:num_simulations){
  
  offers <- sample(seq(1, 1000, 1))
  
  store.res <- simulate_problem(offers, N, p)
  
  if(offers[store.res]==1000){
    
    success.rate <- success.rate + 1
    
  }
  
}

#find the success rate
success.rate/num_simulations -> success.rate

success.rate.vec <- c(success.rate.vec, success.rate)


}

store.mat[,count] <- success.rate.vec

count <- count + 1
}

Plot the simulation results

as.data.frame(store.mat)->store.mat

colnames(store.mat)<-c("P = 0.5", "P = 0.7", "P = 0.9")

store.mat$r <- seq(2, 1000, 10)

#' The stage at which we stop rejecting
#'
#' @param p the probability of the secretary accepting the job
#'
#' @return the stage which we stop rejecting
#' @export
r <- function(p){
  
  return(p^(1/(1-p)))
  
}


ggplot()+
  geom_point(data = store.mat, aes(x=r, y=`P = 0.5`, colour = "p = 0.5"), shape = 8, stroke = 2)+
  geom_point(data = store.mat, aes(x=r, y=`P = 0.7`, colour = "p = 0.7"), shape = 8, stroke = 2)+
  geom_point(data = store.mat, aes(x=r, y=`P = 0.9`, colour = "p = 0.9"), shape = 8, stroke = 2)+
  geom_point(aes(x = r(0.5)*1000, y = r(0.5)), color = "#EE99C2", shape = 8, stroke = 5)+
  geom_point(aes(x = r(0.7)*1000, y = r(0.7)), color = "#D96098", shape = 8, stroke = 5)+
  geom_point(aes(x = r(0.9)*1000, y = r(0.9)), color = "#86469C", shape = 8, stroke = 5)+
  geom_line(aes(x = r(seq(0, 0.99, 0.01))*1000, y = r(seq(0, 0.99, 0.01)), colour = "Optimal solution"))+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  labs(x = "r", y = "Success rate", colour = "")+
  scale_color_manual(values = c("p = 0.5" = "#FFCDEA", "p = 0.7" = "#FB9AD1", "p = 0.9" = "#BC7FCD", "Optimal solution" = "#00224D"))

Secretary problem with recall

The special example below is taken from the “Algorithms to live by” book by Brian Christian and Tom Griffiths.

Let’s consider another variation. Let’s assume that the secretary may be recalled at a later time but the chance of the secretary accepting the offer would then be 50% at a later time (the probability of the acceptance of a belated offer is lower than an immediate offer). We start simulating this basic case and then move on to more advanced variations of the secretary problem.

According to the book, in this case of the optimal stopping problem, the optimal strategy $\pi$ would be to stop rejecting after we interview 61% of the offers and start comparing offers afterwards.

#' Simulate the problem
#'
#' @param offers the score of offers
#' @param i the stage at which we stop rejecting
#' @param p the probability of the acceptance of a belated offer
#'
#' @return the final accepted offer
#' @export
simulate_problem <- function(offers, i, p){
  
  for(j in (i+1):length(offers)){
    
    if(offers[j] > max(offers[1:i])){
      
      return(offers[j])
      
    }
    
  }
  
  heads <- rbinom(1, 1, p) #try to see if a belated offer is accepted or not
  
   if(heads==1){
     
      return(max(offers[1:i]))
     
   }else{ #if not return the last offer
     
     return(offers[length(offers)])
   }
  
  
}

#simulate the situation 5k times
num_simulations <- 5000

hold.offer.sum <- c()

for(i in 30:99){ #try different stages of when we stop rejecting
  
  hold.accepted.offer <- c()
  
  for(k in 1:num_simulations){
    
    offers <- sample(seq(1, 100, 1))
    hold.accepted.offer <- c(hold.accepted.offer, (simulate_problem(offers, i, 0.5) == 100))
    
  }
  
  hold.offer.sum <- c(hold.offer.sum, sum(hold.accepted.offer))
  
}

res.df <- data.frame(i = 30:99, success_rate =  hold.offer.sum/num_simulations)

ggplot(res.df, aes(x = i, y = success_rate))+geom_point(color = "#D96098", shape = 8, stroke = 2)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  geom_vline(xintercept = 61, linetype="dashed")+
  labs(x = "r", y = "Success rate")

Full information secretary problem

Megan is looking for a man in finance with trust fund and blue eyes who is also 195 cm tall. Suppose there are a sequence of men in finance who have trust funds and blue eyes and Megan is trying to find the tallest among them (their heights range from 145cm to 195cm). Megan starts by dating the first man, she needs to make a decision of either rejecting the man and moving to the next candidate or settling down. If she settles down, then the problem stops. However, if she decides to reject the man, she will move on to the next man in the sequence of men.

If Megan rejects a candidate, she cannot return to him later; her decision is final. Question is how can Megan maximize her chances of finding the tallest man among the candidates?

To maximize her chances of ending up with the tallest man, Megan employs a strategy similar to the “full information” version of the secretary problem. In this version, Megan knows the height of each man (relative to all men) after dating them, which allows her to make an informed decision at each step. Her goal is to choose the tallest man possible among the candidates. The challenge, however, lies in deciding when to stop searching and settling down. If she settles too early, she might miss out on a potentially taller man who appears later in the sequence. If she waits too long, she risks ending up with a shorter candidate towards the end.

To optimize her chances, Megan uses a threshold-based strategy. At each stage depending on the number of candidates she has left to date, she considers the probability of finding a taller man if she rejects the current candidate. To understand the problem better, let’s consider the problem backwards. If Megan has rejected all candidates and is down to the last candidate then she has no other choice than to select him as her partner. If she were at the candidate who is one candidate to the last one, she can check to see if his height exceeds the 50th percentile of the heights of men in the general population. If the candidate’s height is less than the 50th percentile then it would be better to reject the man and move on to the last candidate because there is a 50% chance that the last candidate’s height falls above the 50th percentile. If Megan was at the two to the last candidate stage then she would want to only reject the candidate if the candidate’s height falls below a threshold for which moving to the next candidate has a greater than 50% chance of finding a better candidate. Mathematically that means $1-(0.5)^{1/2}=0.5$ which gives her a threshold of around ~70%. Likewise she can find the particular threshold of rejection for each stage k to be $0.5^{1/k}$.

Below we find thresholds of rejection

#' Finding the threshold for rejection 
#'
#' @param num.applicants number of total applicants
#'
#' @return the threshold of rejection for each stage
#' @export
find.threshold <- function(num.applicants){
  
  c() -> hold.thresholds 
  
  for(k in 1:num.applicants){
    
    hold.thresholds  <- c(hold.thresholds , 0.5^(1/k))
    
  }
  return(hold.thresholds)
  
}

thresholds <- find.threshold(10)

df.threshold <- data.frame(applicant = seq(0, 10, 1), threshold = c(0, thresholds))

ggplot(df.threshold, aes(x = applicant, y = threshold))+geom_line(size=2, color = "#538392")+
  labs(x="Number of applicants", y="Threshold")+
  ggtitle("Threshold of Rejection")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A")

Simulate the problem

#' Simulate the Megan problem
#'
#' @param candidate.heights height of candidates
#' @param reference.heights percentile of candidates from general population,
#' should be a dataframe and have two columns named Percentile and Number. Percentile
#' representing the specific percentile and Number representing the height value
#'
#' @return the position of the candidate Megan chooses
#' @export
simulate.megan.problem <- function(candidate.heights, reference.heights){
  
  num.applicants <- length(candidate.heights)
  
  find.threshold(num.applicants) -> thresholds
  
  candidate.position <- num.applicants - 1
  
  for(candidate.height in candidate.heights[-num.applicants]){

    if(candidate.height > (reference.heights[which.min(abs(reference.heights$Percentile - thresholds[num.applicants]*100)),])$Number){
      
      return(which(candidate.heights==candidate.height)[1])
      
    }
      
    candidate.position <- candidate.position - 1
  }
  
  return(num.applicants)
}


#Let's find the specific heights for different percentiles

mean.value <- 175 #Let's consider the mean height to be 175cm
sd.value <- 10 #with a standard deviation of 10cm

#Generate 5k random numbers

random.numbers <- rnorm(5000, mean = mean.value, sd = sd.value)

#Filter for heights not exceeding 195cm and not less than 140cm

filtered.numbers <- random.numbers[random.numbers >= 140 & random.numbers <= 195]

final.sample <- sample(filtered.numbers, 1000, replace = TRUE)

ecdf.function <- ecdf(final.sample)

percentiles <- ecdf.function(final.sample) * 100

reference.heights <- data.frame(Number = final.sample, Percentile = percentiles)

#Let's now simulate the problem

c()->changing.applicant.sims

#Let's see what happens if we change the number of applicants in the pool.

for(num.applicants in c(10, 20, 30, 50, 100)){

num.simulations <- 5000 #Let's simulate the problem 5k times
c() -> candidate.heights.sim

for(i in 1:num.simulations){
  
  success <- 0
  candidate.heights <- sample(140:195, num.applicants, replace = TRUE)
  if(max(candidate.heights) == candidate.heights[simulate.megan.problem(candidate.heights, reference.heights)]) {success <- 1}
  candidate.heights.sim <- c(candidate.heights.sim, success)
    
}

changing.applicant.sims <- c(changing.applicant.sims, sum(candidate.heights.sim)/num.simulations)

}

res.df <- data.frame(num_applicants = c(10, 20, 30, 50, 100), probability_success = changing.applicant.sims)

ggplot(res.df, aes(x = num_applicants, y=probability_success))+geom_point(stroke = 2, color = "#D96098", shape = 8)+
  geom_line(size = 2, color = "#F7DBF0")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(x = "Number of applicants", y="Probability of success", tag="B")+
  ggtitle("Full Information Secretary Problem")+ylim(c(0, 1))

We see that Megan has a pretty good chance of finding her ideal man in finance using this strategy even if the number of available men to date is low (with ~60% of getting the best man among 10 men).

Note: The story behind the problem is inspired by the song written by Megan Boni (Lamour, 2024).

Note: The solution to the full-information secretary problem is taken from Christian & Griffiths’ “Algorithm to live by” book (2016).

References:

Christian, B., & Griffiths, T. (2016). Algorithms to live by: The computer science of human decisions. Macmillan.

Lamour, J. (2024, May 29). How a viral song about “looking for a man in finance” took over the internet. TODAY.com. https://www.today.com/popculture/music/tiktok-im-looking-for-a-man-in-finance-megan-boni-interview-rcna151152

The Cayley - Moser problem

The following problem is taken from Guttman, 1959.

A game involves taking random numbers from a uniform distribution. Here we take the distribution from 0 to 1. We have N opportunities to draw numbers from the distribution. At each stage we either reject the opportunity and move to the next stage or accept the opportunity and the process stops. The question is when do we stop and accept the drawn number. The optimal strategy $\pi$ is to stop when the number we get is higher than a specific threshold. The threshold is 0.5 when we have only one opportunity since the expected value of the drawn number is 0.5. When we have two chances then for the first draw either the drawn number is less than 0.5 which happens with probability 0.5 and we move on to the next case with an expected threshold of 0.5 OR the drawn number is more than 0.5 with probability 0.5 but the expected value would then be 0.75 (because the number is from 0.5 to 1). Therefore $E_2 = 0.5(0.75)+0.5(0.5)$. We can follow the same logic to find $E_3$. Either the drawn number is less than $E_2$ which is 0.625 and thus we move to the second opportunity which has an expected value of 0.625 or the drawn number exceeds 0.625 with a probability of (1-0.625) and an expected value of 0.8125 ($E_3 = 0.375(0.8125)+0.625(0.625)$).

#' Probability density function
#'
#' @param x parameter x
#'
#' @return the probability density function for a uniform distirbution from 0 
#' to 1.
#' @export
f <- function(x){
  dunif(x, min = 0, max = 1)
}



#' Threshold function, we stop if the drawn number is greater or equal to E_n
#' at stage n
#'
#' @param n the number of opportunities left to draw
#'
#' @return the threshold for stopping at that stage
E <- function(n){
  
  #Define the boundary condition
  if(n==0){ 
    return(0)
  }
  
  
  hold.integrand.first.part <- function(x) {x * f(x)}
  first.part <- integrate(hold.integrand.first.part, E(n-1), Inf)$value
  
  hold.integrand.second.part <- function(x) {f(x)}
  second.part <- integrate(hold.integrand.second.part, - Inf, E(n-1))$value
  
  return(first.part + E(n-1) *second.part)
  
}

E_vector <- c()

for(i in 1:12){
  
  E_vector <- c(E_vector, E(i))
  
}

E_res <- data.frame(left_opportunity = 1:12, E = E_vector)



ggplot(E_res) + 
  geom_point(aes(x=left_opportunity, y = E),stroke=2, color = "#538392", shape=8)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A", x="Left opportunity")

Results are consistent with Table 1 from Guttman, 1959.

Guessing one card from a deck

A deck of 52 cards, before we turn the cards we can guess whether it is the ace of spades or not.

Let’s start with the base case. The base case is when you only have one ace of spades card so the probability of that one card being the ace of spades is 1. Alternatively, you have one non-spade card so the probability of having an ace of spades is 0. When you move to the next case, there are two possible scenarios,

A- you guess it is the ace of spades and we are done
B- you pass, don’t say anything and we move backwards

I guess intuitively it doesn’t make much of a sense to think about this problem this way because unlike the other optimal stopping problems, there are no comparisons and all other cards except for the ace of spades are essentially the same. Consequently, the probability of guessing correctly would always be $1/N$. But let’s see what happens if we try to write it as a dynamic programming problem.

#' Probability of guessing the ace of spades correctly depending on the number
#' of cards
#'
#' @param n number of total cards
#' @param chances number of cards left to guess
#'
#' @return the probability of guessing ace of spades correctly
#' @export
V <- function(n, chances){
  
  if(n==1){
    return(1) #once you get to this point, the probability of 
    #the card being the ace of spades is 1. 
  }

  if(chances>1){
    
    return(max(1/n*V(n-1, (chances-1)), (n-1)/n*V(n-1, chances)))
    
  }else{
    
  return(max(1/n, (n-1)/n*V(n-1, 1)))
    
  }
  
}
#We can try plotting the probability
V_vector <- c()

for(chances in 1:5){
  
  c(V_vector, V(52, chances))->V_vector
  
}

#And the probability of guessing the ace of spades correctly remains constant.

And we see that this confirms out initial thoughts. I am guessing the point of the question was for us to be able to write the Bellman equation correctly which we have done.

$V_N = \max[1/N, \frac{N-1}{N} V_{N-1}]$

Pick any of the thirteen spades

Part D, if we were to pick any of the thirteen spades then the problem would become something like this:

#' The probability of guessing the spades correctly
#'
#' @param no_spades number of spades
#' @param no_others number of other cards
#'
#' @return the probability of getting spades
V <- function(no_spades, no_others){
  
  #Define boundary conditions first
  if(no_spades == 0){
    return(0) #you ran out of spades and guessed none
  }
  if(no_others == 0){
    return(1) #you ran out of all other cards, you have 1 spade left
  }
  
  #Let's think about what happens
  #We guess we have a spade card that comes with the probability of no_spaces/all cards
  #We pass, either the passed card is a spades card or it is a non-spade card
  return(max(no_spades/(no_spades+no_others), no_spades/(no_spades+no_others)*V(no_spades-1, no_others)+
               no_others/(no_spades+no_others)*V(no_spades, no_others - 1))) 
  
}

Taking balls from an urn

Problem #11 from Sheldon Ross’ book on stochastic dynamic programming

An urn initially contains $n$ white and $m$ black balls. Balls are selected at random, without replacement, and the decision maker wins one unit when a white ball is selected and loses one unit when a black ball is selected. At any point in time, the decision maker is allowed to quit playing. The objective is to maximize expected winnings.

Let’s start by simulating the problem. If we were to take balls at random from an urn, we would intuitively stop taking more balls once the number of white and black balls are the same or there are more black balls in the urn because there will be a 50% or more chance of losing.

#' Simulate the urn problem
#'
#' @param m the number of black balls
#' @param n the number of white balls
#'
#' @return the score
#' @export
simulate_problem <- function(m, n){
  
  #We take balls at random, similar to assigning black and white to a sequence at random
  
  results.seq <- sample(c(rep('b', m), rep('w', n)))
  
  #assign scores for each stage
  results.seq[results.seq=='b']<- -1
  results.seq[results.seq=='w']<- +1
  
  #We stop the process if any stage the number of black and white balls are equal. 
  
  for(k in 1:(m+n)){
    
    if(sum(as.numeric(results.seq[k:(m+n)]))==0){
      return(sum(as.numeric(results.seq[1:k])))
    }
    
  }
  
  #otherwise we take all of the balls out and return the score
  return(sum(as.numeric(results.seq)))
  
 
}

Then let’s write a recursive function for the theory. As always think of the base case, either we get to a point when $m$ and $n$ are equal and we stop (we return nothing), or we have only white or black balls left. If all of the balls are white then we get a score of $+n$, if all are black then we get a score of $-m$. Moving backward, either we pick a white ball with probability $p$, add +1 to our score and move to the previous step with one less white ball or we pick a black ball with probability $q$, subtract 1 from our score and move to the previous step with one less black ball.

#' Theory function
#'
#' @param m the number of black balls
#' @param n the number of white balls
#'
#' @return the expected reward
#' @export
V <- function(m, n){
 
  if(m==n){
    return(0)
  }
  if(n==0){
    return(-m)
  }
  if(m==0){
    return(n)
  }

  #The probability of getting whites is n/(m+n) or p
  #The probability of getting blacks is m/(m+n) or 1-p or q
  
  p <- n/(n+m)
  q <- 1 - p

  #Either we pick a white one with probability p, add +1 to our score and move to the previous step with one less white ball or we pick a black ball with probability q, subtract 1 from our score and move to the previous step with one less black ball. 
  
  #On the other hand, if we quit, our expected reward would be zero at that point so we are weighing the option of quiting or continuing. 
  
  expected.reward <- max(0, q * (V(m-1, n) - 1) + p * (1 + V(m, n-1)))
  
  return(expected.reward)
}

Compare simulations and theory

simulation.res.all <- c()
theory.res.all <- c()

num_simulations<-100

for(i in 5:30){
  
  
  simulation.res <- c()
  
  V(5, i)->theory.res
  
  for(k in 1:num_simulations){
    
    c(simulation.res, simulate_problem(5, i))->simulation.res
  
  }
  simulation.res.all <- c(simulation.res.all, mean(simulation.res))
  theory.res.all <- c(theory.res.all, theory.res)

}



res.df <- data.frame(n = 5:30, simulations = simulation.res.all, theory = theory.res.all)

ggplot(res.df)+geom_point(aes(x=n, y=simulations), stroke=2
                              , color = "#D96098", shape=8)+
  geom_line(aes(x=n, y=theory), size = 1,color =  "#F7DBF0")+
  labs(x = "Number of white balls (n)", y = "Expected rewards", title = "Black balls, m = 5") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")

Awesome, we see that simulations and theory match very well!

Game of pure strategy

Goofspiel or game of pure strategy involves two players, one holding the diamonds and the other holding the hearts of a deck of cards. The spades are placed in the center and in each stage a spade is turned face up. The two players would then put two cards down and the one with the higher card value (with ace having a value of 1 and king a value of 13) wins the value of the faced up card in the center. The question is what strategy achieves this?

The optimal strategy $\pi$ tries to match the card in the center in each stage.

#' simulate a player playing the optimal strategy against a player who puts random cards down
#'
#' @return the scores of player 1 and player 2
#' @export
simulate_optimal_strategy <- function(){
  
  center.cards <- sample(seq(1, 13, 1)) #Center cards drawn randomly
  
  player.1.cards <- center.cards #optimal strategy - player 1 matches ceter cards
  player.2.cards <- sample(seq(1, 13, 1)) #puts cards down randomly
  
  player.1.scores <- 0;player.2.scores <- 0
  
  for(i in 1:13){
    
    if(player.1.cards[i] > player.2.cards[i]){
      player.1.scores <- player.1.scores + center.cards[i]
    }else if(player.1.cards[i] < player.2.cards[i]){
      player.2.scores <- player.2.scores + center.cards[i]
    }
    
  }
  
  return(list(player.1.scores, player.2.scores))
}

Simulate the game

#simulate the game 1000 times

num_simualtions <- 1000

player.1.scores <- c()
player.2.scores <- c()

for(i in 1:num_simulations){
  
  res_lst <- simulate_optimal_strategy()
  
  player.1.scores <- c(player.1.scores, res_lst[[1]])
  player.2.scores <- c(player.2.scores, res_lst[[2]])
  
}

data.frame(score = player.1.scores)->player.1.scores
data.frame(score = player.2.scores)->player.2.scores

player.1.scores$player <- 'player1'
player.2.scores$player <- 'player2'

sim_res <- rbind(player.1.scores, player.2.scores)

To find the expected score for player I, we have to go over different cards and count the probability of the value of the current card exceeding that of player II. Then multiply it by the amount of the center card. For example, let’s start with the base case. We have two rounds with two cards ace and two. We can never win if we put an ace down (even if the other player puts their ace down, we both get a score of zero). If we put a two down then the chance of winning is $\frac{1}{2}$ because either the other player puts a two down and we both get a score of 0 or they put an ace down and we win. In this case, it is obvious that we should reserve our two card for when two of spades comes up in the center.

By induction you can see that it is optimal to always match the strategy of the card in the center. The expected score for player I would be $\sum_{i=1}^{N}i\frac{i-1}{N}$. If we replace $N$ with the total number of cards, 13, we get an expected score of 56. Let’s see if it is consistent with our simulations of the game.

expected_winning <- 0 

for(i in 1:13){
  
  expected_winning <- expected_winning + i*((i-1))/13
  
}


ggplot(sim_res, aes(score, fill = player)) +
  geom_density(alpha=0.4)+
  geom_vline(xintercept = expected_winning, linetype="dashed", size=2)+
  labs(x = "Score", y = "Density", title = "Game of pure strategy - 1k simulations") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  scale_fill_manual(values=c("#201E43", "#E3A5C7"))+
  labs(tag = "A")

Timid/bold play

Example is taken from Bertsekas (Dynamic Programming and Optimal Control), Chapter 1, Example 1.2.1

Reference: Bertsekas, D. (2012). Dynamic programming and optimal control: Volume I (Vol. 4). Athena scientific.

Suppose you are playing against someone and you have the option to either play timidly or boldy. If you play boldly you win with probability $p_w$ < 1/2. If you play timidly you draw with probability $p_d$ > 1/2. If any of the players win they get a score of 1 while the other losing player gets a score of 0 and if there is a draw, then both players get a score of 0.5 for that match. You play two matches. If at the end of the second match you and the other player reach a draw, the game faces a sudden death meaning players play more and more matches until someone wins the game. Here we simulate and theoretically show the optimal strategy.

How do we find the optimal strategy? Let’s think of all of the possible scenarios:

Figure taken from Bertsekas, 2012 We have four possible scenarios, let’s calculate the probability of winning in each case:

CASE I: Timid - Timid

The only times we win under this strategy is if we draw the first two matches and win in the third match (sudden death happens) $p_d^2 (p_w)$.

CASE II: Bold - Bold

Three possible routes we can win under this strategy. Luckiest scenario is we win both times then our probability of winning is $p_w^2$. Alternatively, we win the first match, lose the others and then move to the third match and win that one. This case happens with probability $p_w(1-p_w)p_w$. Similarly, we may lose the first match and win the second one then move to the third match and win that game with probability $(1-p_w)(p_w)(p_w)$. Overall the probability of winning in this case would be $p_w^2 + 2(1-p_w)(p_w^2)$.

CASE III: Bold - Timid

We win the first match and draw the second with probability $p_w(p_d)$. The other possible route of winning is if we win the first match, lose the second and win the third match with probability $p_w(1-p_d)p_w$. The overall probability of winning in this case would be $p_w(p_d)+p_w(1-p_d)p_w$.

CASE IV: Timid - Bold

We draw the first match and win the second with probability $p_d(p_w)$. Lose the first match, win the second match and proceed to win the third match with probability $(1-p_d)(p_w)p_w$. The overall probability of winning in this case is $p_d(p_w) + (1-p_d)(p_w)p_w$.

Let’s compare the strategies:

timid.timid <- function(p_d, p_w) return(p_d^2 * (p_w))

bold.bold <- function(p_d, p_w) return(p_w^2*(3-2*p_w))

timid.bold <- function(p_d, p_w) return(p_w*p_d + p_w^2*(1-p_d))

timid.timid.vector <- c()

bold.bold.vector <- c()

timid.bold.vector <- c()

p_d <- 0.9

for(p_w in seq(0.2, 0.5, 0.05)){
  
  c(timid.timid.vector, timid.timid(p_d, p_w)) -> timid.timid.vector
  c(bold.bold.vector, bold.bold(p_d, p_w))-> bold.bold.vector
  c(timid.bold.vector, timid.bold(p_d, p_w))-> timid.bold.vector
  
}

win_prob_df <- data.frame(p_w = seq(0.2, 0.5, 0.05), timid_timid = timid.timid.vector,
           bold_bold = bold.bold.vector,
           timid_bold = timid.bold.vector)

In the book it is mentioned that the Timid-Timid strategy is always dominated by the others. Here, we show that although it will always be dominated by either the Bold-Timid and Timid-Bold strategies, it will not always be dominated by the Bold-Bold strategy.

Assume $p_d = $ 0.9. In that case the Timid-Timid strategy will be equally good as the Bold-Bold strategy when $p_d^2 = p_w(3-2(p_w))$. If we plug in $p_d$ in the equation, we get $p_w$ equal to 0.35313730334 as one of the roots. We can see that whenever $p_w$ is less than this value when $p_d$ is 0.9 it is actually better to play timidly than boldly if we had the chance to decide between either playing completely timidly or boldly.

p_w_intersections <- c()

for(p_d in seq(0.55, 0.95, 0.05)){
  
  f <- function(p_w) {
     return(bold.bold(p_d, p_w) - timid.bold(p_d, p_w))
  }
  p_w_intersections <- c(p_w_intersections, uniroot(f, lower = 0.1, upper = 0.9)$root)
  
}

And we see that it is always optimal to play with the Timid-Bold strategy as opposed to the Bold-Bold strategy when $2(p_w)$ is less than $p_d$.

ggplot(win_prob_df)+
  geom_rect(aes(xmin=0.2, xmax=0.35313730334, ymin=0.1, ymax=0.5), fill="#F1D3CE")+
  geom_rect(aes(xmin=0.35313730334, xmax=0.9/2, ymin=0.1, ymax=0.5), fill="#D1E9F6")+
  geom_rect(aes(xmin=0.9/2, xmax=0.5, ymin=0.1, ymax=0.5), fill="#C75B7A")+
  geom_line(aes(x=p_w, y=timid_timid, colour = "Timid-Timid"),  size = 2)+
  geom_line(aes(x=p_w, y=bold_bold, colour = "Bold-Bold"), size = 2)+
  geom_line(aes(x=p_w, y=timid_bold, colour = "Timid-Bold"), size = 2)+
  geom_vline(xintercept = 0.35313730334, linetype='dashed')+
  geom_vline(xintercept = 0.9/2, linetype='dashed')+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) + 
  scale_color_manual(values = c("Timid-Timid" = '#5F6F65', "Bold-Bold" = "#522258", "Timid-Bold" = "#6482AD"))+
  labs(x="p_w", y="Win probability", tag="A")

Optimal strategy

First, let’s start by defining functions for timid and bold play:

#' Function that returns the scores of both players when player 1 plays timidly
#'
#' @param player.1.scores player #1 score
#' @param player.2.scores player #2 score
#' @param p_d probability of a draw on timid strategy
#'
#' @return the score of both players at the end of a match when player 1 plays timidly
#' @export
timid.play <- function(player.1.scores, player.2.scores, p_d){
  
  draw <- rbinom(1, 1, p_d)
  
  if(draw == 1){
    
    player.1.scores <- player.1.scores + 0.5
    player.2.scores <- player.2.scores + 0.5
    
  }else{
    
    player.1.scores <- player.1.scores + 0
    player.2.scores <- player.2.scores + 1
    
  }
  
  return(list(player.1.scores, player.2.scores))
}


#' Function that returns the scores of both players when player 1 plays boldly
#'
#' @param player.1.scores player #1 score
#' @param player.2.scores player #2 score
#' @param p_w probability of a win on bold strategy
#'
#' @return the score of both players at the end of a match when player 1 plays boldly
#' @export
bold.play <- function(player.1.scores, player.2.scores, p_w){
  
  win <- rbinom(1, 1, p_w)
  
  if(win == 1){
    
    player.1.scores <- player.1.scores + 1
    player.2.scores <- player.2.scores + 0
    
  }else{
    
    player.1.scores <- player.1.scores + 0
    player.2.scores <- player.2.scores + 1
    
  }
  
  return(list(player.1.scores, player.2.scores))
  
}




#' Simulate the chess match
#'
#' @param p_d the probability of a draw under timid strategy
#' @param p_w the probability of a win under bold strategy
#'
#' @return whether player 1 wins or not
#' @export
simulate.chess.game <- function(p_d, p_w){
  
  player.1.scores <- 0
  player.2.scores <- 0
  
  game.stage <- 1
  sudden.death <- FALSE
  
  while(game.stage < 3 || sudden.death == TRUE){
    
    if(player.1.scores > player.2.scores){
      
      score.lst <- timid.play(player.1.scores, player.2.scores, p_d)
      score.lst[[1]]->player.1.scores
      score.lst[[2]]->player.2.scores
      
    }else{
      
      score.lst <- bold.play(player.1.scores, player.2.scores, p_w)
      score.lst[[1]]->player.1.scores
      score.lst[[2]]->player.2.scores
      
    }
    
    if(game.stage >= 2 && (player.1.scores == player.2.scores)){
      sudden.death <- TRUE
    }else{
      sudden.death <- FALSE
    }
    
    game.stage <- game.stage + 1
    
  }

  player.1.win <- FALSE
  
  if(player.1.scores > player.2.scores){
    
    player.1.win <- TRUE
    
  }
  
  return(player.1.win)
  
}

Now let’s test this with the parameters mentioned in the book, with $p_w =$ 0.45 and $p_d=$ 0.9 we should be getting a win probability of 0.53.

p_w <- 0.45
p_d <- 0.9

win.sum <- 0

num_simulations <- 10000

for(k in 1:num_simulations){
  
simulate.chess.game(p_d, p_w) + win.sum -> win.sum
    
}


print(win.sum/num_simulations)

## [1] 0.5382

And that is exactly what we get!

Compare simulations and theory

Let’s think of this problem theoretically. We play the first game boldly and there are three possible scenarios:

CASE I:

If we win in the first match and draw in the next match we have won and the game stops this happens with probability $p_w(p_d)$.

CASE II:

We win in the first match and lose in the second match then we play bold for the third match and win with probability $p_w(1-p_d)(p_w)$.

CASE III:

$(1-p_w)(p_w)(p_w)$

#Define theoretical winning probability function:

win_probability_theory <- function(p_d, p_w) p_w*((1-p_w)*p_w + p_w*(1-p_d))+ p_w*p_d



#Let's compare theory and simulations for different parameter sets

sim.mat<-matrix(0, nrow=length(seq(0.2, 0.45, 0.05)), ncol=length(seq(0.7, 0.95, 0.05)))
theory.mat<-matrix(0, nrow=length(seq(0.2, 0.45, 0.05)), ncol=length(seq(0.7, 0.95, 0.05)))


p_w <- seq(0.2, 0.45, 0.05)
p_d <- seq(0.7, 0.95, 0.05)

for(i in seq_along(p_w)){
  
  for(j in seq_along(p_d)){
    
    for(k in 1:num_simulations){
      
    sim.mat[i, j] <- sim.mat[i, j] + simulate.chess.game(p_d[j], p_w[i])
    
    }
    
    theory.mat[i, j] <- win_probability_theory(p_d[j], p_w[i])
    
  }
  
}


sim.mat/num_simulations -> sim.mat

rownames(sim.mat) <- paste("p_w:",p_w)
colnames(sim.mat) <- paste("p_d:", p_d)

rownames(theory.mat) <- paste("p_w:", p_w)
colnames(theory.mat) <- paste("p_d:", p_d)

library(pheatmap)
  
  
pheatmap(sim.mat, display_numbers = T, 
         color = colorRampPalette(c('white','#522258'))(100), 
         cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = 15)

pheatmap(theory.mat, display_numbers = T, 
         color = colorRampPalette(c('white','#522258'))(100), 
         cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = 15)

Multi armed bandit problems

Competing technologies problem

The following problem is taken from Arthur, 1989.

Reference: Arthur, W. B. (1989). Competing technologies, increasing returns, and lock-in by historical events. The economic journal, 99(394), 116-131.

Suppose we have two technologies, $A$ and $B$ which are competing for adoption by a market consisting of two types of agents, $R$, and $S$. $R$ agents have a preference for the $A$ technology and $S$ agents have a preference for the $B$ technology due to political reasons. We start with a very basic example:

In the most simple problem, at each time point one agent enters the systems and makes a decision to adopt either strategy. $R$ agents have a strong preference for $A$. Likewise $S$ agents have a strong preference for the $B$ technology. At each time point, the expected return depends on the type of agent that makes the decision and the previous adoptions. The number of previous $A$ adoptions is referred to as $n_A$ and the number of previous $B$ technology adoptions is referred to as $n_B$ ($n_A + n_B = n$).
Depending on whether an $R$ or $S$ agent comes up with choose the return based on the following payoff matrix:

Agent Type	Technology A	Technology B
R agent	$a_R + r n_A$	$b_R + r n_B$
S agent	$a_S + s n_A$	$b_S + s n_B$

To account for the preference for $A$ or $B$, we consider $a_R > b_R$ and $a_s < b_s$.

In the simple case we consider here, $R$ agents always adopt technology $A$ unless if technology $B$ becomes widely adopted, increases in number and it would result in a higher payoff if $R$ agents switch their strategy. Likewise, $S$ agents always adopt technology $B$ unless if technology $A$ becomes widely adopted, increases in number and it would result in a higher payoff if $S$ agents switch their strategy. We define the threshold for switching below:

find.delta_R <- function(b_R, a_R, r) return (b_R - a_R)/r
find.delta_S <- function(b_S, a_S, s) return (b_S - a_S)/s

Define the simulation function

#' Simulate the problem
#'
#' @param stages the number of stages
#' @param a_R the return for agent R adopting technology A
#' @param a_S the return for agent S adopting technology A
#' @param b_R the return for agent R adopting technology B
#' @param b_S the return for agent S adopting technology B
#' @param r the return for agent R based on previous adoptions
#' @param s the return for agent S based on previous adoptions
#'
#' @return a list containing the total return, number of A adoptions, number of B adoptions
#' @export
simulate.problem <- function(stages, a_R, a_S, b_R, b_S, r, s){
  
  #initialize parameters
  n_A <- c(0) #number of A adoptions 
  n_B <- c(0) #number of B adoptions
  returns <- c(0) #return
  
  delta_R <- find.delta_R(b_R,a_R,r)
  delta_S <- find.delta_S(b_S,a_S,s)
  
  if(r<0 && s<0){
    condition <- "diminishing"
  }else if(r>0 && s>0){
    condition <- "increasing"
  }else if(r==0 && s==0){
    condition <- "constant"
  }else{
    return("r and s signs should match.")
  }
  
  
  for(i in 1:stages){
    
    d_n <- n_A[length(n_A)] - n_B[length(n_B)]
    
    
    if(condition == "diminishing"){
      delta_R <- -1*delta_R
      delta_S <- -1*delta_S
      condition.1 <- d_n < delta_R
      condition.2 <- d_n > delta_S
    }else if(condition == "increasing"){
      condition.1 <- d_n >= delta_R
      condition.2 <- d_n <= delta_S
    }else{
      condition.1 <- TRUE
      condition.2 <- TRUE
    }
    
    p <- rbinom(1, 1, 0.5) #throw a coin, if we get tails then an R agent comes up
    #otherwise an S agent comes up
    
    if(p == 0){
      
      if(condition.1){ #check if we have reached the threshold for switching
        
        returns <- c(returns, a_R + r*n_A[length(n_A)])
        n_A <- c(n_A, n_A[length(n_A)] + 1)
        n_B <- c(n_B, n_B[length(n_B)])
      }else{
        
        returns <- c(returns, b_R + r*n_B[length(n_B)])
        n_A <- c(n_A, n_A[length(n_A)])
        n_B <- c(n_B, n_B[length(n_B)] + 1)
        
      }
      
    }else{
      
      if(condition.2){
        
        returns <- c(returns, b_S + s*n_B[length(n_B)])
        n_A <- c(n_A, n_A[length(n_A)])
        n_B <- c(n_B, n_B[length(n_B)] + 1)
        
      }else{
        
        returns <- c(returns, a_S + s*n_A[length(n_A)])
        n_A <- c(n_A, n_A[length(n_A)] + 1)
        n_B <- c(n_B, n_B[length(n_B)])
      }
      
    }
    
  }
  
  
  return(list(returns, n_A, n_B))
}

The increasing return case

#Define parameters

a_R <- 10
a_S <- 6
  
b_R <- 4
b_S <- 12

r <- 1
s <- 3


returns.mat<-matrix(nrow=50, ncol=10)
d.mat<-matrix(nrow=50, ncol=10)

#simulate the problem
for(i in 1:10){
  
  
  simulate.problem(49, a_R, a_S, b_R, b_S, r, s)->sims
  sims[[1]] -> returns
  sims[[2]] - sims[[3]] -> d_n
  returns.mat[,i]<-returns
  d.mat[,i] <- d_n
  
}
as.data.frame(returns.mat)->returns.df
as.data.frame(d.mat)->d.df

colnames(returns.df) <- as.character(as.roman(1:10))
colnames(d.df) <- as.character(as.roman(1:10))

returns.df$stage <- 1:50
d.df$stage <- 1:50

delta_R <- find.delta_R(b_R,a_R,r)
delta_S <- find.delta_S(b_S,a_S,s)

ggplot(data = d.df)+
  geom_point(aes(x=stage, y=I), shape=0, color = "#3AA6B9")+
  geom_point(aes(x=stage, y=II), shape=1, color = "#254336")+
  geom_point(aes(x=stage, y=III), shape=2, color = "#32012F")+
  geom_point(aes(x=stage, y=IV), shape=3, color = "#8E3E63")+
  geom_point(aes(x=stage, y=V), shape=4, color = "#A91D3A")+
  geom_point(aes(x=stage, y=VI), shape=5, color = "#A79277")+
  geom_point(aes(x=stage, y=VII), shape=6, color = "#481E14")+
  geom_point(aes(x=stage, y=VIII), shape=7, color = "#2C7865")+
  geom_point(aes(x=stage, y=IX), shape=8, color = "#222831")+
  geom_point(aes(x=stage, y=X), shape=9, color = "#E1AFD1")+
  geom_hline(yintercept = delta_R, linetype = 'dashed')+
  geom_hline(yintercept = delta_S, linetype = 'dashed')+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(tag="A", y = TeX("$d_n$"))

ggplot(data = returns.df)+
  geom_point(aes(x=stage, y=I), shape=0, color = "#3AA6B9")+
  geom_point(aes(x=stage, y=II), shape=1, color = "#254336")+
  geom_point(aes(x=stage, y=III), shape=2, color = "#32012F")+
  geom_point(aes(x=stage, y=IV), shape=3, color = "#8E3E63")+
  geom_point(aes(x=stage, y=V), shape=4, color = "#A91D3A")+
  geom_point(aes(x=stage, y=VI), shape=5, color = "#A79277")+
  geom_point(aes(x=stage, y=VII), shape=6, color = "#481E14")+
  geom_point(aes(x=stage, y=VIII), shape=7, color = "#2C7865")+
  geom_point(aes(x=stage, y=IX), shape=8, color = "#222831")+
  geom_point(aes(x=stage, y=X), shape=9, color = "#E1AFD1")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(tag="B", y="returns")

As you see we picked $r$ to be less than $s$ meaning the payoff of the adoption of the B technology is three times higher than the adoption of Technology A. However, we can see in simulations that it is possible for the market to be locked in with the technology with the lower payoff. It is evident in the panel B plot that the simulation cases with the highest payoff have absorption to Technology B (cases, red plus and pink diamond plus).

The diminishing return case

#Define parameters

a_R <- -4
a_S <- -12

b_R <- -10
b_S <- -6

r <- -3
s <- -1


returns.mat<-matrix(nrow=50, ncol=10)
d.mat<-matrix(nrow=50, ncol=10)

#simulate the problem
for(i in 1:10){
  
  
  simulate.problem(49, a_R, a_S, b_R, b_S, r, s)->sims
  sims[[1]] -> returns
  sims[[2]] - sims[[3]] -> d_n
  returns.mat[,i]<-returns
  d.mat[,i] <- d_n
  
}
as.data.frame(returns.mat)->returns.df
as.data.frame(d.mat)->d.df

colnames(returns.df) <- as.character(as.roman(1:10))
colnames(d.df) <- as.character(as.roman(1:10))

returns.df$stage <- 1:50
d.df$stage <- 1:50

delta_R <- find.delta_R(b_R,a_R,r)
delta_S <- find.delta_S(b_S,a_S,s)

ggplot(data = d.df)+
  geom_point(aes(x=stage, y=I), shape=0, color = "#3AA6B9")+
  geom_point(aes(x=stage, y=II), shape=1, color = "#254336")+
  geom_point(aes(x=stage, y=III), shape=2, color = "#32012F")+
  geom_point(aes(x=stage, y=IV), shape=3, color = "#8E3E63")+
  geom_point(aes(x=stage, y=V), shape=4, color = "#A91D3A")+
  geom_point(aes(x=stage, y=VI), shape=5, color = "#A79277")+
  geom_point(aes(x=stage, y=VII), shape=6, color = "#481E14")+
  geom_point(aes(x=stage, y=VIII), shape=7, color = "#2C7865")+
  geom_point(aes(x=stage, y=IX), shape=8, color = "#222831")+
  geom_point(aes(x=stage, y=X), shape=9, color = "#E1AFD1")+
  geom_hline(yintercept = delta_R, linetype = 'dashed')+
  geom_hline(yintercept = delta_S, linetype = 'dashed')+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(tag="A", y = TeX("$d_n$"))+ylim(c(-10, 10))

ggplot(data = returns.df)+
  geom_point(aes(x=stage, y=I), shape=0, color = "#3AA6B9")+
  geom_point(aes(x=stage, y=II), shape=1, color = "#254336")+
  geom_point(aes(x=stage, y=III), shape=2, color = "#32012F")+
  geom_point(aes(x=stage, y=IV), shape=3, color = "#8E3E63")+
  geom_point(aes(x=stage, y=V), shape=4, color = "#A91D3A")+
  geom_point(aes(x=stage, y=VI), shape=5, color = "#A79277")+
  geom_point(aes(x=stage, y=VII), shape=6, color = "#481E14")+
  geom_point(aes(x=stage, y=VIII), shape=7, color = "#2C7865")+
  geom_point(aes(x=stage, y=IX), shape=8, color = "#222831")+
  geom_point(aes(x=stage, y=X), shape=9, color = "#E1AFD1")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(tag="B", y="returns")

The constant return case

#Define parameters

a_R <- 5
a_S <- 3

b_R <- 2
b_S <- 6

r <- 0
s <- 0


returns.mat<-matrix(nrow=10000, ncol=10)
d.mat<-matrix(nrow=10000, ncol=10)

#simulate the problem
for(i in 1:10){
  
  
  simulate.problem(9999, a_R, a_S, b_R, b_S, r, s)->sims
  sims[[1]] -> returns
  sims[[2]] - sims[[3]] -> d_n
  returns.mat[,i]<-returns
  d.mat[,i] <- d_n
  
}
as.data.frame(returns.mat)->returns.df
as.data.frame(d.mat)->d.df

colnames(returns.df) <- as.character(as.roman(1:10))
colnames(d.df) <- as.character(as.roman(1:10))

returns.df$stage <- 1:10000
d.df$stage <- 1:10000

delta_R <- find.delta_R(b_R,a_R,r)
delta_S <- find.delta_S(b_S,a_S,s)

#We track the limit of $d_n/n$ as $n$ goes to infinity

# Apply the division using the generated Roman numerals
for (col in as.character(as.roman(1:10))) {
  d.df[[col]] <- d.df[[col]] / d.df$stage
}



ggplot(data = d.df)+
  geom_point(aes(x=stage, y=I), shape=0, color = "#3AA6B9")+
  geom_point(aes(x=stage, y=II), shape=1, color = "#254336")+
  geom_point(aes(x=stage, y=III), shape=2, color = "#32012F")+
  geom_point(aes(x=stage, y=IV), shape=3, color = "#8E3E63")+
  geom_point(aes(x=stage, y=V), shape=4, color = "#A91D3A")+
  geom_point(aes(x=stage, y=VI), shape=5, color = "#A79277")+
  geom_point(aes(x=stage, y=VII), shape=6, color = "#481E14")+
  geom_point(aes(x=stage, y=VIII), shape=7, color = "#2C7865")+
  geom_point(aes(x=stage, y=IX), shape=8, color = "#222831")+
  geom_point(aes(x=stage, y=X), shape=9, color = "#E1AFD1")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(tag="A", y = TeX("$\\frac{d_n}{n}$"))

To be continued…

Thompson sampling

In this example we consider an agent interacting with n slot machines, each with a different, unknown probability of winning. The agent’s objective is to identify and play the slot machine that maximizes the chances of winning over time.

We first use a Thompson sampling approach. Thompson sampling is a Bayesian approach for optimizing the probability of selecting the best machine. Suppose we begin with three slot machines and we have no prior knowledge of their individual probabilities of success. Since we have no background information, we assign each slot machine a prior distribution of $\beta(1, 1)$ (equivalent to a uniform distribution). At each stage, we draw a random sample from the $\beta$ distribution of each machine and compare the results. We then select the machine with the highest sampled value for play.

If the chosen machine results in a win, we update its distribution to $\beta(2, 1)$. Otherwise, if the machine results in a loss, we adjust its distribution to $\beta(1, 2)$. We then repeat this process over N trials, and we track the number of wins and losses for each machine.

library(tidyr)

#' Simulate the MAB problem over a finite horizon using Thompson sampling
#'
#' @param N the number of trials
#' @param p the actual probability of winning by playing each arm
#'
#' @return a list containing played arms, winnings per arm, losses per arm, and simulations
#' @export
simulate.MAB.thompson.sampling <- function(N, p){
  
  simulations <- sapply(p, function(prob) rbinom(N, 1, prob)) #You win or lose based
  #on a Bernoulli distribution
  
  num.arms <- length(p) #Number of bandits
  
  #Store number of winnings and losses per bandit
  winnings <- rep(0, num.arms)
  losses <- rep(0, num.arms)
  
  played.arms <- c() #Store which arm you have played at each stage
  
  #Doing this over a finite horizon, n stages
  for(n in 1:N){
    
    hold.max.beta <- 0
    
    for(arm in 1:num.arms){
      beta <- rbeta(1, shape1 = winnings[arm] + 1, shape2 = losses[arm] + 1)
      
      if(beta > hold.max.beta){
        track.arm <- arm
        beta -> hold.max.beta
      }
      
    }
    played.arms <- c(played.arms, track.arm)
    
    if(simulations[n, track.arm] == 1){
      
      winnings[track.arm] <- winnings[track.arm] + 1
      
    }else{
      
      losses[track.arm] <- losses[track.arm] + 1
      
    }
    
  }
  return(list(played.arms, winnings, losses, simulations))
}

Simulate the game

Let’s start with a simple scenario where we have three slot machines that we have never played before. We don’t know which of the slot machines has a higher probability of winning. We denote the probability of winning to be $p_1$, $p_2$, and $p_3$

In this version we are trying the bandits over a finite horizon with no discounting

N <- 1000 #The number of trials
p <- c(0.55, 0.7, 0.45) #The probability of winning for different bandits

simulate.MAB.thompson.sampling(N, p) -> results.lst

results.lst[[1]]->played.arms
results.lst[[2]]->winnings
results.lst[[3]]->losses
results.lst[[4]]->simulations

num.arms <- length(p)

#Matrices for tracking cumulative plays and cumulative wins for each arm
cumulative.plays <- matrix(0, nrow=length(played.arms), ncol=num.arms)
cumulative.winnings <- matrix(0, nrow = nrow(simulations), ncol = num.arms)

#Iterate over arms and find cumulative wins and cumulative plays
for (i in 1:length(played.arms)) {
  if (i > 1) {
    cumulative.plays[i, ] <- cumulative.plays[i-1, ]
    cumulative.winnings[i, ] <- cumulative.winnings[i - 1, ]
  }
  arm_played <- played.arms[i]
  cumulative.plays[i, played.arms[i]] <- cumulative.plays[i, played.arms[i]] + 1
  cumulative.winnings[i, arm_played] <- cumulative.winnings[i, arm_played] + simulations[i, arm_played]
}

#Convert matrix to dataframe

cum.plays.df <- as.data.frame(cumulative.plays)
colnames(cum.plays.df) <- paste0("Arm ", 1:num.arms)
cum.plays.df$Stage <- 1:nrow(cum.plays.df)

#Reshape dataframe to long format
cum.plays.long <- pivot_longer(cum.plays.df, cols = starts_with("Arm "),
                               names_to = "Arm", values_to = "CumulativePlays")


ggplot(cum.plays.long, aes(x = Stage, y = CumulativePlays, color = Arm)) +
  geom_line(size = 1) +
  labs(title = "Thompson sampling",
       x = "Stage",
       y = "Cumulative plays") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")+
  scale_color_manual(values = c("Arm 1" = "#17153B", "Arm 2" = "#C8ACD6", "Arm 3" = "#6A9C89"))

cum.winnings.df <- as.data.frame(cumulative.winnings)
colnames(cum.winnings.df) <- paste0("Arm ", 1:num.arms)
cum.winnings.df$Stage <- 1:nrow(cum.winnings.df)

#Reshape dataframe to long format
cum.winnings.long <- pivot_longer(cum.winnings.df, cols = starts_with("Arm "),
                                  names_to = "Arm", values_to = "CumulativeWinnings")


ggplot(cum.winnings.long, aes(x = Stage, y = CumulativeWinnings, color = Arm)) +
  geom_line(size = 1) +
  labs(title = "Thompson sampling", 
       x = "Stage", y = "Cumulative winnings") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "B")+
  scale_color_manual(values = c("Arm 1" = "#17153B", "Arm 2" = "#C8ACD6", "Arm 3" = "#6A9C89"))

Upper Confidence Bound (UCB)

Now, let us try another method to find the solution. Through the UCB method we aim to choose the arm with the highest upper confidence bound at each stage. That means at each stage we calculate the confidence interval for each arm and choose the arm with the highest upper confidence bound.

#' Simulate the MAB problem over a finite horizon using Upper Confidence Bound (UCB)
#'
#' @param N the number of trials
#' @param p the actual probability of winning by playing each arm
#'
#' @return a list containing played arms, winnings per arm, losses per arm, and simulations
#' @export
simulate.MAB.upper.confidence.bound <- function(N, p){
  
  simulations <- sapply(p, function(prob) rbinom(N, 1, prob)) #You win or lose based
  #on a Bernoulli distribution
  
  num.arms <- length(p) #Number of bandits
  
  #Store number of winnings and losses per bandit
  winnings <- rep(0, num.arms)
  losses <- rep(0, num.arms)
  
  played.arms <- c() #Store which arm you have played at each stage
  
  #Doing this over a finite horizon, n stages
  for(n in 1:N){
    
    hold.max.upper.bound <- -Inf
    
    for(arm in 1:num.arms){ #Iterate over arms
      if(sum(played.arms==arm)>0){
        
      reward <- winnings[arm]/(sum(played.arms==arm))
      delta <- sqrt(2 * log(n) / sum(played.arms==arm))
      upper.bound <- reward + delta
      
      }else{upper.bound <- Inf}
      
      if(upper.bound > hold.max.upper.bound){
        track.arm <- arm
        upper.bound -> hold.max.upper.bound
      }
      
    }
    played.arms<- c(played.arms, track.arm)
    
    if(simulations[n, track.arm] == 1){
      
      winnings[track.arm] <- winnings[track.arm] + 1
      
    }else{
      
      losses[track.arm] <- losses[track.arm] + 1
      
    }
    
  }
  return(list(played.arms, winnings, losses, simulations))
}

Simulate the problem using UCB

N <- 1000 #The number of trials
p <- c(0.55, 0.7, 0.45) #The probability of winning for different bandits

simulate.MAB.upper.confidence.bound(N, p) ->results.lst

results.lst[[1]]->played.arms
results.lst[[2]]->winnings
results.lst[[3]]->losses
results.lst[[4]]->simulations

num.arms <- length(p)

#Matrices for tracking cumulative plays and cumulative wins for each arm
cumulative.plays <- matrix(0, nrow=length(played.arms), ncol=num.arms)
cumulative.winnings <- matrix(0, nrow = nrow(simulations), ncol = num.arms)

#Iterate over arms and find cumulative wins and cumulative plays
for (i in 1:length(played.arms)) {
  if (i > 1) {
    cumulative.plays[i, ] <- cumulative.plays[i-1, ]
    cumulative.winnings[i, ] <- cumulative.winnings[i - 1, ]
  }
  arm_played <- played.arms[i]
  cumulative.plays[i, played.arms[i]] <- cumulative.plays[i, played.arms[i]] + 1
  cumulative.winnings[i, arm_played] <- cumulative.winnings[i, arm_played] + simulations[i, arm_played]
}

#Convert matrix to dataframe

cum.plays.df <- as.data.frame(cumulative.plays)
colnames(cum.plays.df) <- paste0("Arm ", 1:num.arms)
cum.plays.df$Stage <- 1:nrow(cum.plays.df)

#Reshape dataframe to long format
cum.plays.long <- pivot_longer(cum.plays.df, cols = starts_with("Arm "),
                               names_to = "Arm", values_to = "CumulativePlays")


ggplot(cum.plays.long, aes(x = Stage, y = CumulativePlays, color = Arm)) +
  geom_line(size = 1) +
  labs(title = "Upper Confidence Bound",
       x = "Stage",
       y = "Cumulative plays") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")+
  scale_color_manual(values = c("Arm 1" = "#17153B", "Arm 2" = "#C8ACD6", "Arm 3" = "#6A9C89"))

cum.winnings.df <- as.data.frame(cumulative.winnings)
colnames(cum.winnings.df) <- paste0("Arm ", 1:num.arms)
cum.winnings.df$Stage <- 1:nrow(cum.winnings.df)

#Reshape dataframe to long format
cum.winnings.long <- pivot_longer(cum.winnings.df, cols = starts_with("Arm "),
                                  names_to = "Arm", values_to = "CumulativeWinnings")


ggplot(cum.winnings.long, aes(x = Stage, y = CumulativeWinnings, color = Arm)) +
  geom_line(size = 1) +
  labs(title = "Upper Confidence Bound", 
       x = "Stage", y = "Cumulative winnings") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "B")+
  scale_color_manual(values = c("Arm 1" = "#17153B", "Arm 2" = "#C8ACD6", "Arm 3" = "#6A9C89"))

Epsilon-greedy

Using the $\epsilon$-greedy approach, we aim to find the best arm by balancing exploration and exploitation. We first pick an $\epsilon$ ranging from 0 to 1 and at each stage we compare a random number drawn from a uniform distribution from 0 to 1 and compare it to $\epsilon$. If it is smaller than $\epsilon$ then we pick an arm randomly otherwise we pick the arm with the highest observed probability of winning. This means that an $\epsilon$ of 1 corresponds to complete randomness.

#' Simulate the MAB problem over a finite horizon using epsilon-greedy
#'
#' @param N the number of trials
#' @param p the actual probability of winning by playing each arm
#' @param eps epsilon value
#'
#' @return a list containing played arms, winnings per arm, losses per arm, and simulations
#' @export
simulate.MAB.epsilon.greedy <- function(N, p, eps){
  
  simulations <- sapply(p, function(prob) rbinom(N, 1, prob)) #You win or lose based
  #on a Bernoulli distribution
  
  num.arms <- length(p) #Number of bandits
  
  #Store number of winnings and losses per bandit
  winnings <- rep(0, num.arms)
  losses <- rep(0, num.arms)
  
  played.arms <- c() #Store which arm you have played at each stage
  
  #Doing this over a finite horizon, n stages
  for(n in 1:N){
    rand.num <- runif(1)
    
    if(rand.num < eps){
      
      track.arm <- sample(num.arms)[1]
      
    }else{
      
      observed.prob.win <- winnings/(winnings+losses)
      observed.prob.win[is.na(observed.prob.win)]<- 0
      track.arm <- which.max(observed.prob.win)
      
    }
    
    played.arms <- c(played.arms, track.arm)
    
    if(simulations[n, track.arm] == 1){
      
      winnings[track.arm] <- winnings[track.arm] + 1
      
    }else{
      
      losses[track.arm] <- losses[track.arm] + 1
      
    }
    
  }
  return(list(played.arms, winnings, losses, simulations))
}

Simulate the problem using epsilon-greedy

N <- 1000 #The number of trials
p <- c(0.55, 0.7, 0.45) #The probability of winning for different bandits

simulate.MAB.epsilon.greedy(N, p, 0.2) -> results.lst

results.lst[[1]]->played.arms
results.lst[[2]]->winnings
results.lst[[3]]->losses
results.lst[[4]]->simulations

num.arms <- length(p)

#Matrices for tracking cumulative plays and cumulative wins for each arm
cumulative.plays <- matrix(0, nrow=length(played.arms), ncol=num.arms)
cumulative.winnings <- matrix(0, nrow = nrow(simulations), ncol = num.arms)

#Iterate over arms and find cumulative wins and cumulative plays
for (i in 1:length(played.arms)) {
  if (i > 1) {
    cumulative.plays[i, ] <- cumulative.plays[i-1, ]
    cumulative.winnings[i, ] <- cumulative.winnings[i - 1, ]
  }
  arm_played <- played.arms[i]
  cumulative.plays[i, played.arms[i]] <- cumulative.plays[i, played.arms[i]] + 1
  cumulative.winnings[i, arm_played] <- cumulative.winnings[i, arm_played] + simulations[i, arm_played]
}

#Convert matrix to dataframe

cum.plays.df <- as.data.frame(cumulative.plays)
colnames(cum.plays.df) <- paste0("Arm ", 1:num.arms)
cum.plays.df$Stage <- 1:nrow(cum.plays.df)

#Reshape dataframe to long format
cum.plays.long <- pivot_longer(cum.plays.df, cols = starts_with("Arm "),
                               names_to = "Arm", values_to = "CumulativePlays")


ggplot(cum.plays.long, aes(x = Stage, y = CumulativePlays, color = Arm)) +
  geom_line(size = 1) +
  labs(title = "Epsilon-greedy",
       x = "Stage",
       y = "Cumulative plays") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")+
  scale_color_manual(values = c("Arm 1" = "#17153B", "Arm 2" = "#C8ACD6", "Arm 3" = "#6A9C89"))

cum.winnings.df <- as.data.frame(cumulative.winnings)
colnames(cum.winnings.df) <- paste0("Arm ", 1:num.arms)
cum.winnings.df$Stage <- 1:nrow(cum.winnings.df)

#Reshape dataframe to long format
cum.winnings.long <- pivot_longer(cum.winnings.df, cols = starts_with("Arm "),
                                  names_to = "Arm", values_to = "CumulativeWinnings")


ggplot(cum.winnings.long, aes(x = Stage, y = CumulativeWinnings, color = Arm)) +
  geom_line(size = 1) +
  labs(title = "Epsilon-greedy", 
       x = "Stage", y = "Cumulative winnings") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "B")+
  scale_color_manual(values = c("Arm 1" = "#17153B", "Arm 2" = "#C8ACD6", "Arm 3" = "#6A9C89"))

Compare strategies

Let’s compare the run times of the algorithms:

num <- seq(10, 10000, 100) #change the number of stages from 10 to 10k


epsilon_greedy_times <- numeric(length(num))
thompson_sampling_times <- numeric(length(num))
ucb_times <- numeric(length(num))


for (i in seq_along(num)) {
  n <- num[i]  
  
  #Epsilon-greedy
  start.time <- Sys.time()
  simulate.MAB.epsilon.greedy(n, p, 0.2)  
  end.time <- Sys.time()
  epsilon_greedy_times[i] <- as.numeric(difftime(end.time, start.time, units = "secs"))
  
  #Thompson sampling
  start.time <- Sys.time()
  simulate.MAB.thompson.sampling(n, p)  
  end.time <- Sys.time()
  thompson_sampling_times[i] <- as.numeric(difftime(end.time, start.time, units = "secs"))
  
  #Upper Confidence Bound (UCB)
  start.time <- Sys.time()
  simulate.MAB.upper.confidence.bound(n, p)  
  end.time <- Sys.time()
  ucb_times[i] <- as.numeric(difftime(end.time, start.time, units = "secs"))
}


results <- data.frame(
  n = num,
  Epsilon_Greedy = epsilon_greedy_times,
  Thompson_Sampling = thompson_sampling_times,
  UCB = ucb_times
)

ggplot(results, aes(x = n)) +
  geom_line(aes(y = Epsilon_Greedy, color = "Epsilon Greedy"), size=1.25) +
  geom_line(aes(y = Thompson_Sampling, color = "Thompson Sampling"), size=1.25) +
  geom_line(aes(y = UCB, color = "UCB"), size=1.25) +
  labs(title = "Algorithm Run Times", x = "Stages", y = "Time (seconds)") +
  theme_minimal() +
  scale_color_manual(name = "Algorithm", values = c("Epsilon Greedy" = "#674188", 
                                                    "Thompson Sampling" = "#E2BFD9", 
                                                    "UCB" = "#921A40"))+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )

Gold mining problem

Problem is taken from Richard Bellman’s book on stochastic dynamic programming.

Suppose there are two gold mines, Anaconda ($A$) and Bonanza ($B$) and each has a certain amount of gold $x$ and $y$. Now suppose that we are given the option to mine gold from either mine $A$ or mine $B$ and we know that we have a probability $p_1$ of succeeding if mining from mine $A$ to extract $r_1$ fraction of the gold and a probability $p_2$ of succeeding if mining from mine $B$ to extract an $r_2$ fraction of the gold. At each stage if we fail to extract gold, our mining machine will be damaged beyond repair.

To understand the optimal strategy, let us start with the first stage. We know that the expected amount of gold is $p_1(r_1)x$ if we choose to mine from mine $A$ and $p_2(r_2)y$ if we choose to mine from the second mine. If we have only one stage then we would choose the mine that maximizes the expected extracted gold.

$V_1(x, y) = max[p_1(r_1)x, p_2(r_2)y]$

Now if we were to extend to this to an $N$-stage problem then the Bellman equation would become:

$V_N(x, y) = max[p_1(r_1(x) + V_{N-1}((1-r_1)x, y)), p_2(r_2(y) + V_{N-1}(x, (1-r_2)y))]$

#' Bellman equation function
#'
#' @param N the number of stages
#' @param x the total gold at mine A
#' @param y the total gold at mine B
#' @param p the vector containing the probability of successful extraction of gold from either mine
#' @param r the vector containing the probability of fraction of gold mined from either mine
#'
#' @return the expected extracted gold
#' @export
V <- function(N, x, y, p, r){
  
  if(N==1){
    return(max(p[1]*r[1]*x, p[2]*r[2]*y))
  }
  
  
  return(max(p[1]*(r[1]*x + V(N-1, (1-r[1])*x, y, p, r)), p[2]*(r[2]*y + V(N-1, x, (1-r[2])*y, p, r))))
  
}



#' Simulation function for the gold mininig problem
#'
#' @param N the number of stages
#' @param x the total gold at mine A
#' @param y the total gold at mine B
#' @param p the vector containing the probability of successful extraction of gold from either mine
#' @param r the vector containing the probability of fraction of gold mined from either mine
#'
#' @return the simulated extracted gold
#' @export
simulate.problem <- function(N, x, y, p, r){
  
  extracted.gold <- 0
  
  for(n in 1:N){
    
    if(x*p[1]*r[1]>y*p[2]*r[2]){
      
      if(rbinom(1, 1, p[1])){
        
        extracted.gold <- extracted.gold + r[1]*x
        x <- x - r[1]*x
        
      }else{
        return(extracted.gold)
      }
      
        
    }else{
      
      if(rbinom(1, 1, p[2])){
        
        extracted.gold <- extracted.gold + r[2]*y
        y <- y - r[2]*y
        
      }else{
        return(extracted.gold)
      }
      
    }
    
  }
  
  return(extracted.gold)
  
}

N <- 20
x <- 50
y <- 70
#p <- c(0.7, 0.8)
r <- c(0.5, 0.4)
num.simulations <- 1000

p1.greater.theory <- c()
p2.greater.theory <- c()

p1.greater.sim <- c()
p2.greater.sim <- c()

for(x in seq(50, 150, 5)){
  
  p1.greater.theory <- c(p1.greater.theory, V(N, x, y, c(0.9, 0.6), r))
  p1.greater.sim <- c(p1.greater.sim, mean(replicate(num.simulations, simulate.problem(N, x, y, c(0.9, 0.6), r))))
  
  p2.greater.theory <- c(p2.greater.theory, V(N, x, y, c(0.6, 0.9), r))
  p2.greater.sim <- c(p2.greater.sim, mean(replicate(num.simulations, simulate.problem(N, x, y, c(0.6, 0.9), r))))
  
}


results.df <- data.frame(x = seq(50, 150, 5), theory_p1_greater = p1.greater.theory, theory_p2_greater = p2.greater.theory, simulation_p1_greater = p1.greater.sim, simulation_p2_greater = p2.greater.sim)


ggplot(results.df)+
  geom_line(aes(x=x, y=theory_p1_greater, color="p1=0.9, p2=0.6 theory"), size=1.25)+
  geom_line(aes(x=x, y=theory_p2_greater, color="p1=0.6, p2=0.9 theory"), size=1.25)+
  geom_point(aes(x=x, y=simulation_p1_greater, color="p1=0.9, p2=0.6 sim"), stroke=1.5, shape=8)+
  geom_point(aes(x=x, y=simulation_p2_greater, color="p1=0.6, p2=0.9 sim"), stroke=1.5, shape=5)+
  scale_color_manual(values=c("p1=0.9, p2=0.6 theory" = "#5C5470","p1=0.9, p2=0.6 sim"="#B9B4C7" , "p1=0.6, p2=0.9 sim" = "#629584", "p1=0.6, p2=0.9 theory" ="#387478"))+labs(title="Gold mining;\n N=20, y=70, r=[0.5, 0.4]", x="x", y="Extracted gold amount", colour="Theory/simulation")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )

Discounted Programming

Machine replacement model

The following problem is an R implementation of the machine replacement example from Sheldon Ross’s “Introduction to Dynamic Programming” book, Example 3.1.

Suppose we have a machine with three different states “Perfect” (state: 0), “Good” (state: 1), and “Bad” (state: 2). We have 20 time periods (20 months) and at the end of each month we can either replace the machine with a cost of $R$ or keep the machine and pay some fees to maintain the machine.

Here, we consider a basic three state model.

Let’s consider the non-decreasing linear maintenance cost function $C(i) = \alpha*i$ (Don’t confuse $\alpha$ with the discount factor used later on).

#' Maintenance cost function 
#'
#' @param i the state
#'
#' @return the cost at state i
#' @export
C <- function(i){
  
  alpha <- 2
  return(alpha*i)
  
}

Define probability transition matrix

#Define the probability transition matrix ($P$). The transition matrix
#defines the probability of transitioning from one state to another. 
#The element at index $i, j$ denotes the probability of transitioning from state
#$i$ to state $j$. 

P <- matrix(
  
  c(0.4, 0.4, 0.2, 0.2, 0.4, 0.4, 0.1, 0.2, 0.7), 
  
  nrow = 3,  ncol = 3,  byrow = TRUE          
)

Let’s first define the expected theoretical cost function. We have two actions at each step, either we replace the machine or we don’t. If we replace the machine, the state of the machine switches to 0 and we pay a replacement cost. Otherwise the machine transitions to another state with the probabilities defined in the transition matrix $P$ and we pay a maintanance cost depending on the state of the system $C(i)$. The discount factor is added to take into account the decrease in the value of money overtime.

#' Expected cost function
#'
#' @param i state $i$
#' @param n number of 
#' @param P probability transition matrix
#' @param R cost of replacing the machine
#' @param alpha discount factor
#'
#' @return the expected cost
#' @export
V <- function(i, n, P, R, alpha){
  
  #boundary condition
  if(n==1){
    return(C(i))
  }
  
  sum <- 0

  for(j in 0:(dim(P)[1]-1)){

    sum <- sum + P[i+1, j+1] * V(j, n-1, P, R, alpha)
    
  }
  
  return(C(i) + min((R + alpha*V(0, n-1, P, R, alpha)), alpha*sum))
  
  
}


#Find $V_n(i)$ and plot for $\alpha$ = 0.3

df.lim <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(df.lim)<-c("i", "ii", "iii")
k<-1
p<-1

for(state in 0:2){
  k<-1
  for(n in 1:11){
    
    df.lim[k,p] <- V(state, n, P, 1, 0.3)
    k<-k+1
  }
  p<-p+1
  
}

df.lim$n <- 1:11

ggplot(data=df.lim)+
  geom_line(aes(x=n, y=i, colour = "State: 0"), size = 1.5)+
  geom_line(aes(x=n, y=ii, colour = "State: 1"), size = 1.5)+
  geom_line(aes(x=n, y=iii, colour = "State: 2"), size = 1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) +
  labs(tag = "A")+
  scale_color_manual(values = c("State: 0" = "#B9B4C7", "State: 1" = "#5C5470", "State: 2" = "#352F44"))+
  labs(colour = "Initial state", x = "Number of periods (n)", y="Expected cost")

#Save the results at the end of stage 11 as $V(i)$
df.lim <- df.lim[nrow(df.lim), 1:3]

In search for $\breve{\imath}$

$\breve{\imath}$ is the stage at which we should replace the machine

#' Finding i bar
#'
#' @param df.lim the expected optimal cost for the system starting with state i, V(i)
#' @param P the probability transition matrix
#' @param R the cost of replacement
#' @param alpha the discount factor
#'
#' @return the optimal stage to replace
#' @export
find_i_bar <- function(df.lim, P, R, alpha){
  
  
  hold.i.bar <- c()
  
  A <- R + alpha * df.lim[1] #compare the expected cost for replacement (A)
  
  #To the cost for maintance without replacing B

  for(i in 0:2){
    
    sum <- 0
    for (j in 0:(dim(P)[1]-1)) {
      sum <- sum + P[i+1, j+1] * df.lim[j+1]
    }
    
    B <- sum * alpha

    hold.i.bar <- c(hold.i.bar, A<=B)
    
  }
  
  return(hold.i.bar)
}

# Find $\breve{\imath}$ for $\alpha$ = 0.3 and $R$ equals 1 with transition matrix
#defined above.

min(which(find_i_bar(df.lim, P, 1, 0.3) == TRUE))-1 -> i_bar

Now let’s simulate the problem and compare simulation with theory

#' Simulating the machine replacement model
#'
#' @param n the number of time periods
#' @param i the initial state of the system
#' @param i_bar the stage we replace the machine
#' @param P the probability transition matrix
#' @param R the cost of machine replacement
#' @param alpha the discount factor
#'
#' @return the total cost
#' @export
simulate.problem <- function(n, i, i_bar, P, R, alpha){
  
  cost <- 0
  state <- i
  
  for(k in 1:n){ #iterate over time periods
    
    if(state >= i_bar){ #see if we have reached the time period at which we replace the machine or not
      
      
      cost_to_add <- R + C(state) #if we replace then we incur the replacement cost
      #NOTE: because our C(i) function is equal to 2i then we don't incur any maintenance cost when we are at state 0, we may have to add the C(i) cost for other functions
      state <- 0 #reset to state 0
      
    }else{
      
      cost_to_add <- C(state) #we incur maintenance cost
      
      probabilities <- P[state+1,] #the system transitions to another state with the probabilities defined in the transition matrix
      
      result <- sample(c(0, 1, 2), size = 1, prob = probabilities)
      state <- result
      
    }
    
    cost <- cost + cost_to_add*alpha^(k-1) #take into account the discount factor
    
  }
  
  return(cost)
  
}


#Compare simulations and theory

theory.vector <- c()

num_simulations <- 1000
data.frame(matrix(ncol = 3, nrow = num_simulations)) -> hold.simulations.mat

for(state in 0:2){
  
  hold.sims <- c()
  
  for(k in 1:num_simulations){
    
    hold.sims <- c(hold.sims, simulate.problem(11, state, i_bar, P, 1, 0.3))
    
  }
  
  hold.simulations.mat[,state+1] <- hold.sims
  
  theory.vector <- c(theory.vector, V(state, 11, P, 1, 0.3))
  
  
}

colnames(hold.simulations.mat) <- c("i", "ii", "iii")

#Load libraries for melting the dataframe and showing LaTeX in ggplot
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(latex2exp)

melt(hold.simulations.mat, variable.name = "state") -> melted.simulations.mat

## No id variables; using all as measure variables

ggplot() + 
  geom_boxplot(data = melted.simulations.mat, aes(x=state, y=value), notch=TRUE, 
               size = 2, fill = c('#B9B4C7', '#5C5470', '#352F44'))+
  geom_point(aes(x = 1, theory.vector[1]), shape = 8, color = "#E3A5C7", stroke = 8)+
  geom_point(aes(x = 2, theory.vector[2]), shape = 8, color = "#B692C2", stroke = 8)+
  geom_point(aes(x = 3, theory.vector[3]), shape = 8, color = "#694F8E", stroke = 8)+
  scale_fill_brewer(palette="Blues") + 
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+
  labs(x = "State", y="Cost", title = TeX("R = 1, $\\alpha$ = 0.3, $C(i) = 2i$"), tag = "A")

And we see that simulating the model (box plots) gives us results consistent with the theory (shown with flowers).

Quality control models

Problem is taken from Sheldon Ross’s book on Stochastic Dynamic Programming. The solution to the problem is taken from:

Wang, R. C. (1976). Computing optimal quality control policies—two actions. Journal of Applied Probability, 13(4), 826-832.

Quality control model with two hidden states

Consider a machine that can be in either of two states, good or bad. At the beginning of each day, the machine produces items that are either defective or nondefective. The probability of a defective item is $p_1$ when in the good state and $p_2$ when in the bad state. Once in the bad state, the machine remains in this state until it is replaced. However, if the machine is in the good state at the beginning of one day, then with probability $\gamma$ it will be in the bad state at the beginning of the next day. A decision as to whether or not to replace the machine must be made each day after observing the item produced. Let R be the cost of replacing the machine and let C be the cost incurred whenever a defective item is produced. Set this problem up as a Markov decision model and determine the functional equation satisfied by V. Assume that at time zero there is a known probability that the machine is in the bad state.

To determine the point to replace we can start by taking into consideration the posterior probability of the machine being in a bad state which we refer to as $p$. The posterior probability at day $n$ is $1-(1-p)(1-\gamma)^n$ since there is a $1-p$ probability of the machine being in a good state and a $1-\gamma$ probability of remaining in a good state and we are interested in the posterior probability of the machine being in a bad state. The optimal policy keeps the machine up to some point and then replaces it after the posterior probability of the machine being in a bad state passes a certain threshold.

To find this, we consider a function $L$ to demonstrate the incurred cost due to defective item that get produced in a day. $L$ depends on the posterior probability of the machine being in a bad state. We can define $L(p, p_1, p_2, C) = (pp_2 + (1-p)p_1)C$. Since either the machine is in a bad state and generates a defective item and results in a cost of $C$ with probability $p(p_2)$ or the machine is in a good state and generates a defective item with probability $(1-p)p_1$.

#' Incurred cost
#'
#' @param p the posterior probability of the machine being in the bad state
#' @param p_1 the probability of producing a defective item when in the good state
#' @param p_2 the probability of producing a defective item when in the bad state
#' @param C the cost of producing a defective item
#'
#' @return the total incurred cost
#' @export
L <- function(p, p_1, p_2, C){
  
  return(p*(p_2-p_1)*C + p_1*C)
  
}



#' The optimal incurred cost function
#'
#' @param N the number of stages
#' @param gamma the probability of transitioning into the bad state from the good state
#' @param alpha the discounting factor
#' @param p the posterior probability of the machine being in a bad state
#' @param p_1 the probability of producing a defective item when in the good state
#' @param p_2 the probability of producing a defective item when in the bad state
#' @param C the cost of producing a defective item
#' @param R the cost of replacing the machine
#'
#' @return the expected cost
#' @export
V <- function(N, gamma, alpha, p, p_1, p_2, C, R){
  
  if(N==0){
    return(0)
  }
  p <-  1 - (1-p)*(1-gamma)
  
  return(min(L(p, p_1, p_2, C)+alpha*V(N-1, gamma, alpha, p, p_1, p_2, C, R), R + L(0, p_1, p_2, C) + alpha*V(N-1, gamma, alpha, p, p_1, p_2, C, R)))
  
}




#' Upper bound of the theoretical expected cost
#'
#' @param p the posterior probability of the machine being in a bad state
#' @param p_1 the probability of producing a defective item when in the good state
#' @param p_2 the probability of producing a defective item when in the bad state
#' @param C the cost of producing a defective item
#' @param R the cost of replacing the machine
#'
#' @return
#' @export
upper.bound.V <- function(p, p_1, p_2, C, R, alpha){
  return((L(p, p_1, p_2, C)+R)/(1-alpha))
}


#' Lower bound of the theoretical expected cost
#'
#' @param p the posterior probability of the machine being in a bad state
#' @param p_1 the probability of producing a defective item when in the good state
#' @param p_2 the probability of producing a defective item when in the bad state
#' @param C the cost of producing a defective item
#' @param R the cost of replacing the machine
#'
#' @return
#' @export
lower.bound.V <- function(p_1, p_2, C, R, alpha){
  return(L(0, p_1, p_2, C)/(1-alpha))
}

Find critical posterior probability for resetting

To find the posterior probability threshold we first have to find the smallest integer $t$ such that

$\sum_{j=0}^t(\sum_{k=0}^j \alpha^k)(1-\gamma)^j \geq \frac{R}{(\gamma(p_2-p_1)C)}$

#' function to caluclate the left hand side of the $t$ equation defined above
#'
#' @param t the smallest integer for which the above equation holds
#' @param alpha the discounting factor
#' @param gamma the probability of the machine transitioning from a good state 
#' to a bad state
#'
#' @return the left hand side of the equation
#' @export
LHS_t_alpha_equation <- function(t, alpha, gamma){
  
  outer.sum <- 0
  
  for(j in 0:t){
    
    inner.sum <- 0
    
    for(k in 0:j){
      inner.sum <- inner.sum + alpha^k
    }
    
    outer.sum <- outer.sum + inner.sum * (1-gamma)^j 
    
  }
  
  return(outer.sum)
}



#' function to caluclate the right hand side of the $t$ equation defined above
#'
#' @param R the cost of replacing the machine
#' @param C the cost of producing defective product
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param p_1 the probability of a good machine producing a defective product
#' @param p_2 the probability of a bad machine producing a defective product
#'
#' @return the RHS of the equation
#' @export
RHS_alpha_t_equation <- function(R, C, gamma, p_1, p_2){
  return(R/(gamma*(p_2-p_1)*C))
}




#' Find critical integer
#'
#' @param R the cost of resetting the machine
#' @param C the cost of producing a defective product
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param p_1 the probability of a good machine producing a defective product
#' @param p_2 the probability of a bad machine producing a defective product
#' @param alpha the discount factor
#'
#' @return the critical t
#' @export
find.t.alpha <- function(R, C, gamma, p_1, p_2, alpha){
  
  hold.LHS <- c()
  
  for(t in 0:500){
    
    hold.LHS <- c(hold.LHS, LHS_t_alpha_equation(t, alpha, gamma))
    
  }
  
  return(min(which(hold.LHS >= RHS_alpha_t_equation(R, C, gamma, p_1, p_2))))
  
}


#Find the critical posterior


#' Find the critical posterior
#'
#' @param R the cost of resetting the machine
#' @param C the cost of producing a defective product
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param p_1 the probability of a good machine producing a defective product
#' @param p_2 the probability of a bad machine producing a defective product
#' @param alpha the discount factor
#'
#' @return the critical posterior probability of the machine being in a bad state
#' @export
find.critical.posterior <- function(R, C, gamma, p_1, p_2, alpha){
  
  hold.sum.numerator <- 0
  hold.sum.denominator <- 0
  critical.t <- find.t.alpha(R, C, gamma, p_1, p_2, alpha) - 1
  
  if(critical.t == Inf){
    return(1)
  }
  
  for(j in 0:critical.t){
    hold.sum.numerator <- hold.sum.numerator + alpha^j*(1-(1-gamma)^j) 
    hold.sum.denominator <- hold.sum.denominator + alpha^j
  }

  return((hold.sum.numerator+ R/((p_2-p_1)*C))/hold.sum.denominator)
}

Let’s first start by simulating the problem where we know the probabilities $p_1$, $p_2$, and $\gamma$.

#' Simulation function for the quality control problem
#'
#' @param N the number of stages
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param alpha the discount factor
#' @param p the initial assumption on the posterior probability of the machine being in
#' a bad state
#' @param p_1 the probability of a good machine producing a defective product
#' @param p_2 the probability of a bad machine producing a defective product
#' @param C the cost of producing a defective item
#' @param R the cost of resetting the machine to a good state
#'
#' @return the total incurred cost
#' @export
simulate.problem <- function(N, gamma, alpha, p, p_1, p_2, C, R){
  #Start with a good machine
  state <- 0
  L.sum <- 0
  
  posterior.threshold <- find.critical.posterior(R, C, gamma, p_1, p_2, alpha)
  
  for(n in 1:N){
    
    if(state == 0){
      
      defective.product <- rbinom(1, 1, p_1)
      state <- rbinom(1, 1, gamma)
      
    }else{
      
      defective.product <- rbinom(1, 1, p_2)
      
    }
    if(defective.product == 1){
      
      L.sum <- C*alpha^(n-1) + L.sum
      
    }
    
    p <- 1 - (1-p)*(1-gamma)
    
    if(p >= posterior.threshold){
      state <- 0
      L.sum <- L.sum + R*alpha^(n-1)
      p <- 0
    }
    
  }

  return(L.sum)
  
}


##Check the upperbound and lowerbound on the cost
upper.bound.changing.alpha <- c()
lower.bound.changing.alpha <- c()
total.expected.cost <- c()
simulated.cost <- c()

for(alpha in seq(0.005, 0.95, 0.005)){
  upper.bound.changing.alpha <- c(upper.bound.changing.alpha, upper.bound.V(0.1, 0.1, 0.8, 4, 10, alpha))
  lower.bound.changing.alpha <- c(lower.bound.changing.alpha, lower.bound.V(0.1, 0.8, 4, 10, alpha))
  total.expected.cost <- c(total.expected.cost, V(10, 0.1, alpha, 0.1, 0.1, 0.8, 4, 10))
  simulated.cost <- c(simulated.cost, simulate.problem(10, 0.1, alpha, 0.1, 0.1, 0.8, 4, 10))
}


boundaries.df <- data.frame(alpha = seq(0.005, 0.95, 0.005), lower = lower.bound.changing.alpha, upper = upper.bound.changing.alpha, cost = total.expected.cost, simulations = simulated.cost)

ggplot(boundaries.df)+
  geom_line(aes(x=alpha, y=lower, colour="Lower bound V (Inf horizon)"), size=1.2)+
  geom_line(aes(x=alpha, y=upper, colour="Upper bound V (Inf horizon)"), size=1.2)+
  geom_line(aes(x=alpha, y=cost, colour="Expected cost (V)"), size=1.2)+
  geom_line(aes(x=alpha, y=simulated.cost, colour="Simulated cost"), size=1.2)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+
  scale_color_manual(values = c("Lower bound V (Inf horizon)" = "#522258", "Upper bound V (Inf horizon)" = "#508C9B", "Expected cost (V)" ="#921A40",  "Simulated cost" = "#B692C2"))+
  labs(x = TeX("$\\alpha$"), y="Cost", title = TeX("R = 10, C =4, $\\gamma$ = 0.1, $p_1$ = 0.1, $p_2$ = 0.8"), colour = "Cost", tag = "A")

You may be interested to know when the switch from a good machine to a bad machine happens given that we only observe the products and know whether a produced product is defective or not. To understand that critical we can use the forward algorithm to find out the likelihood of observations given a hidden Markov model.

Let’s start with the simulation, it is similar to the function defined above but returns different parameters from the problem.

#' Function for simulating the machine with two states, good and bad problem
#'
#' @param N the number of stages
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param p_1 the probability of a good machine producing a defective product
#' @param p_2 the probability of a bad machine producing a defective product
#'
#' @return list containing a vector that stores the quality of items produced, a vector that stores the posterior probability of the machine being in a bad state, the stage at which the machine switches from a good state to a bad state.
#' @export
simulate.machine.problem <- function(N, gamma, p_1, p_2){
  
  state <- 0
  defective.vector <- c()
  posterior.vector <- c()
  for(i in 1:N){
    
    if(state==0){
      state <- rbinom(1, 1, gamma)
      defective.vector <- c(defective.vector, rbinom(1, 1, p_1))
      if(state==1){
        switching.stage <- i+1
      }
    }
    if(state==1){
      defective.vector <- c(defective.vector, rbinom(1, 1, p_2))
    }
    
    posterior.vector <- c(posterior.vector,1 - (1-p)*(1-gamma)^i)
    
  }
  
  return(list(defective.vector, posterior.vector, switching.stage))
  
}


#Apply forward algorithm 

#Load library
library(HMM)

#Define parameters
gamma <- 0.05
p_1 <- 0.1
p_2 <- 0.8
N <- 200


#Simulate the problem
machine.sims <- simulate.machine.problem(N, gamma, p_1, p_2)

machine.sims[[1]]->defective.vector
machine.sims[[2]]->posterior.vector
machine.sims[[3]]->switching.stage


hmm <- initHMM(c("Good","Bad"), c("D","N"),          
              transProbs=matrix(c((1-gamma),0, gamma, 1), nrow=2, ncol=2),
              emissionProbs=matrix(c(p_1, p_2,(1-p_1), (1-p_2)), nrow=2, ncol=2))


# Sequence of observations
defective.vector -> Y

Y[Y==0]<-"N"
Y[Y==1]<-"D"

# Calculate forward probablities
logForwardProbabilities = HMM::forward(hmm,Y)
exp(logForwardProbabilities) -> forward.probs

as.data.frame(t(forward.probs))->forward.probs

forward.probs$theoretical <- c(0, posterior.vector)
forward.probs$cumulative_forward <- cumsum(forward.probs[,1]>forward.probs[,2])

#ggplot(forward.probs)+
#  geom_line(aes(x = seq(1, N+1), y = theoretical), color='darkorange')+
#  geom_vline(xintercept=switching.stage, linetype='dashed', size=1.5)


#The dashed line below shows the timepoint where the good machine actually switches to a bad state.

ggplot(forward.probs)+
  geom_rect(aes(xmin=0, xmax=switching.stage, ymin=0, ymax=Inf), fill="#F1D3CE")+
  geom_rect(aes(xmin=switching.stage, xmax=N+1, ymin=0, ymax=Inf), fill="#D1E9F6")+
  geom_line(aes(x = seq(1, N+1), y = cumulative_forward), color = "#522258", size=2)+
  #geom_line(aes(x = seq(1, N+1), y = Good), color='navy')+
  #geom_line(aes(x = seq(1, N+1), y = Bad), color='darkred')+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+
  geom_vline(xintercept=switching.stage, linetype='dashed', size=1.5, color = "#C75B7A")+
  labs(x = "Observations", y="Cumulative forward prob", title = TeX("R = 10, C =4, $\\gamma$ = 0.1, $p_1$ = 0.1, $p_2$ = 0.8"), tag = "A")

Continuous time variation

Let’s consider another scenario what if neither the products get produced at regular time points nor the transition to a bad machine happens at regular time steps. Instead the timepoints are taken from an exponential distribution. To model this, we use a Continuous Time Hidden Markov (CTHMM) model.

The model is similar to two coupled stochastic process which evolve through two phases

The first and second phases evolve based on the following generator matrices respectively:

$G_1=\begin{bmatrix} -\mu_1 & \mu_1 \\ \mu_2 & -\mu_2 \end{bmatrix}$

$G_2=\begin{bmatrix} -\nu_1 & \nu_1 \\ \nu_2 & -\nu_2 \end{bmatrix}$

and the switch of the states happens through the following generator matrix:

$\Lambda=\begin{bmatrix} -\lambda & \lambda \\ 0 & 0 \end{bmatrix}$

#' Simulating the continuous time variation of the problem without intervention
#'
#' @param N 
#' @param mu_1 Rate of generating a defective item when in a good state
#' @param mu_2 Rate of generating a non-defective item when in a good state
#' @param v_1 Rate of generating a defective item when in a bad state
#' @param v_2 Rate of generating a non-defective item when in a bad state
#' @param lambda Rate of switching from a good state to a bad state
#'
#' @return observations, time of arrival of events, switching time of the stochastic processes
#' @export
simulate.machine.CTHMM.variation <- function(N, mu_1, mu_2, v_1, v_2, lambda){
  
  state <- 0 #Start at a good state
  observations <- c()
  timepoints <- c(0)
  for(n in 1:N){
    
    if(state ==0){ #If the machine is at a good state
      
      switch.tau <- rexp(1, rate=lambda)
      defective.tau <- rexp(1, rate=mu_1)
      nondefective.tau <- rexp(1, rate=mu_2)
      
      #Compare the time of arrival of the products (whether defective or non-defective)
      if(defective.tau<nondefective.tau){ #If the defective product arrives first
        timepoints <- c(timepoints, timepoints[n-1] + defective.tau)
        observations <- c(observations, "D")
      }else{ #If the non-defective product arrives first
        timepoints <- c(timepoints, timepoints[n-1] + nondefective.tau)
        observations <- c(observations, "N")
      }
      #Check if a change in phase happens
      if(switch.tau < min(defective.tau, nondefective.tau)){
        state <- 1
        switch.time <- timepoints[n-1] + switch.tau
      }
      
      
    }else{ #If the machine is at a bad state

      defective.tau <- rexp(1, rate=v_1)
      nondefective.tau <- rexp(1, rate=v_2)
      
      if(defective.tau<nondefective.tau){ #If the defective product arrives first
        timepoints <- c(timepoints, timepoints[n-1] + defective.tau)
        observations <- c(observations, "D")
      }else{ #If the non-defective product arrives first
        timepoints <- c(timepoints, timepoints[n-1] + nondefective.tau)
        observations <- c(observations, "N")
      }
    }
    
    
  }
  
  return(list(observations, timepoints, switch.time))
  
  
}

Simulate the CT variation without intervention

N <- 2000
mu_1 <- 0.1
mu_2 <- 1
v_1 <- 0.8
v_2 <- 0.1
lambda <- 0.001


results <- simulate.machine.CTHMM.variation(N, mu_1, mu_2, v_1, v_2, lambda)


observations <- results[[1]]
time <- results[[2]]
switch.time <- results[[3]]

ct.hmm.qc.df <- data.frame(time=time, observations=observations)

ggplot()+
  geom_rect(aes(xmin=0, xmax=switch.time, ymin=0, ymax=Inf), fill="#F1D3CE")+
  geom_rect(aes(xmin=switch.time, xmax=tail(ct.hmm.qc.df$time,1), ymin=0, ymax=Inf), fill="#D1E9F6")+
  geom_line(data = ct.hmm.qc.df, aes(x=time, y=cumsum(observations=="N")), color="#640D5F", size=1.5)+
  geom_vline(xintercept=switch.time)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+
  labs(title="CT variation simulation", y="Cumulative non-defective products")

Quality control model with inspection

Let us consider a variation of the above problem. Assume that if the machine is in a good state, it does not produce a defective product and it transitions to a bad state with probability $\gamma$ and once in that state it only produces defective products and does not transition back to a good state. You incur a cost of $C$ for producing defective products. You have the option to inspect the machine at each stage with cost $I$ and upon inspection, if you find the machine in a bad state you replace the machine with cost $R$. The question is when to inspect the machine to minimize the total incurred cost.

Define critical posterior probability

#' function to calculate the left hand side of the $t$ equation defined above
#'
#' @param t the smallest integer for which the above equation holds
#' @param alpha the discounting factor
#' @param gamma the probability of the machine transitioning from a good state 
#' to a bad state
#'
#' @return the left hand side of the equation
#' @export
LHS_t_alpha_equation <- function(t, alpha, gamma){
  
  outer.sum <- 0
  
  for(j in 0:t){
    
    inner.sum <- 0
    
    for(k in 0:j){
      inner.sum <- inner.sum + alpha^k
    }
    
    outer.sum <- outer.sum + inner.sum * (1-gamma)^j 
    
  }
  
  return(outer.sum)
}



#' function to calculate the right hand side of the $t$ equation defined above
#'
#' @param R the cost of replacing the machine
#' @param C the cost of producing defective product
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param alpha the discount factor
#' @param I the cost of inspecting the product
#'
#' @return the RHS of the equation
#' @export
RHS_alpha_t_equation <- function(R, C, gamma, alpha, I){
  return(I/(gamma*(alpha*(1-gamma)*(C+R) - R)))
}




#' Find critical integer
#'
#' @param R the cost of resetting the machine
#' @param C the cost of producing a defective product
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param alpha the discount factor
#' @param I the cost of inspecting the product
#'
#' @return the critical t
#' @export
find.t.alpha <- function(R, C, gamma, alpha, I){
  
  hold.LHS <- c()
  
  for(t in 0:500){
    
    hold.LHS <- c(hold.LHS, LHS_t_alpha_equation(t, alpha, gamma))
    
  }
  
  return(min(which(hold.LHS >= RHS_alpha_t_equation(R, C, gamma, alpha, I))))
  
}

#Find the critical posterior


#' Find the critical posterior
#'
#' @param R the cost of resetting the machine
#' @param C the cost of producing a defective product
#' @param gamma the probability of transitioning from a good machine to a bad machine
#' @param alpha the discount factor
#' @param I the cost of inspecting the product
#'
#' @return the critical posterior probability of the machine being in a bad state
#' @export
find.critical.posterior <- function(R, C, gamma, alpha, I){
  
  hold.sum.numerator <- 0
  hold.sum.denominator <- 0
  critical.t <- find.t.alpha(R, C, gamma, alpha, I) - 1
  
  if(critical.t == Inf){
    return(1)
  }
  
  for(j in 0:critical.t){
    hold.sum.numerator <- hold.sum.numerator + alpha^j*(1-(1-gamma)^j) 
    hold.sum.denominator <- hold.sum.denominator + alpha^j
  }
  
  return((hold.sum.numerator+ I/(alpha*(1-gamma)*(C+R) - R))/hold.sum.denominator)
}



#' Incurred cost
#'
#' @param p the posterior probability of the machine being in the bad state
#' @param alpha the discount factor
#' @param C the cost of producing a defective product
#' @param R the cost of resetting the machine
#'
#' @return the total incurred cost
#' @export
L <- function(p, alpha, C, R){
  
  return(p*((alpha)*(1-gamma)*(C+R)-R) + alpha*gamma*(C+R))
  
}

#' The optimal incurred cost function
#'
#' @param N the number of stages
#' @param gamma the probability of transitioning into the bad state from the good state
#' @param alpha the discount factor
#' @param p the posterior probability of the machine being in a bad state
#' @param C the cost of producing a defective item
#' @param R the cost of replacing the machine
#' @param I the inspection cost
#'
#' @return the expected cost
#' @export
V <- function(N, gamma, alpha, p, C, R, I){
  
  if(N==0){
    return(0)
  }
  updated.p <- 1 - (1-p)*(1-gamma)

  return(min(p*C + alpha*V(N-1, gamma, alpha, updated.p, C, R, I), I + p*(C+R) +alpha*V(N-1, gamma, alpha, 0, C, R, I)))
  
}

Simulate the problem

#' The simulated incurred cost found through simulation
#'
#' @param N the number of stages
#' @param gamma the probability of transitioning into the bad state from the good state
#' @param alpha the discount factor
#' @param p the posterior probability of the machine being in a bad state
#' @param C the cost of producing a defective item
#' @param R the cost of replacing the machine
#' @param I the inspection cost
#'
#' @return the simulated cost
#' @export
simulate.problem <- function(N, gamma, alpha, p, C, R, I){
  
  state <- 0
  posterior.bad <- p
  sum.cost <- 0
  critical.posterior <- find.critical.posterior(R, C, gamma, alpha, I) #find the critical posterior probability of inspection

  for(n in 1:N){
    
    if(state == 1){ #if machine is in a bad state, incur costs related to the production of defective products
      sum.cost <- sum.cost + C*alpha^(n-1)
    }
    
    posterior.bad <- 1 - (1-posterior.bad)*(1-gamma) #update posterior
    
    if(posterior.bad >= critical.posterior){
      sum.cost <- sum.cost + I*alpha^(n-1) #incur inspection costs
      
      if(state == 1){ #costs related to replacement, if machine is found to be in a bad state
        sum.cost <- sum.cost + R*alpha^(n-1)
        state <- 0
        
      }
      posterior.bad <- 0

    }
    
    if(state ==0){ #simulate transition to the bad state from good state
      state <- rbinom(1, 1, gamma)
    }

    
  }
  
  return(sum.cost)
  
}

Red and green coin

Suppose we have two coins, a green one and a red one. We know that one of the coins comes heads with probability $p_1$ and the other comes heads with probability $p_2$ ($p_1 > p_2$). We don’t know which is which but we know that the red coin is the better coin ($p_1$) with probability $p$. We are flipping coins over a finite horizon and we try to maximize the expected reward. For each heads that we get we get a reward of 1 with a discount factor of $\beta$.

Bayesian inference

Let’s first try to track the posterior probability that the red coin is in fact the better coin. For that, we try either flipping the red coin or the green coin.

Obviously, if we flip the red coin and get heads then the posterior probability of the red coin being the better coin would get updated to:

$\frac{p(p_1)}{p(p_1)+(1-p)p_2}$

On the other hand if we flip the red coin and get tails then the posterior probability of the red coin being the better coin would get updated to:

$\frac{p(1-p_1)}{p(1-p_1)+(1-p)(1-p_2)}$

#' Function for updating the posterior probability of the red coin being the more successful coin
#' when flipping the red coin. The assumption is $p_1>p_2$
#' 
#' @param p the posterior probability of the red coin being the better coin
#' @param p_1 the probability of success for the more successful coin
#' @param p_2 the probability of success for the less successful coin
#' @param flip.result the result of flipping the coin; can be either "H" or "T"
#'
#' @return The updated posterior probability of the red coin being the more successful coin
#' @export
prior.update.red <- function(p, p_1, p_2, flip.result){
  
  if(flip.result == "H"){
    
    p.updated <- p*p_1/(p*p_1 + (1-p)*p_2)
    
  }else if(flip.result == "T"){
    
    p.updated <- p*(1-p_1)/(p*(1-p_1)+(1-p)*(1-p_2))
    
  }else{
    return("Error! Please include only 'H' or 'T' as function argument for 'flip.result'")
  }
  
  return(p.updated)
}

#If we flip the green coin then it would be easy to calculate the posterior probability of the
#green coin being the worse one (it is essentially the same as saying the red coin is the better one).

#If we flip the green coin and we get heads then the posterior probability would get updated to:

#$\frac{p(p_2)}{p(p_2)+(1-p)p_1}

#If we flip the green coin and get tails then the posterior probability would get updated to:

#$\frac{p(1-p_2)}{p(1-p_2)+(1-p)(1-p_1)}$

#' Function for updating the posterior probability of the red coin being the more successful coin
#' when flipping the green coin. The assumption is $p_1>p_2$
#'
#' @param p the posterior probability of the red coin being the better coin
#' @param p_1 the probability of success for the more successful coin
#' @param p_2 the probability of success for the less successful coin
#' @param flip.result the result of flipping the coin; can be either "H" or "T"
#'
#' @return The updated posterior probability of the red coin being the more successful coin
#' @export
prior.update.green <- function(p, p_1, p_2, flip.result){
  
  if(flip.result == "H"){
    
    p.updated <- p*p_2/(p*p_2+(1-p)*p_1)
    
  }else if(flip.result == "T"){
    
    p.updated <- p*(1-p_2)/(p*(1-p_2)+(1-p)*(1-p_1))
    
  }else{
    return("Error! Please include only 'H' or 'T' as function argument for 'flip.result'")
  }
  
  return(p.updated)
  
}

Simulate the problem

#' Simulate the green and red coin problem
#'
#' @param N the number of stages
#' @param p the posterior probability of the red coin being the one with probability $p_1$ of being
#' successful
#' @param p_1 the probability of success for coin 1
#' @param p_2 the probability of success for coin 2
#' @param beta the discount factor 
#'
#' @return the list of rewards for each stage; the coin we choose at each stage; the posterior probability of the red coin being
#' the more successful one; the actual coin with a higher probabilty - if 1 it is the red coin, if 0 it is the green coin
#' @export
simulate.problem <- function(N, p, p_1, p_2, beta){
  
  red.prior <- p
  #green.prior <- 1-p
  
  rbinom(1, 1, p)->coin.assignment

  n <- 1
  rewards <- c()
  chosen.coin <- c()

  red.posterior <- c()
  while(n < N+1){
  if(coin.assignment == 0){
    p_g <- p_1
    p_r <- p_2
  }else{
    p_r <- p_1
    p_g <- p_2
  }
  
  if(red.prior >= 1/2){
    chosen.coin <- c(chosen.coin, "red")
    rbinom(1, 1, p_r)->flipped.coin
    if(flipped.coin == 0){
      
      red.prior <- prior.update.red(red.prior, p_1, p_2, "T")

      rewards <- c(rewards, 0)
      red.posterior <- c(red.posterior, red.prior)
      
    }else{
      
      red.prior <- prior.update.red(red.prior, p_1, p_2, "H")

      rewards <- c(rewards, 1*beta^(n-1))
      red.posterior <- c(red.posterior, red.prior)
      
    }
    
  }else{
    chosen.coin <- c(chosen.coin, "green")
    rbinom(1, 1, p_g)->flipped.coin
    if(flipped.coin == 0){
      
      red.prior <- prior.update.green(red.prior, p_1, p_2, "T")
      
      rewards <- c(rewards, 0)
      red.posterior <- c(red.posterior, red.prior)
      
    }else{
      
      red.prior <- prior.update.green(red.prior, p_1, p_2, "H")

      rewards <- c(rewards, 1*beta^(n-1))
      red.posterior <- c(red.posterior, red.prior)
      
    }
  }
    n <- n+1
  }
  
  return(list(rewards, chosen.coin, red.posterior, coin.assignment))
}

Theoretical function

Let’s think about the theoretical function. We can let it run for a long time and set the base case to when the posterior probability reaches either 0 or 1 based on the convergence we saw in the simulations. But it will take a very long time. Consequently, I will run it for a set number of stages for demonstration purposes.

#' Bellman optimality equation function
#'
#' @param p the posterior probability of the red coin being the better coin
#' @param p_1 the probability of success for the more successful coin
#' @param p_2 the probability of success for the less successful coin
#' @param beta the discount factor
#' @param stages the number of stages
#'
#' @return the expected reward
#' @export
V <- function(p, p_1, p_2, beta, stages) {
    
    #The base case
    if (stages == 0) {
      return(0)
    }
    #We have two actions, either we: 
  
    #flip the red coin
    flip.red.action <- p * (p_1 + beta * V(p * p_1 / (p * p_1 + (1 - p) * p_2),p_1, p_2, beta, stages -1)) + 
      (1 - p) * (p_2 + beta * V(p * (1 - p_1) / (p * (1 - p_1) + (1 - p) * (1 - p_2)),p_1, p_2, beta, stages -1))
    
    #or we flip the green coin
    flip.green.action <- (1 - p) * (p_1 + beta * V((1 - p) * p_1 / ((1 - p) * p_1 + p * p_2),p_1, p_2, beta, stages -1)) + 
      p * (p_2 + beta * V((1 - p) * (1 - p_1) / ((1 - p) * (1 - p_1) + p * (1 - p_2)),p_1, p_2, beta, stages -1))
    
    return(max(flip.red.action, flip.green.action))
}
  

# Parameters
p <- 0.9    # Initial belief that red coin is better
p_1 <- 0.9  # Probability of heads for the better coin
p_2 <- 0.3  # Probability of heads for the worse coin
beta <- 0.95  # Discount factor



results.sim.vector <- c()
results.theoretical.vector <- c()

for(beta in seq(0, 0.95, 0.1)){
  
sims <- c()
for(k in 1:1000){
  sims <- c(sims, sum(simulate.problem(10, p, p_1, p_2, beta)[[1]]))
}
results.sim.vector <- c(results.sim.vector, mean(sims))
results.theoretical.vector <- c(results.theoretical.vector, V(p, p_1, p_2, beta, 10))

}


result.df <- data.frame(beta = seq(0, 0.95, 0.1), theory = results.theoretical.vector, simulations = results.sim.vector)

#Check simulations vs theory

ggplot(result.df)+
  geom_line(aes(x=beta, y=theory), color="#AC87C5", size=1.5)+
  geom_point(aes(x=beta, y=simulations), color="#E0AED0", stroke=2.5, shape=8)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(x=TeX("$\\beta"), y="Reward", title="Red and green coin problem; 10 stages")

Inventory problem

Consider the following inventory problem. At the beginning of each day, the amount of goods on hand is noted and a decision is made as to how much to order. The cost for ordering $j$ additional units is $C(j)$, where

\[ C(j) = \begin{cases} K + cj & \text{if } j > 0, \\ 0 & \text{if } j = 0. \end{cases} \]

The order is assumed to be immediately filled. After the order has been filled, the daily demand for the product occurs. The demand will be $j$ with probability $P_j, j \geq 0$. If the demand exceeds the present supply, then a penalty cost of $A$ per unit of unmet demand is incurred. It is also assumed that, if the demand exceeds the supply, then the additional demand is backlogged and is filled when additional inventory becomes available (this can be represented as negative inventory). In addition, there is an inventory holding cost of $h$ for each item of remaining inventory at the end of a period.

The objective is to minimize the total expected discounted cost over an infinite time horizon when $\alpha$ is the discount factor.

Set this up as a Markov decision process and write the optimality equation.

Consider now a single-period version of the preceding problem. Let $L(j) = A \sum_{k=j}^{\infty} (k-j)P_k + h \sum_{k=0}^{j} (j-k)P_k$ denote the expected penalty and holding costs if we order to bring inventory up to $j$.

Show that $L(j)$ is convex. That is, $L(j+1) - L(j)$ is non-decreasing in $j$. Show that the optimal policy when the initial inventory is $i$ is to order $S - i$ if $i < s$ and 0 if $i \geq s$ where $S$ is the value that minimizes $cj + L(j)$ and $s$ is such that $cs + L(s) = K + cs + L(S)$.

We take the demand from a Poisson distribution. To simulate the problem, let us first define some functions.

#' The expected incurred cost function for a single stage
#'
#' @param j the inventory value
#' @param A the cost of not meeting demands
#' @param h the cost of holding excess inventory
#' @param lambda the Poisson distribution parameter for the demand
#'
#' @return the expected cost for a single stage
#' @export
L <- function(j, A, h, lambda){
  
  #Two scenarios either the demand exceeds inventory or the inventory exceeds the demand
  #If demand is greater than supply then we go over all possible values for the demand with the probability of occurrence and calculate the expected cost of not meeting demands. 
  k <- j
  sum.demand.g.supply <- 0
  
  while(dpois(k, lambda)>10e-5){ 
    
    sum.demand.g.supply <- sum.demand.g.supply + (k-j)*dpois(k, lambda)
    k <- k+1
    
  }
  
  sum.demand.g.supply <- sum.demand.g.supply*A

    #If supply is greater than demand then we go over all possible values for the demand with the probability of occurrence (up to our current inventory) and calculate the expected cost of holding excess inventory.  
  sum.demand.l.supply <- 0
  
  for(k in 0:j){
    
    sum.demand.l.supply <- sum.demand.l.supply + (j-k)*dpois(k, lambda)
    
  }
  
  sum.demand.l.supply <- sum.demand.l.supply*h
  #Return the expected cost
  return(sum.demand.g.supply + sum.demand.l.supply)
  
}

$L(j)$ is convex

j_seq <- 2:20
L_j <- c()
L_actual <- c()

for(j in j_seq){
  
  L_j <- c(L_j, L(j+1, 1, 2, 5) - L(j, 1, 2, 5))
  L_actual <- c(L_actual, L(j, 1, 2, 5))
  
}

L.df <- data.frame(j = j_seq, L = L_j)

ggplot(L.df)+
  geom_line(aes(x=j_seq, y=L_j, colour = "Difference"), size=1.5)+
  geom_line(aes(x=j_seq, y=L_actual, colour = "L(j)"), size=1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(x="j", y="Value", title="Inventory problem")+
  scale_color_manual(values = c("Difference" = "#7E60BF", "L(j)" = "#433878"))+
  labs(colour="", tag="A")

Optimal ordering

#' Finds the critical S value (for how much the inventory value should be) that minimizes the cost when K=0
#'
#' @param A the cost of not meeting demands
#' @param h the cost of exceeding demands
#' @param lambda the parameter for the Poisson distribution from which the demand is taken from
#' @param c the cost of ordering additional products
#' @param search.interval the search interval for how much we should increase inventory, should be an interval of integer numbers such as 1:300 
#'
#' @return the critical S value which minimizes the incurred cost for when K=0 (no baseline cost for ordering an item)
#' @export
find.S <- function(A, h, lambda, c, search.interval){
  #We do a brute force search over all interval
  brute.force.vector <- sapply(search.interval, function(j) {
    L(j, A, h, lambda) + c * j
  })
  S <- search.interval[which(brute.force.vector == min(brute.force.vector))]
  
  
  return(S)
}


#' Finds the critical s value (for how much the inventory value should be) that minimizes the cost
#'
#' @param A the cost of not meeting demands
#' @param h the cost of exceeding demands
#' @param lambda the parameter for the Poisson distribution from which the demand is taken from
#' @param c the cost of ordering additional products
#' @param S the critical S that minimizes the incurred cost for when K=0
#' @param K the necessary cost for even ordering (you have to pay $K if you decide to order in addition to the cost for buying the items)
#' @param search.interval the search interval for how much we should increase inventory, should be an interval of integer numbers such as 1:300 
#' @return the critical s value which minimizes the incurred cost
#' @export 
find.s <- function(A, h, lambda, c, S, K, search.interval){
  
  #Check the target value to which we compare the incurred cost with the critical s value
  target_value <- c * S + L(S, A, h, lambda) + K
  
  #We do a brute force search over all interval
  brute.force.vector <- sapply(search.interval, function(s) {
    L(s, A, h, lambda) + c * s
  })
  
  #Find the s that minimizes the Euclidean distance
  distances <- sqrt((brute.force.vector - target_value)^2)
  min_index <- which.min(distances)
  
  s <- search.interval[min_index]
  
  return(s)
}

Simulate the problem

We are assuming that we know the $\lambda$ parameter for the Poisson distribution from which the demand comes from.

#' Return the cost incurred
#'
#' @param initial.stock the initial inventory value we start with 
#' @param A the cost of not meeting demands
#' @param h the cost of 
#' @param lambda 
#' @param stages the number of stages
#' @param alpha the discount factor
#' @param K the baseline and necessary cost for ordering
#' @param c the cost of ordering additional products
#' @param search.interval the search interval for how much we should increase inventory, should be an interval of integer numbers such as 1:300 
#'
#' @return the incurred cost over all stages
#' @export
simulate.problem <- function(initial.stock, A, h, lambda, stages, alpha, K, c, search.interval){
  
  stock <- initial.stock #Set initial inventory value
  
  cost.incurred <- 0 #Initialize incurred cost
  
  #Find out how much we should order
  critical.S <- find.S(A, h, lambda, c, search.interval)
  critical.s <- find.s(A, h, lambda, c, critical.S, K, search.interval)
  
  for(n in 1:stages){ #Go over all stages
    
    #If we have more inventory than how much we should have to satisfy demands, we don't order and incur no costs.
    if(critical.s<=stock){
      
      u <- 0
      ordering.cost <- 0  
      
    }else{ #Otherwise we order and pay some ordering costs. 
      
      u <- critical.s - stock
      
      ordering.cost <- u*c + K
    }
    
    
    w <- rpois(1, lambda) #Demand comes from a Poisson distribution
   
    if(w > (u+stock)){ #If the demand exceeds updated inventory value then we incur some costs
      
      cost.incurred + (w - (u+stock))*A*(alpha)^(n-1) -> cost.incurred 
      stock <- w - (u + stock)
    }
    if(w < (u+stock)){ #If the updated inventory value exceeds the demand then we incur costs related to holding excess inventory
      
      cost.incurred + ((u+stock) - w)*h*(alpha)^(n-1) -> cost.incurred 
      stock <- u+stock - w
    }
    
    ordering.cost*alpha^(n-1) + cost.incurred ->cost.incurred
    
  }
  
  return(cost.incurred)
  
}

Optimality equation

#' The expected incurred cost for the inventory problem
#'
#' @param i the initial inventory value we start with 
#' @param A the cost of not meeting demands
#' @param h the cost of 
#' @param lambda 
#' @param stages the number of stages
#' @param alpha the discount factor
#' @param K the baseline and necessary cost for ordering
#' @param c the cost of ordering additional products
#' @param search.region the search region for how much we should increase inventory, since we start with 0 should be a single number
#'
#' @return the expected incurred cost for running the inventory problem
#' @export
V <- function(i, A, h, lambda, n, alpha, K, c, search.region){
  
  #Base case is when we exhaust all stages then there is no demand
  if(n==0){
    return(0)
  }
  
  #Find out how much we should increase our inventory value (j is the optimal inventory value we want to have)
  sum.hold.j <- c()
  
  for(j in i:(i+search.region)){
    
    sum.hold.j <- c(sum.hold.j, L(j, A, h, lambda))
    
  }
  
  i:(i+search.region) -> search.interval
  
  search.interval[which(sum.hold.j == min(sum.hold.j))]->j
  ordering.cost <- c*(j-i)+K
  
  #If it is optimal to not order anything then we don't incur any ordering costs and the inventory loses the expected amount of demand
  if(j == i){
    i - lambda -> j
    ordering.cost <- 0
  }
  #At each step we sum up the expected cost of the previous stage and the cost incurred in the current stage
  return(V(j, A, h, lambda, n-1, alpha, K, c, search.region)*alpha + min(sum.hold.j) + ordering.cost)

}

Compare simulations and theory

hold.store.sim <- numeric()
hold.store.theory <- numeric()

inventory_values <- seq(0, 100, 10)

hold.store.sim <- sapply(inventory_values, function(i) {
  
  hold.store <- replicate(500, simulate.problem(i, 5, 1, 5, 10, 0.7, 1, 0.3, 1:300))
  mean(hold.store)
})


hold.store.theory <- sapply(inventory_values, function(i) {
  V(i, 5, 1, 5, 10, 0.7, 1, 0.3, 300)
})


result.df <- data.frame(inventory = inventory_values, simulations = hold.store.sim, theory = hold.store.theory)


ggplot(result.df)+
  geom_line(aes(x=inventory, y=theory), color = "#7E60BF", size=2)+
  geom_point(aes(x=inventory, y=simulations), color = "#433878", shape=8, stroke =2.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(x="Initial inventory", y="Cost", title="Inventory problem; 10 stages")

Portfolio problem

Below we consider a simple variation of Merton’s portfolio problem in discrete time and find the optimal solution solely through programming.

Suppose we start with a wealth of $S_0$ and at each stage, $n$, we have the choice to consume $C_n$ units, invest $I_n$ in a risk-free asset with a guaranteed return of $r$, and $J_n$units in a risky asset with a random return taken from a log normal distribution $Z_n$. Our goal is to find out what fraction of our wealth we should allocate to investing in different assets vs consuming to maximize our utility obtained through consumption $(u(c^\beta))$.

Let’s start by defining the optimality function for this problem. One could imagine that if we were to run this problem for a finite horizon for the sake of simplicity, at stage $N$, we no longer need to invest any money and our allocation for $I_{N}$ and $J_{N}$ would be 0. Consequently, all of our money would get allocated to $C_{N}$. Now let us move one step backward. At this stage we have to sum up $C_{N-1}$ and $\alpha(E[V(S_{N})])$. We know that $S_{N}$ is equal to the expected value of $rI_{N}+ZJ_{N}$. That means the optimality equation is equal to $V(S_n) = max_{I_n, J_n, C_n}\[u(S_{n}\ + \alpha V(S_{n+1})]$. To account for $Z_n$, we integrate over the distribution of $Z$.

In the example below, we consider the risky asset to have its return taken from a log normal distribution and use a programming approach to find optimal fractions.

Bellman equation

#' The utility function
#'
#' @param c units of consumption
#' @param beta the beta parameter for calculating utility
#'
#' @return the utility 
#' @export
u <- function(c, beta){
  return(c^beta)
}



#' Bellman equation function
#'
#' @param S the wealth
#' @param N the number of stages
#' @param lambda.c units consumed
#' @param lambda.i units invested in riskless asset
#' @param beta the function parameter for utility function
#' @param r rate of growth for riskless investment
#' @param mean.z the mean of the log normal distribution related to the risky asset
#' @param sd.z standard deviation of the log normal distribution to the risky asset
#' @param alpha the discount factor
#'
#' @return the expected reward
#' @export
V <- function(S, N, lambda.c, lambda.i, beta, r, mean.z = 0, sd.z = 1, alpha=0.95, integration.limit=15){
  
  return(V_helper(1, S, N, lambda.c, lambda.i, beta, r, mean.z, sd.z, alpha, integration.limit))
  
}


#' Helper function for the optimality equation
#'
#' @param n the current stage
#' @param S the wealth
#' @param N the number of stages
#' @param lambda.c units consumed
#' @param lambda.i units invested in riskless asset
#' @param beta the function parameter for utility function
#' @param r rate of growth for riskless investment
#' @param mean.z the mean of the log normal distribution related to the risky asset
#' @param sd.z standard deviation of the log normal distribution to the risky asset
#' @param alpha the discount factor
#'
#' @return the expected reward
#' @export
V_helper <- function(n, S, N, lambda.c, lambda.i, beta, r, mean.z, sd.z, alpha, integration.limit = 15){
  
  if(lambda.c + lambda.i > 1){ # Return Inf if the fractions exceed 1
    return(Inf)
  }
  
  #boundary condition
  if(n == N){
    return(u(S, beta))
  }
  
  lambda.j <- 1 - (lambda.c + lambda.i) 
  
  #integrate over Z distribution
  integrand <- function(z){
    S_next <- (1 + r) * lambda.i * S + z * lambda.j * S
    V_next <- V_helper(n + 1, S_next, N, lambda.c, lambda.i, beta, r, mean.z, sd.z, alpha)
    return(V_next * dlnorm(z, meanlog = mean.z, sdlog = sd.z))
  }
  
  expected_value <- integrate(integrand, lower = 0, upper = integration.limit)$value
  

  return(u(lambda.c * S, beta) + alpha * expected_value)
}

Find optimal fractions for investing and consuming

#' Objective function to minimize
#'
#' @param par parameter set consisting of lambda_c and lambda_i
#' @param S0 the initial wealth
#' @param N the number of stages
#' @param beta the function parameter for utility function
#' @param r rate of growth for riskless investment
#' @param mean.z the mean of the log normal distribution related to the risky asset
#' @param sd.z standard deviation of the log normal distribution to the risky asset
#' @param alpha the discount factor
#'
#' @return -1 times the value of the Bellman equation
#' @export
objective.function <- function(par, S0, N, beta, r, mean.z = 0, sd.z = 1, alpha = 0.95) {
  
  lambda.c <- par[1]
  lambda.i <- par[2]
  
  
  lambda.j <- 1 - lambda.c - lambda.i
  
  if (lambda.j < 0 || lambda.c < 0 || lambda.i < 0) {
    return(Inf)  
  }
  

  value <- V(S0, N, lambda.c, lambda.i, beta, r, mean.z, sd.z, alpha)
  
  #we try to minimize -V
  return(-value)
}

library(stats)


#Define parameters
initial_guess <- c(0.3, 0.3)
S0 <- 10000
N <- 5
beta <- 0.09
r <- 0.3
mean.z <- 0
sd.z <- 1
alpha <- 0.95

#Find optimal fractions

result <- constrOptim(
  theta = initial_guess,
  f = objective.function,
  grad = NULL,
  ui = rbind(    #define constraints
    c(1, 0),     #lambda.c >= 0
    c(0, 1),     #lambda.i >= 0
    c(-1, -1)    #lambda.c + lambda.i <= 1
  ),
  ci = c(0, 0, -1),
  S0 = S0,
  N = N,
  beta = beta,
  r = r,
  mean.z = mean.z,
  sd.z = sd.z,
  alpha = alpha
)


optimal.lambda.c <- result$par[1]
optimal.lambda.i <- result$par[2]
optimal.lambda.j <- 1 - optimal.lambda.c - optimal.lambda.i
optimal.V <- -result$value

Simulate problem

#' Simulate the problem
#'
#' @param S0 the initial wealth
#' @param N the number of stages
#' @param lambda.c units consumed
#' @param lambda.i units invested in riskless asset
#' @param beta the function parameter for utility function
#' @param r rate of growth for riskless investment
#' @param mean.z the mean of the log normal distribution related to the risky asset
#' @param sd.z standard deviation of the log normal distribution to the risky asset
#' @param alpha the discount factor
#'
#' @return the expected utility from simulations
#' @export
simulate.problem.different.proportions <- function(S0, N, lambda.c, lambda.i, beta, r, mean.z = 0, sd.z = 1, alpha=0.95){
  
  S <- S0
  sum.utility <- 0
  lambda.j <- 1 - (lambda.c+lambda.i)
  for(n in 1:(N-1)){
    
    S*c(lambda.c, lambda.i, lambda.j) -> updated.allocations
    Z <- rlnorm(1, meanlog = mean.z, sdlog = sd.z) #Take Z from a log normal distribution
    sum(updated.allocations[2:3]*c(r+1, Z))->S  
    sum.utility<-sum.utility + u(updated.allocations[1], beta)*alpha^(n-1)
    
  }
  
  sum.utility <- sum.utility + u(S, beta)*alpha^(N-1)
  
  return(sum.utility)
}

Visualize results

result.df <- data.frame(lambda.i = numeric(), lambda.c = numeric(), 
                        hold.theory = numeric(), hold.sims = numeric())



for(lambda.i in c(optimal.lambda.i-0.3, optimal.lambda.i-0.2, optimal.lambda.i)){
  
  for(lambda.c in c(optimal.lambda.c-0.2, optimal.lambda.c-0.15, optimal.lambda.c-0.1, optimal.lambda.c-0.05, optimal.lambda.c, optimal.lambda.c+0.1)){
    
    
    #theoretical value
    hold.theory <- V(S0, N, lambda.c, lambda.i, beta, r)

    
    #simulations
    simulations <- replicate(1000, simulate.problem.different.proportions(S0, N, lambda.c, lambda.i, beta, r))

    hold.sims <- mean(simulations)
    result.df <- rbind(result.df, data.frame(lambda.i = lambda.i, lambda.c = lambda.c, 
                                             hold.theory = hold.theory, hold.sims = hold.sims))
  }
}




ggplot(result.df, aes(x = lambda.c, color = factor(lambda.i))) +
  geom_point(aes(y = hold.sims), stroke = 2, shape=8) +
  geom_line(aes(y = hold.theory, group = lambda.i), size = 1) +
  labs(title = "Portfolio problem rewards",
       x = expression(lambda[c] ~ "(Consumption Fraction)"),
       y = "Value",
       color = expression(lambda[i] ~ "(Safe Investment Fraction)")) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A")+
  ylim(c(8, 11))+geom_hline(yintercept=optimal.V)+
  geom_vline(xintercept=optimal.lambda.c)+
  scale_color_manual(values = c("0.249538121895296" = "#D96098", "0.349538121895296" = "#BEAEE2", "0.549538121895296" = "navy"))

Negative Dynamic Programming

Non-stable stopping

Assume a stopping problem where a decision maker can either continue with no cost $C(i)=0$ or stop and receive a reward equivalent to $R(i)=i$. At each stage that the decision maker decides to continue, there is a 50% chance that his wealth will change to $i+1$ and a 50% chance that his wealth changes to $i-1$. If we were to define the optimality equation in such a way that the optimal policy minimizes the cost then we should set the boundary condition to $V_0(i) = -R(i)$. However, we know that negative dynamic programming has two requirements that need to be met:

$\inf_i C(i) > 0$,
$\sup_i R(i) < \infty$

Neither one of the conditions are met in this problem. Below, we define the optimality equation and simulate the problem. We see that the optimality equation consistent with simulations return negative the initial wealth value.

R <- function(i) return(i)

#' Optimality equation
#'
#' @param n the stage
#' @param i the wealth
#'
#' @return the expected cost
#' @export
V <- function(n, i){
  
  if(n==0) return (-R(i))
  
  
  return(1/2*V(n-1, i-1)+1/2*V(n-1, i+1))
  
}

#' Simulation function for the non-stable stopping problem
#'
#' @param n the stage
#' @param i the wealth
#'
#' @return the simulated cost
#' @export
simulate.non.stable.problem <- function(n, i){
  
  forward.prob <- rbinom(1, 1, 0.5)
  
  for(j in 1:n){
    
    if(forward.prob==1){
      
      i <- i + 1
      
    }else{
      
      i <- i - 1
      
    }
    
  }
  return(-i)
  
}

results.df <- data.frame(i = integer(), Simulation = numeric(), Theory = numeric())

i.seq <- -5:15
N <- 20

for(i in i.seq){
  
  sims <- mean(replicate(5000, simulate.non.stable.problem(N, i)))
  theory <- V(N, i)
  results.df <- rbind(results.df, data.frame(i = i, Simulation = sims, Theory = theory))
  
}

ggplot(results.df)+geom_point(aes(x=i, y=Simulation), color = "#D96098", stroke = 2, shape = 8)+
  geom_line(aes(x=i, y=Theory), color = "#D96098", size = 1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A", y="Value", x="i", title="Non-stable stopping; N=20")

Triple or nothing

Suppose you have an initial wealth of $i$ and you are given the option to play a game for as many times as you want with no cost. Each time you play the game, you have a 50% chance of winning and tripling your wealth or losing everything you have. Mathematically if you were to play this game for only 1 stage and your initial wealth is 1000 dollars then your expected reward would be 1500 dollars. However, intuitively you would most likely avoid playing. Similar to the previous problem, neither of the conditions for negative dynamic programming are met for this problem. Below, we simulate this problem and show that consistent with the optimality equation the more we play the game the lower would be our cost. Then we show that the probability of losing all of our money from playing this game repeatedly increases as the number of stages increases.

#' Bellman equation for the triple or nothing problem
#'
#' @param n the number of stages
#' @param i the initial wealth 
#'
#' @return expected cost
#' @export
V <- function(n, i){
  
  if(n==0)return(-i)
  
  return(1/2*V(n-1, 3*i)+1/2*V(n-1, 0))
}



#' Simulating the triple or nothing problem
#'
#' @param n the number of stages
#' @param i the initial wealth 
#'
#' @return simulated cost
#' @export
triple.or.nothing.simulation <- function(n, i){
  for(j in 1:n){
    if(rbinom(1, 1, 0.5)==1){
      i <- i*3
    }else{
      #return(0)
      i<-0
    }
  }
  return(-i)
}


results.df <- data.frame(i = integer(), Simulation = numeric(), Theory = numeric())

i <- 10
N.seq <- seq(1, 10, 1)
num.simulations <- 50000

for(N in N.seq){
  sims <- mean(replicate(num.simulations, triple.or.nothing.simulation(N, i)))
  theory <- V(N, i)
  results.df <- rbind(results.df, data.frame(N = N, Simulation = sims, Theory = theory))
}


ggplot(results.df)+geom_point(aes(x=N, y=Simulation), color = "#D96098", stroke = 2, shape = 8)+
  geom_line(aes(x=N, y=Theory), color = "#D96098", size = 1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+labs(tag="A", y="Value", x="N", title="Triple or nothing; i=10")

Looking at this plot it is tempting to think we should play this game more and more to minimize our expected cost. But let’s try to find out the probability of losing all of our money from playing this game.

loss.sim <- c()
num.simulations <- 1000

for(stage in seq(1, 25)){
  loss.sim <- c(loss.sim, sum(replicate(num.simulations, triple.or.nothing.simulation(stage, 5)) == 0)/num.simulations)
}


#' Probability of losing everything from playing the triple bet or nothing problem
#'
#' @param N the number of stages
#'
#' @return the theoretical probability of losing all of our money
#' @export
loss.theory <- function(N){
  
  sum.hold <- 0
  
  for(n in 1:N){
    sum.hold <- sum.hold + 1/2^n
  }
  return(sum.hold)
}

unlist(lapply(seq(1, 25, 1), loss.theory))->loss.theory

loss.df <- data.frame(i=seq(1, 25, 1), sim=loss.sim, theory=loss.theory)

ggplot(loss.df)+geom_point(aes(x=i, y=sim), color = "#D96098", stroke = 2, shape = 8)+
  geom_line(aes(x=i, y=theory), color = "#D96098", size = 1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18))+labs(tag="B", title="Probability of total loss", x="N", y="Probability")

We can see that the sooner we stop playing this game, the better it is because our chances of losing all of our money is less.

Optimal stopping (one stage look ahead)

Asset problem

Suppose that offers come in daily for an asset we own. Each offer, independent of others, will be $j$ with probability $P_j$, $j \geq 0$. (An offer of zero can be interpreted to mean that no offer arrived that day.) Once the offer is made it must be accepted or rejected, and for each day the item remains unsold we incur a maintenance cost $C$, $C > 0$. What policy maximizes our expected return, where by return we mean the accepted value minus the total maintenance cost incurred?

With the state at any time being the offer made at that time, it is intuitive that the optimal policy should be of the following form: accept any offer whose value is at least $i^*$. However, such a set of states is not closed (because offers can decline), thus this problem will not satisfy the conditions of Theorem 2.2. (Check this last statement directly.)

Let us change the model somewhat by supposing that we are allowed to recall any past offer. In this case, the state at any time will be the maximum offer received by that time. Hence, the transition probabilities $P_{ij}$ are given by \[ P_{ij} = \begin{cases} 0 & \text{if } j < i, \\ \sum_{k=0}^{i} P_k & \text{if } j = i, \\ P_j & \text{if } j > i. \end{cases} \]

The stopping set for the one-stage look-ahead policy is \[ B = \left\{ i : i \geq i \sum_{j=0}^{i} P_j + \sum_{j=i+1}^{\infty} j P_j - C \right\} \] which simplifies to \[ B = \left\{ i : C \geq \sum_{j=i+1}^{\infty} (j - i) P_j \right\} = \left\{ i : C \geq E[(X - i)^+] \right\}, \] where $X$ is a random variable representing the offer in a given day, that is, $P(X = j) = P_j$, and $(X - i)^+ = \max(X - i, 0)$. Because $(X - i)^+$ decreases in $i$, we see that in as much as the state cannot decrease (the maximum offer ever received cannot decrease in time), $B$ is a closed set. Hence, assuming stability (which can be shown to hold when $E(X^2) < \infty$), we see that the optimal policy accepts the first offer that is at least $i^*$, where \[ i^* = \min \left\{ i : C \geq \sum_{j=i+1}^{\infty} (j - i) P_j \right\}. \]

Also, because the optimal policy never recalls a past offer, it is also a legitimate policy for the original problem in which such recall is not allowed. Hence, it must be optimal for that problem also. (This follows because it is clear that the maximal return when no recall is allowed cannot be larger than when recall is allowed.)

#' Simulate the asset problem
#'
#' @param offers sequence of offers
#' @param lambda the Poisson parameter for offer arrivals
#' @param C the cost of holding onto asset
#'
#' @return a list that contains: 1. the index of the best offer 2. the reward found through the optimal strategy
#' @export
simulate.problem <- function(offers, lambda, C){
  
  
  
  sum.hold <- 0
  
  for(offer.idx in seq_along(offers)){
    
    max(offers[1:offer.idx])->i

    for(j in seq(i+1, qpois(0.99999999, lambda), 1)){
      
      sum.hold <- (j-i)*dpois(j, lambda)
      
    }

    if(sum.hold <= C){

      return(list(offer.idx, offers[offer.idx] - (offer.idx-1)*C))
      
    }
    
  }
  
  return(list(length(offers), offers[length(offers)] - (length(offers)-1)*C))
  
}

Simulate the problem, the dashed line indicates how often the difference between the reward found through simulation and the best possible reward falls within one standard deviation from the Poisson distribution.

C <- 10
lambda <- 1000
n <- 1000
num.simulations <- 10000
sim.actual.diff <- c()
sim.reward <- c()
best.reward <- c()


for(sim in 1:num.simulations){
  
  offers <- rpois(n, lambda)
  sim.reward <- c(sim.reward, simulate.problem(offers, lambda, C)[[2]])
  best.reward <- c(best.reward, max(offers - seq(0, length(offers) - 1)*C))
  sim.actual.diff <- c(sim.actual.diff, max(offers - seq(0, length(offers) - 1)*C) - simulate.problem(offers, lambda, C)[[2]])
  
}

sim.rewards <- data.frame(sim.reward, best.reward, sim.actual.diff)

ggplot(sim.rewards) +
  geom_density(aes(x=sim.actual.diff, fill="Difference"), color="black", alpha=0.8)+
  geom_density(aes(x=sim.reward, fill="Optimal strategy"), color="black", alpha=0.8)+
  geom_density(aes(x=best.reward, fill="Best reward"), color="black", alpha=0.8)+
  geom_vline(xintercept=sqrt(lambda), linetype='dashed', size=1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(title = "Best vs optimal strategy reward", x="Reward", y="Density", tag="A", fill="Reward")+
  scale_fill_manual(values=c("Difference" = "#B8001F", "Optimal strategy" = "#257180", "Best reward" = "#001F3F"))

Asset problem with different distributions

Suppose that we are not aware what distribution the offers come from but we know that they may come from $N$ distributions with probabilities $p_1, p_2, p_3, ..., p_N$. In the example below, we assume that the distributions are exponential and we simulate the problem under the one-stage look ahead policy and compare it to the best possible reward we can gain from the offers.

Updating the posteriors

Let us first see if we can figure out which distribution the observations are coming from using Bayesian inference.

The posterior probability would get updated through:

$P(\lambda_i|X_1=x) = \frac{P(X_1 = x \mid \lambda_i) \cdot P(\lambda_i)}{P(X_1 = x) = \sum_{i=1}^{N} P(X_1 = x \mid \lambda_i) \cdot p_i}$

#' Update the posteriors 
#'
#' @param x the observation of offer
#' @param lambdas the lambdas for the exponential distributions from which the observations come from
#' @param priors the priors for each distribution
#'
#' @return the updated posterior distribution
#' @export
update.posterior <- function(x, lambdas, priors){
  
  updated.posteriors <- c()
  
  for(idx in 1:length(lambdas)){
    
    lambda <- lambdas[idx]
    prior <- priors[idx]
    
    sum.hold <- 0
    
    for(j in 1:length(lambdas)){
      sum.hold <- sum.hold + dexp(x, lambdas[j])*priors[j]
    }
    
    updated.posteriors <- c(updated.posteriors, dexp(x, lambda)*prior/sum.hold)
  }
  
  return(updated.posteriors)
}


#Let's first confirm that updating the posterior distribution works. 

Mu.vector <- c(6, 4, 8, 10, 3)
lambdas <- 1/Mu.vector
priors <- c(0.2, 0.65, 0.05, 0.1, 0.1)

chosen.lambda <- sample(lambdas, size = 1, prob = priors)

rexp(100, chosen.lambda)->random.nums

for(i in 1:100){
  priors <- update.posterior(random.nums[i], lambdas, priors)
}

Now that we have confirmed that our code is correct, let’s move on to the optimal stopping problem with the one-step look ahead strategy. We would stop if the cost of holding on to the asset exceeds the cost of the expected value of the difference between the next offer and the current offer.

#' Simulate the asset problem with different distributions
#'
#' @param N the number of stages
#' @param lambdas the parameters for the exponential distributions from which the observations come from
#' @param priors the prior probability of the observations coming from each distribution 
#' @param C the daily cost of holding on to the asset
#'
#' @return the simulated reward
#' @export
simulate.problem <- function(N, lambdas, priors, C){
  
  chosen.lambda <- sample(lambdas, size = 1, prob = priors) #randomly draw the distribution
  observations <- rexp(N, chosen.lambda) #draw the observation from the randomly chosen distribution
  for(n in 1:N){
    
    X <- observations[n]
    priors <- update.posterior(X, lambdas, priors)

    if(C >= (sum(1/lambdas*priors) - X)){ #stopping condition
      return(list(X - (n-1)*C, observations, n, chosen.lambda))
    }
    
  }
  
  return(X - (N-1)*C, observations, N, chosen.lambda) #if didn't stop until the end
}

#for the sake of comparison, let's compare the results to the case where we look at the first 37% of offers and select the first one that exceeds them.



#' Policy ii simulations, under this policy we first reject the first 37% of offers and select the first one that exceeds the first 37%. 
#'
#' @param offers the list of offers
#' @param C the daily cost of holding onto the asset
#'
#' @return the simulated reward
#' @export
policy.ii <- function(offers, C){
  
  threshold.value <-max(offers[1:(0.37*length(offers))])
  remaining.offers <- offers[(0.37*length(offers) + 1):length(offers)]
  first.better.offer <- remaining.offers[which(remaining.offers > threshold.value)][1]
  
  if(is.na(first.better.offer)){
    return(offers[length(offers)] - C*length(offers))
  }
  return(first.better.offer - which(offers==first.better.offer)*C)
  
}


lambdas <- c(600, 300, 800, 1000, 250)
lambdas <- 1/lambdas
priors <- c(0.2, 0.65, 0.05, 0.1, 0.1)


sim.lst <- simulate.problem(100, lambdas, priors, 0.05)


c() -> best.reward
c() -> optimal.policy.reward
c() -> sim.actual.diff
C <- 4
  
for(i in 1:10000){
  
  sim.lst <- simulate.problem(100, lambdas, priors, C)
  offers <- sim.lst[[2]]
  best.reward <- c(best.reward, max(offers - seq(0, length(offers) - 1)*C))
  sim.reward <- c(optimal.policy.reward, sim.lst[[1]])
  sim.actual.diff <- c(sim.actual.diff, policy.ii(offers, C))
  
}



sim.rewards <- data.frame(sim.reward, best.reward, sim.actual.diff)

ggplot(sim.rewards) +
  geom_density(aes(x=sim.actual.diff, fill="Secretary strategy"), color="black", alpha=0.8)+
  geom_density(aes(x=sim.reward, fill="One-stage look ahead"), color="black", alpha=0.8)+
  geom_density(aes(x=best.reward, fill="Best reward"), color="black", alpha=0.8)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(title = "Asset problem/different distributions", x="Reward", y="Density", tag="A", fill="Reward")+
  scale_fill_manual(values=c("Secretary strategy" = "#B8001F", "One-stage look ahead" = "#257180", "Best reward" = "#001F3F"))+
  xlim(c(-1000, 5000))

Burglar problem

On each attempt, a burglar is successful with probability $p$. If successful, his loot will be $j$ with probability $P_j$, where $P_j \geq 0$ and $\sum_j P_j = 1$. If unsuccessful, he loses everything (goes to jail) and the problem ends. If the burglar is allowed to retire at any time and keep all the loot he has accumulated up to that time, what is his optimal policy?

This is a stopping problem in which the state at any time is the burglar’s total loot at that time. The optimality equation is \[ V(i) = \max \left( i, p \sum_{j=0}^{\infty} V(i+j) P_j \right), \quad i > 0. \] The one-stage look-ahead policy is to stop when in $B$, \[ B = \left\{ i : i \geq p \sum_j (i+j) P_j \right\}, \] which simplifies to \[ B = \left\{ i : i \geq \frac{p}{1 - p} E(L) \right\}, \] where $E(L) = \sum_j j P_j$ is the mean loot of a successful burglary, assumed to be finite.

Because the state cannot decrease (let the state be infinite if the burglar is caught), it follows that $B$ is closed, and thus (assuming stability) the one-stage look-ahead policy is optimal.

Comparing theory and simulations for optimal policy

#' Expected accumulated wealth function
#'
#' @param N the number of timepoints for theft
#' @param n the current stage
#' @param i the initial wealth
#' @param p the probability of successful theft at each stage
#' @param lambda the Poisson distribution parameter for the 
#'
#' @return the expected accumulated wealth
#' @export
V <- function(N, n, i, p, lambda){
  
  if(n == N){
    return(i)
  }
  
  
  lower <- qpois(0.0001, lambda) 
  upper <- qpois(0.9999, lambda) 
  
  sum.hold <- 0
  for(j in seq(lower, upper, 1)){
    
    sum.hold <- sum.hold + V(N, n+1, i+j, p, lambda)*dpois(j, lambda)
    
  }
  
  sum.hold * p -> sum.hold
  return(max(i, sum.hold))
  
}




#' Find the critical wealth to stop looting
#'
#' @param p the probability of succeeding at theft
#' @param i the initial wealth
#' @param lambda the Poisson distribution for the value of asset we are stealing
#'
#' @return the critical wealth value
#' @export
find.critical.loot <- function(p, i, lambda){
  
  lower <- qpois(0.0001, lambda) 
  upper <- qpois(0.9999, lambda) 
  
  sum.hold <- 0
  for(j in seq(lower, upper, 1)){
    
    sum.hold <- sum.hold + (i+j)*dpois(j, lambda)
    
  }
  
  sum.hold * p -> sum.hold
  return(sum.hold)
  
}




#' Simulate the burglar problem
#'
#' @param N the number of timepoints for theft
#' @param i the initial wealth
#' @param p the probability of successful theft at each stage
#' @param lambda the Poisson distribution parameter for the 
#' @param tweaking.policy added this for the sake of comparison to optimal strategy, by default the variable is set to 0. If changed it will add the value stored in this variable to the critical wealth and no longer return the optimal simulated wealth.
#' 
#' @return the simulated accumulated wealth
#' @export
simulate.problem <- function(N, i, p, lambda, tweaking.policy = 0){
  
  for(n in 1:N){
    find.critical.loot(p, i, lambda)->critical.loot
    
    if(critical.loot>=(i+tweaking.policy)){
      random.loot <- rpois(1, lambda)
      successful.loot <- rbinom(1, 1, p)
      if(successful.loot){
        i <- random.loot + i
      }else{
        return(0)
      }
      
    }
    
  }
  
  return(i)
  
}



N = 6; initial.wealth = 3; lambda = 5

hold.theory <- c()
hold.simulations <- c()
hold.simulations.pi.1 <- c()
hold.simulations.pi.2 <- c()
for(p in seq(0, 1, 0.1)){
  
  hold.theory <- c(hold.theory, V(N, 1, initial.wealth, p, lambda))
  hold.simulations <- c(hold.simulations, mean(replicate(1000, simulate.problem(N,initial.wealth, p, lambda))))
  hold.simulations.pi.1 <- c(hold.simulations.pi.1, mean(replicate(1000, simulate.problem(N,initial.wealth, p, lambda, tweaking.policy = 3))))
  hold.simulations.pi.2 <- c(hold.simulations.pi.2, mean(replicate(1000, simulate.problem(N,initial.wealth, p, lambda, tweaking.policy = -3))))
}


ggplot()+
  geom_point(aes(x=seq(0, 1, 0.1), y=hold.simulations, colour="Optimal policy sims"), shape=8, stroke=1.5)+
  geom_line(aes(x=seq(0, 1, 0.1), y=hold.theory, colour="Optimal policy theory"), size=1)+
  geom_point(aes(x=seq(0, 1, 0.1), y=hold.simulations.pi.1, colour="Policy I sims"), shape=8, stroke=1)+
  geom_point(aes(x=seq(0, 1, 0.1), y=hold.simulations.pi.2, colour="Policy II sims"), shape=8, stroke=1)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+scale_color_manual(values=c("Optimal policy sims" = "#243642", "Optimal policy theory" = "#387478", 
                                "Policy I sims" = "#7E60BF", "Policy II sims" = "#E4B1F0"))+
  labs(tag = "A", title ="Burglar problem", x="Successful theft probability (p)", y="Accumulated wealth", colour="Policy")

Burglar problem with multiple asset types

The following variation is taken from:

Chen, K., & Ross, S. M. (2014). The burglar problem with multiple options. Naval Research Logistics (NRL), 61(5), 359-364.

Assume that the burglar faces multiple assets at each timepoint with a certain probability of a successful theft and rewards coming from an exponential distribution. The question is how do we change the Bellman equation in the previous problem to understand which asset type to burgle at each period in order to maximize the expected return.

#' Bellman equation function for the multi-option burglar problem
#'
#' @param n Number of timepoints for theft
#' @param r the current wealth of the burglar
#' @param q the vector containing the probability of successful theft for assets
#' @param lambda the vector containing the rate parameters for the exponential distribution from which the 
#' reward comes from
#'
#' @return
#' @export
V <- function(n, r, q, lambda){
  
  if(n == 0){
    return(r)
  }
  
  hold.max <- -Inf
  track.i <- 0
  for(i in 1:length(lambda)){
    
    integrand <- function(t){
      return(lambda[i]*exp(-lambda[i]*t)*V(n-1, r+t, q, lambda)[[1]])
    }
    
    integrated.res <- q[i]*integrate(integrand, lower = 0, upper = Inf)$value
    
    if(integrated.res > hold.max){
      hold.max <- integrated.res
      track.i <- i
    }
    
  }
  
  return(list(max(r, hold.max), track.i))
}

Below, we show the expected return from the optimal policy for three periods only.

q <- c(0.5, 0.6, 0.75)
lambda <- c(1/10, 1/9, 1/4)

accumulated.wealth <- c()
tracking.i <- c()

for(r in seq(0, 100, 1)){
  
  accumulated.wealth <- c(accumulated.wealth, V(3, r, q, lambda)[[1]])
  tracking.i <- c(tracking.i, V(3, r, q, lambda)[[2]])
}

types <- length(q)

for(i in 1:types){
  
  name.first <- paste0("first.", i)
  name.last <- paste0("last.", i)
  assign(name.first, which(tracking.i == i)[1])
  assign(name.last, which(tracking.i == i)[length(which(tracking.i == i))])
  
}


ggplot()+
  #geom_tile(aes(x = seq(0, 100, 1), y = 10, fill = factor(tracking.i)), height = 200)+
  geom_rect(aes(xmin = first.1-1, xmax = last.1, ymin=0, ymax = 100, fill='1'))+
  geom_rect(aes(xmin = first.2-1, xmax = last.2, ymin=0, ymax = 100, fill='2'))+
  geom_rect(aes(xmin = first.3-1, xmax = last.3, ymin=0, ymax = 100, fill='3'))+
  geom_line(aes(x=seq(0, 100, 1), y=accumulated.wealth), size=1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  scale_fill_manual(values=c('1' = '#BEAEE2', '2' = '#629584', '3' = '#CDF0EA'))+
  labs(tag = "A", title = "N = 3; multi-option burglar problem", y='Expected return', x='Wealth (r)', fill='Asset type')

Burglar problem with varying probability of success

Consider the classic burglar problem with the change that his success probability changes over time. Specifically, suppose that on his $i$th burglary he will be successful with probability $P_i$, $i \geq 1$.

[(a)] Show that the structure of the optimal policy is as follows:

If the burglar has been successful on his first $(i - 1)$ burglaries, $i \geq 1$, then the next one should be attempted if his total loot is less than $a_i$, for suitably chosen constants $a_i$, $i \geq 1$.

[(b)] Give the one-stage look-ahead policy and determine conditions on the $P_i$, $i \geq 1$, that ensure its optimality. Under these conditions show that $a_i$, $i \geq 1$, is a monotone sequence.

For the example below, we assume that the asset we are burgling comes from an exponential distribution with mean $\frac{1}{\lambda}$ and the probability of success at each stage comes from a truncated normal distribution.

#' The expected threshold for deciding to loot at each stage
#'
#' @param r the wealth of the burglar
#' @param lambda the parameter for the exponential distribution from which the asset comes from
#'
#' @return the expected critical loot value
#' @export
expected.next.loot <- function(r, lambda, mean = 0.7, sd = 0.1){
  

  integrand <- function(t){  
    
    return(lambda*exp(-lambda*t) * (r+t))  
    
  }  
    
  integrated.res <- integrate(integrand, lower = 0, upper = Inf)$value  
    
  return(integrated.res*etruncnorm(a = 0, b = 1, mean = mean, sd = sd))
}


#set parameters
lambda <- 1 / 40
r.vals <- seq(0, 100, by = 10)


expected.loot.thresholds <- sapply(r.vals, function(r) expected.next.loot(r, lambda))

result.df <- data.frame(r = r.vals, LootThreshold = expected.loot.thresholds)

ggplot(result.df)+geom_line(aes(x=r.vals, y=LootThreshold), size=1.5, color ="#001F3F")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(tag="A", x="r", y="Loot threshold", title=TeX("$a_i$ is a monotone sequence"))

Simulate the problem

#' Simulating the burglary problem with varying probability of success
#'
#' @param N the number of stages
#' @param r the burglar's wealth
#' @param lambda the parameter for the exponential distribution from which the asset comes from
#' @param tweaking.policy if you want to change the policy from optimal to a non-optimal policy simulation then change this parameter
#' @param mean the mean for the truncated normal distribution from which the probability of successful burgling comes from
#' @param sd the standard deviation for the truncated normal distribution from which the probability of successful burgling comes from
#'
#' @return the simulated wealth
#' @export
simulate.problem <- function(N, r, lambda, tweaking.policy = 0,mean=0.7, sd=0.2){
  

  for(n in 1:N){ #go over all stages
    
    expected.next.loot(r, lambda)->critical.loot
    
    if(critical.loot>=(r+tweaking.policy)){
      
      random.loot <- rexp(1, lambda)
      
      p <- rtruncnorm(1, a = 0, b = 1, mean = mean, sd = sd)
      successful.loot <- rbinom(1, 1, p)
      
      if(successful.loot){
        
        r <- random.loot + r
        
      }else{ #if caught
        
        return(0)
        
      }
      
    }
    
  }
  
  return(r)
  
}


#To run this fast, we simply take the expected values of the distributions

#' Title
#'
#' @param N the number of stages
#' @param r the burglar's wealth
#' @param lambda the parameter for the exponential distribution from which the asset comes from
#' @param mean the mean for the truncated normal distribution from which the probability of successful burgling comes from
#' @param sd the standard deviation for the truncated normal distribution from which the probability of successful burgling comes from
#' @return the expected accumulated wealth
#' @export
V <- function(N,  r, lambda, mean=0.7, sd=0.2){  
  
  if(N == 0){  
    return(r)  
  }  
  
  
  final.integration.res <- etruncnorm(a = 0, b = 1, mean = 0.7, sd = 0.2)*V(N-1, r + 1/lambda, lambda)
  
  return(max(final.integration.res, r))  
}




varying.success.prob <- c()
simulation.probs <- c()

for(r in seq(0, 100, 10)){
  
  varying.success.prob <- c(varying.success.prob, V(50, r, 1/40))
  simulation.probs <- c(simulation.probs, mean(replicate(1000, simulate.problem(50, r, 1/40))))
  
}

ggplot()+
  geom_line(aes(x=seq(0, 100, 10), y=varying.success.prob), color="#3A6D8C", size=1.5)+
  geom_point(aes(x=seq(0, 100, 10), y=simulation.probs), color="#001F3F", stroke=1.25, shape=8)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+labs(tag="B", title="Burglary with varying success probability\n N=50", x="r", y="Expected theft")

Fishing problem

In a certain fishing location there are $N$ types of fish; and each time a fish is caught, then, independent of the types of those previously caught, it will be of type $i$ with probability $P_i, \; i = 1, \ldots, N$. At each stage the decision maker can either pay a cost $C$ and fish during that time period or he can quit. If he fishes, he will catch a fish with probability $\alpha$; and if he quits, he will receive a terminal reward $R(n)$ if he has caught $n$ distinct types of fish. Let us determine the optimal policy.

The state at any time is the set $S$ of distinct types of fish that have been caught. The one-stage look-ahead policy would stop at $S$ if $S \in B$, where

\[ B = \{S : R(|S|) \geq \alpha \bar{P}(S) R(|S| + 1) + \left[ 1 - \alpha \bar{P}(S) \right] R(|S|) - C \}, \]

where

\[ \bar{P}(S) = \sum_{i \notin S} P_i, \]

and $|S|$ denotes the number of elements in $S$. We can rewrite $B$ as

\[ B = \{S : R(|S| + 1) - R(|S|) \leq \frac{C}{\alpha \bar{P}(S)} \}. \]

If we now suppose that $R(n)$ is a concave function, that is, $R(n+1) - R(n)$ decreases in $n$, then because $S$ can only become larger [and thus $R(|S| + 1) - R(|S|)$ can only decrease, whereas $\frac{C}{\alpha \bar{P}(S)}$ can only increase] it follows that $B$ is closed. Hence, the one-stage policy is optimal when $R$ is concave.

#' P bar
#'
#' @param unique.types the unique types of fish caught so far 
#' @param lambda the lambda parameter for the Poisson distribution from which the fish type is taken from
#'
#' @return P bar
#' @export
P_bar <- function(unique.types, lambda){
  
  lower <- qpois(0.000000001, lambda)
  upper <- qpois(0.999999999, lambda)
  return(sum(dpois(setdiff(seq(lower, upper, 1), unique.types), lambda)))

}



#' Reward function - defined as concave
#'
#' @param x the number of unique fish caught
#'
#' @return the reward
#' @export
R <- function(x){
  
  return(100*sqrt(x))
}

Let’s start with the simulations first.

#' Simulate the fishing problem
#'
#' @param N the number of stages
#' @param alpha the probability of successfully catching a fish
#' @param lambda the parameter for the Poisson distribution from which the type of the fish comes from
#' @param C the cost of fishing at each stage
#'
#' @return the simulated return
#' @export
simulate.fishing.problem <- function(N, alpha, lambda, C){
  
  cost <- 0 
  stopped <- FALSE
  types <- c()
  stages <- 1
  
    
  while(!stopped && stages <= N){
    unique.types <- unique(types)
    
    if(length(unique.types)>=1 && ((R(length(unique.types)+1) - R(length(unique.types))) <= C/(alpha*P_bar(unique.types, lambda)))){
      stopped <- TRUE
    }
    cost <- C + cost
    stages <- stages + 1
    probability.success.fish <- rbinom(1, 1, alpha)
    
    if(probability.success.fish){
      
      types <- c(types, (rpois(1, lambda)))
      
    }
    
  }
    

  return(R(length(unique(types))) - cost)
}

#' The optimality equation function for the fishing problem
#'
#' @param N the number of stages
#' @param S the number of unique fish types caught so far
#' @param alpha the probability of catching a fish successfully
#' @param lambda the parameter for the Poisson distribution from which the type of the fish comes from
#' @param C the cost of fishing at each stage
#'
#' @return the expected return
#' @export
V <- function(N, S, alpha, lambda, C) {
  
  #Base case, you have no stages left to fish your reward depends on how many unique fish types you have caught. 
  if (N == 0) {
    return(R(S))
  }
  
  unique.types <- 0:(S - 1)  #Let's store fish types up to this point in a variable
  

  P.new <- P_bar(unique.types, lambda) #Probability of catching a new fish
  
  
  #We have two actions:
  #I. We stop and receive a reward based on the number of unique fish we have caught
  #II. We continue and we either fail to successfully catch a fish or we fail to catch a unique fish OR we catch a unique fish successfully. 
  return(max(R(S), -C + alpha * P.new * V(N - 1, S + 1, alpha, lambda, C) + (alpha * (1 - P.new) + (1 - alpha)) * V(N - 1, S, alpha, lambda, C)))
}



theory.vector <- c()
sim.vector <- c()
lambda <- 3
for(C in seq(0,40 ,1)){
  
  c(theory.vector, V(10, 0, 0.8, lambda, C)) -> theory.vector
  c(sim.vector, mean(replicate(1000, simulate.fishing.problem(10, 0.8, lambda, C)[[1]]))) -> sim.vector
  
}

results <- data.frame(cost = seq(0, 40, 1), theory = theory.vector, simulations = sim.vector)

ggplot(results)+
  geom_line(aes(x=cost, y=theory, colour="Theory"), size=1.5)+
  geom_point(aes(x=cost, y=simulations, colour="Sims"), shape=8, stroke=1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18))+
  scale_color_manual(values=c("Theory" = "#387478", "Sims" = "#243642"))+
  labs(x="Cost (C)", y= "Reward", title = "Fishing N=10", colour = "", tag="A")

Parking problem

We start a distance of $N$ parking places from our destination. As we drive along we can see only one parking place at a time. Each parking place is, independently, free with probability $p$. Determine the policy that minimizes the expected distance we need to walk.

For the problems below, we are consider a cost of $C$ if we fail to find a parking at the end. Let’s first start by defining the Bellman equation. The boundary condition would be passing all the available parking slots and incurring a very large cost $C$ ($V_n(0)=C$). The bellman equation can be written as $p min[V_{n}(i-1), i] + (1-p)V_{n}(i-1)$.

#' Optimality equation
#'
#' @param N number of stages
#' @param i 
#' @param p the probability of a parking slot being empty at each stage
#' @param C the cost that we incur if we fail to park our car
#'
#' @return the expected cost
#' @export
V <- function(N, i, p, C){
  
  if(i==0){ #boundary condition
    return(C)
  }
  
  return(p*min(i, V(N, i-1, p, C)) + (1-p)*V(N, i-1, p, C))
  
}

To simulate the problem, we use a one-stage look ahead policy where we consider the expected cost of continuing to search for parking slots vs stopping.

#' Function for finding the threshold
#'
#' @param N number of stages
#' @param i 
#' @param p the probability of a parking slot being empty at each stage
#' @param C the cost that we incur if we fail to park our car
#'
#' @return the calculated threshold for the one stage look-ahead policy
#' @export
cost.threshold <- function(N, i, p, C){
  
  if(i==0){
    return((1-p)*(C))
  }
  
  return(p*i+(1-p)*cost.threshold(N, i-1, p, C))
  
}


#' Simulation function for expected cost
#'
#' @param N number of stages
#' @param i 
#' @param p the probability of a parking slot being empty at each stage
#' @param C the cost that we incur if we fail to park our car
#'
#' @return the simulated cost
#' @export
simulate.problem <- function(N, i, p, C){
  
  stages <- rev(seq(0, N, 1))
  
  for(i in seq_along(stages)){
    
    threshold <- cost.threshold(N, stages[i], p, C) #one stage look ahead policy
    
    if(stages[i] <= threshold){
      
      if(rbinom(1, 1, p)){
        
        return(stages[i])
        
      }
      
    }
    
  }
  
  return(C)
  
}


C <- 100
N <- 10


sim.vector <- c()
theory.vector <- c()

for(p in seq(0.1, 0.9, 0.05)){
  
  sim.vector<- c(sim.vector, mean(replicate(1000, simulate.problem(N, N, p, C))))
  theory.vector <- c(theory.vector, V(N, N, p, C))
  
}
ggplot()+geom_line(aes(x=seq(0.1, 0.9, 0.05), y=theory.vector), size=1.5, color="#D96098")+
  geom_point(aes(x=seq(0.1, 0.9, 0.05), y=sim.vector), stroke=1.5, shape=8, color="#D96098")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+
  labs(title="Parking problem, N=10", x="Probability of empty parking slot", y="Cost")

Optimal search

Classic problem

An object is located in one of $n$ possible locations, being in location $i$ with probability $P_i^0$, $i = 1, \ldots, n$, $\sum_i P_i^0 = 1$. A search of location $i$ costs $C_i$, $C_i > 0$, and if the object is present, the probability that it will be discovered is $\alpha_i$, $i = 1, \ldots, n$. That is, $1 - \alpha_i$ is the overlook probability for location $i$. The objective is to discover the object at minimal expected cost.

This is a decision process whose state at any time is the posterior probability vector $\mathbf{P} = (P_1, \dots, P_n)$, with $P_i$ representing the posterior probability, given all that has occurred, that the object is in box $i$. Letting $V(\mathbf{P})$ denote the minimal expected cost function, then the optimality equation is given by

\[ V(\mathbf{P}) = \min_i \left[ C_i + (1 - \alpha_i P_i) V(T_i(\mathbf{P})) \right], \]

where $T_i(\mathbf{P}) = \left[(T_i(\mathbf{P}))_1, \dots, (T_i(\mathbf{P}))_n \right]$ is the vector of posterior probabilities given the prior probability vector $\mathbf{P}$ and given that a search of location $i$ was unsuccessful. Hence,

\[ (T_i(\mathbf{P}))_j = P(\text{in } j \mid \text{search of } i \text{ unsuccessful}) \]

\[ = \begin{cases} \frac{P_j}{1 - \alpha_i P_i}, & j \neq i, \\ \frac{P_i (1 - \alpha_i)}{1 - \alpha_i P_i}, & j = i. \end{cases} \]

For a given state $\mathbf{P}$, a policy can be regarded as a sequence of locations $(i_1, i_2, \dots)$ with the interpretation that the locations are searched in that order until the object is found.

#' Updating the posterior probability
#'
#' @param P the prior probability vector of the item being located at any location
#' @param i the location we are searching
#' @param alpha the probability of finding the item if it is present at that location
#'
#' @return the updated posterior probability of finding the item 
#' @export
T_P <- function(P, i, alpha){
  
  for(j in 1:length(P)){

    if(i==j){
      P[j] = P[i]*(1-alpha[i])/(1-alpha[i]*P[i])
    }else{
      P[j] = P[j]/(1-alpha[i]*P[i])
    }
    
  }
  
  return(P)
  
}

#' The optimality equation
#'
#' @param N the number of stages
#' @param C the vector containing the cost of searching one location
#' @param alpha the vector of successfully finding the item at the location, if the item is located there
#' @param P the vector containing the probability of the item being located at a specific location
#'
#' @return the expected cost
#' @export 
V <- function(N, C, alpha, P){
  
  if(N==0){ #boundary condition, no stages left, we cannot search anymore
    return(0)
  }
  hold.min <- Inf
  
  for(i in 1:length(alpha)){ #find out which location we should search to minimize the cost
    
    val <- C[i]+(1-alpha[i]*P[i])*V(N-1, C, alpha, T_P(P, i, alpha))

    if(val < hold.min){
      hold.min <- val
    }
    
  }
  
  return(hold.min)
}

Simulate the problem

The optimal strategy is to search the location with the highest $\max_i \frac{\alpha_i P_i}{C_i}$

#' The simulation function
#'
#' @param N the number of stages
#' @param C the vector containing the cost of searching one location
#' @param alpha the vector of successfully finding the item at the location, if the item is located there
#' @param P the vector containing the probability of the item being located at a specific location
#'
#' @return the simulated cost
#' @export 
simulate.problem <- function(N, C, alpha, P){
  
  total.cost <- 0
  actual.location <- sample(seq(1, length(P), 1), prob = P)[1] #place the item at one location randomly
  
  for(n in 1:N){
    
    search.location <- which.max(alpha*P/C) #find out where to search
    total.cost <- total.cost + C[search.location]
    
    found.item <- rbinom(1, 1, alpha[search.location])*(actual.location == search.location)
    if(found.item==1){ #if the item is found, stop searching
      
      return(total.cost)
      
    }else{ #otherwise update the posterior probabilities
      
      P <- T_P(P, search.location, alpha)
      
    }
    
  }
  
  return(total.cost)
  
}

Compare simulations and theory

N <- 10
C <- c(10, 15, 20)
alpha <- c(0.7, 0.8, 0.65)
P_1 <- 0.3

V.vector <- c()
sim.vector <- c()

for(P_2 in seq(0.1, 0.6, 0.05)){
  
  P <- c(P_1, P_2, 1-(P_1+P_2))
  
  V.vector<- c(V.vector, V(N, C, alpha, P))
  
  sim.vector <- c(sim.vector, mean(replicate(1000, simulate.problem(N, C, alpha, P))))
  
  
}

ggplot()+geom_point(aes(x=seq(0.1, 0.6, 0.05), y=sim.vector), shape=8, stroke=2, color = "#D96098")+
  geom_line(aes(x=seq(0.1, 0.6, 0.05), y=V.vector), size=1.5, color = "#F7DBF0")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) + labs(tag="A", title=TeX("Optimal search; $P_1$ = 0.3, N = 10"), y="Cost", x=TeX("$p_2$"))

Optimal search with rewards

Now suppose that finding the item at a particular location $i$ will result in a reward of $R_i$. Let us simulate this problem under the optimal policy and compare it to the Bellman equation

#' The optimality equation
#'
#' @param N the number of stages
#' @param C the vector containing the cost of searching one location
#' @param alpha the vector of successfully finding the item at the location, if the item is located there
#' @param P the vector containing the probability of the item being located at a specific location
#' @param R the vector containing the reward for finding the item in a specific location 
#'
#' @return the expected cost
#' @export
V <- function(N, C, alpha, P, R){
  
  if(N==0){ #boundary condition
    return(0)
  }
  hold.min <- Inf
  
  for(i in 1:length(alpha)){

    val <- C[i]- alpha[i]*P[i]*R[i] + (1-alpha[i]*P[i])*V(N-1, C, alpha, T_P(P, i, alpha), R)
    
    if(val < hold.min){
      hold.min <- val
    }
    
  }
  
  return(min(0, hold.min))
  
}



#' Simulate the problem
#'
#' @param N the number of stages
#' @param C the vector containing the cost of searching one location
#' @param alpha the vector of successfully finding the item at the location, if the item is located there
#' @param P the vector containing the probability of the item being located at a specific location
#' @param R the vector containing the reward for finding the item in a specific location 
#'
#' @return the simulated cost
#' @export
simulate.problem <- function(N, C, alpha, P, R){
  
  total.cost <- 0
  actual.location <- sample(seq(1, length(P), 1), prob = P)[1]
  
  for(n in 1:N){
    
    if(sum(P*alpha*R>=C)==0){ #stop if there is no point in continuing; for all of the
      #locations the cost of continuing is more than the expected reward
      return(total.cost)
    }
    search.location <- which.max(alpha*P/C)
    total.cost <- total.cost + C[search.location]
    
    found.item <- rbinom(1, 1, alpha[search.location])*(actual.location == search.location)
    if(found.item==1){
      
      return(total.cost - R[search.location])
      
    }else{
      
      P <- T_P(P, search.location, alpha) #updated posterior probability
      
    }
    
  }
  
  return(total.cost)
  
  
}


#set parameters

N <- 10
C <- c(10, 15, 20)
alpha <- c(0.7, 0.8, 0.65)
P_1 <- 0.3
P_2 <- 0.2
P <- c(P_1, P_2, 1-(P_1+P_2))
R <- c(600, 500, 1000)



V.vector <- c()
sim.vector <- c()

for(P_2 in seq(0.1, 0.6, 0.05)){
  
  P <- c(P_1, P_2, 1-(P_1+P_2))
  
  V.vector<- c(V.vector, V(N, C, alpha, P, R))
  
  sim.vector <- c(sim.vector, mean(replicate(1000, simulate.problem(N, C, alpha, P, R))))
  
  
}


ggplot()+geom_point(aes(x=seq(0.1, 0.6, 0.05), y=sim.vector), shape=8, stroke=2, color = "#D96098")+
  geom_line(aes(x=seq(0.1, 0.6, 0.05), y=V.vector), size=1.5, color = "#F7DBF0")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  ) + labs(tag="B", title=TeX("Optimal search with rewards; $P_1$ = 0.3, N = 10"), y="Cost", x=TeX("$p_2$"))

Positive Dynamic Programming

Gambling

Suppose a gambler enters a casino with a fortune of $i$ and has the option of betting any positive integer amount from 0 to his current fortune ($0 \l j \leq i$). At each stage he has a probability $p$ of winning the bet and earning $j$ fortune and a probability $1-p$ of losing $j$. His goal is to reach a specific wealth $N$. The question is what strategy maximizes his chance of reaching a fortune of $N$ before going broke.

Timid strategy is defined as betting only 1 unit at each stage and a bold strategy is defined as betting his entire fortune or $N-i$ at every stage.

Below we show that the optimal strategy would be to play timidly when the probability of winning the bet is greater than $0.5$ and to play boldly otherwise.

#' Simulation function for gambling
#'
#' @param i the gambler's current fortune
#' @param N the amount the gambler wants to reach
#' @param p the probability of winning a bet
#' @param strategy either 'timid' or 'bold'
#'
#' @return the vector containing the gambler's wealth at every stage
#' @export
play.gambling.casino <- function(i, N, p, strategy){
  
  broke <- FALSE
  happy <- FALSE
  wealth <- c(i)
  
  while(!broke && !happy){ #process continues until the gambler is happy with his fortune reaching N units or goes broke
    
    prob <- rbinom(1, 1, p)
    
    if(strategy == "bold"){
      
      bet <- if(i<=N/2)i else N-i
      
    }else if(strategy=="timid"){
      
      bet <- 1
      
    }else{
      
      stop("Invalid strategy provided. Use 'bold' or 'timid'.")
      
    }
    
    if(prob){
      
      wealth <- c(wealth, wealth[length(wealth)] + bet)
      
    }else{
      
      wealth <- c(wealth, wealth[length(wealth)] - bet)
      
    }
    
    if(wealth[length(wealth)]==0){
      
      broke<-TRUE
      
    }else if(wealth[length(wealth)]==N){
      
      happy<-TRUE
      
    }
  }
  return(wealth)
}


#' Function for finding the probability of reaching a fortune of $N$
#'
#' @param i the gambler's current fortune
#' @param N the amount the gambler wants to reach
#' @param p the probability of winning a bet
#' @param strategy either 'timid' or 'bold'
#'
#' @return the probability of reaching fortune N
#' @export
U <- function(i, N, p, strategy){
  
  q <- 1-p
  
  if(i==0){
    return(0)
  }
  
  if(strategy == "bold"){

    if(i==N){
      return(1)
    }
    if(N==0){
      return(0)
    }
    if(i<=N/2) {return(p*U(2*i, N-1, p,strategy))} else {return(p+(1-p)*U(2*i-N,N-1, p, strategy))}
  
  }else if(strategy == "timid"){
    
    if(p==1/2) {return(i/N)} else {return((1-(q/p)^i)/(1-(q/p)^N))}
    
  }else{
    
    stop("Invalid strategy provided. Use 'bold' or 'timid'.")
  }
}

Compare simulations and theory

N <- 20
i <- 10
p.seq <- seq(0, 1, 0.01)
num.simulations <- 1000

results.df <- data.frame(P = numeric(), BoldSimulation = numeric(), BoldTheory = numeric(), TimidSimulation = numeric(), TimidTheory = numeric())

for(p in p.seq){
  
  timid.sims <- replicate(num.simulations, tail(play.gambling.casino(i, N, p, "timid"), 1))
  timid.sims.res <- (sum(timid.sims == N))/num.simulations
  timid.theory.res <- U(i, N, p, "timid")
  
  bold.sims <- replicate(num.simulations, tail(play.gambling.casino(i, N, p, "bold"), 1))
  bold.sims.res <- (sum(bold.sims == N))/num.simulations
  bold.theory.res <- U(i, N, p, "bold")
  
  results.df <- rbind(results.df, data.frame(P = p, BoldSimulation = bold.sims.res, BoldTheory = bold.theory.res, TimidSimulation = timid.sims.res, TimidTheory = timid.theory.res))
}



ggplot(results.df)+
  geom_rect(aes(xmin=0, xmax=0.5, ymin=0, ymax=1), fill="#C4E1F6")+
  geom_rect(aes(xmin=0.5, xmax=1, ymin=0, ymax=1), fill="#E2F1E7")+
  geom_point(aes(x=P, y=BoldSimulation, colour="Bold sims"), shape=8, stroke=1.5)+
  geom_point(aes(x=P, y=TimidSimulation, colour = "Timid sims"), shape=8, stroke=1.5)+
  geom_line(aes(x=P, y=BoldTheory, colour = "Bold theory"), size=1.5)+
  geom_line(aes(x=P, y=TimidTheory, colour = "Timid theory"), size=1.5)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size=20),
    axis.text = element_text(size = 15),
    text = element_text(size=18))+
  scale_color_manual(values=c("Bold sims" = "#629584", "Timid sims" = "#789DBC", "Bold theory" = "#629584",  "Timid theory" = "#789DBC"))+
  labs(x="Probability of winning", y="Probability of reaching N", title="N=20, i=10", colour="Strategy")

Agent Type	Technology A	Technology B
R agent	\(a_R + r n_A\)	\(b_R + r n_B\)
S agent	\(a_S + s n_A\)	\(b_S + s n_B\)

Stochastic Dynamic Programming

Finite Stage Models

Gambling Model

Define theory and simulation functions

Case I) Let’s try for when p is actually more than q

Simulate the first case

Case II) Let’s try p<1/2

Simulate the second case

Gambling model with changing win probabilities

Define relevant functions

Calculate the maximal expected reward

Stock option model

Define the maximum expected return function

Proving Lemma 3.1

Optimal allocation problems

Load relevant libraries

Concave p(x) function

Define our strategies

Optimal strategy for a concave p(x)

Convex p(x) function

Optimal strategy for a convex p(x)

k>1

Simulate problem for k>1 successful components needed

Comparison of strategies

On special case: p(x) = x

Compare simulations and theory

Writing the optimality equation

Sequential allocation model

Sequential allocation model with certainty

Sequential allocation model with uncertainty

Optimal stopping problems

Finding the best offer

Secretary problem with the possibility of rejection

Secretary problem with recall

Full information secretary problem

The Cayley - Moser problem

Guessing one card from a deck

Pick any of the thirteen spades

Taking balls from an urn

Game of pure strategy

Timid/bold play

Optimal strategy

Compare simulations and theory

Multi armed bandit problems

Competing technologies problem

Thompson sampling

Upper Confidence Bound (UCB)

Epsilon-greedy

Compare strategies

Gold mining problem

Discounted Programming

Machine replacement model

Define probability transition matrix

In search for \(\breve{\imath}\)

Quality control models

Quality control model with two hidden states

Continuous time variation

Quality control model with inspection

Define critical posterior probability

Simulate the problem

Red and green coin

Bayesian inference

Simulate the problem

Theoretical function

Inventory problem

\(L(j)\) is convex

Optimal ordering

Simulate the problem

Optimality equation

Compare simulations and theory

Portfolio problem

Bellman equation

Find optimal fractions for investing and consuming

Simulate problem

Visualize results

Negative Dynamic Programming

Non-stable stopping

Triple or nothing

Optimal stopping (one stage look ahead)

Asset problem