# Code reference for the UCB (upper confidence bound) multi-armed bandit framework
#Import Libraries
library(ggplot2)
library(reshape2)
# Normal distribution of arms and rewards with small variance: every arm pays
# a normally distributed reward with its own mean and a shared sd of 2.5.
mean_reward <- c(5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25, 26)
# One sampler per arm, generated from mean_reward instead of ten hand-copied
# closures, so the number of arms always follows length(mean_reward).
# Each sampler takes the number of draws n and returns n rewards; as in the
# hand-written version, mean_reward[arm] is looked up when the sampler runs.
reward_dist <- lapply(seq_along(mean_reward), function(arm) {
  force(arm) # pin the arm index for the closure created below
  function(n) rnorm(n = n, mean = mean_reward[arm], sd = 2.5)
})
# Preparing the simulation data: 10000 rounds (rows) of pre-drawn rewards,
# one column per arm in reward_dist.
dataset <- matrix(nrow = 10000, ncol = length(reward_dist))
for (i in seq_len(ncol(dataset))) {
  # Column i holds the draws of arm i, one per round.
  dataset[, i] <- reward_dist[[i]](n = nrow(dataset))
}
# Assigning column names (the arm index) and viewing the dataset
colnames(dataset) <- seq_len(ncol(dataset))
# View() opens a data viewer, so it is only called in an interactive session;
# this keeps the script runnable through Rscript or other batch runs.
if (interactive()) {
  View(dataset)
}
# Create a melted (long-format) dataset with one arm/reward combination per
# row. Columns 2 and 3 of the melted matrix are kept and renamed to
# Bandit (taken from the column names of dataset) and Reward.
dataset_p <- melt(dataset)[, 2:3]
colnames(dataset_p) <- c("Bandit", "Reward")
# Converting the arms column in the dataset to nominal type
dataset_p$Bandit <- as.factor(dataset_p$Bandit)
# View() opens a data viewer, so it is only called in an interactive session.
if (interactive()) {
  View(dataset_p)
}
# Plot sample distributions: one semi-transparent density curve per arm
ggplot(dataset_p, aes(x = Reward, col = Bandit, fill = Bandit)) +
  geom_density(alpha = 0.3) +
  labs(title = "Reward from different bandits")
# Upper confidence bound (UCB) algorithm for a multi-armed bandit.
#
# Plays N rounds against pre-drawn rewards. In round n the arm with the
# largest upper bound
#   average_reward_i + sqrt(2 * log(1 + n * log(n)^2) / pulls_i)
# is pulled and collects reward_data[n, arm]. Arms that have never been pulled
# get an infinite bound, so every arm is tried once first; ties go to the
# lowest arm index.
#
# Arguments:
#   N            number of rounds to play; must be between 0 and
#                nrow(reward_data).
#   reward_data  matrix (or data frame) of numeric rewards with one row per
#                round and one column per arm.
#
# Returns a list with
#   total_reward           sum of all collected rewards (unnamed scalar),
#   bandit_selected        arm pulled in each round (still list element 2),
#   numbers_of_selections  number of pulls per arm,
#   sums_of_rewards        reward collected per arm.
UCB <- function(N = 1000, reward_data) {
  d <- ncol(reward_data)
  stopifnot(
    !is.null(d), d >= 1,
    is.numeric(N), length(N) == 1, !is.na(N),
    N >= 0, N <= nrow(reward_data)
  )

  # Preallocate the selection history instead of growing it every round.
  bandit_selected <- integer(N)
  numbers_of_selections <- integer(d)
  sums_of_rewards <- numeric(d)
  total_reward <- 0

  # seq_len() yields no rounds at all for N = 0 (1:0 would run rounds 1 and 0).
  for (n in seq_len(N)) {
    pulled <- numbers_of_selections > 0
    # Arms never pulled keep an infinite bound so they are explored first.
    upper_bounds <- rep(Inf, d)
    upper_bounds[pulled] <-
      sums_of_rewards[pulled] / numbers_of_selections[pulled] +
      sqrt(2 * log(1 + n * log(n)^2) / numbers_of_selections[pulled])

    # which.max() returns the first maximum and has no implicit floor of 0,
    # so the choice is also correct when every bound is negative.
    bandit <- which.max(upper_bounds)

    bandit_selected[n] <- bandit
    numbers_of_selections[bandit] <- numbers_of_selections[bandit] + 1L
    # [[ ]] extracts the bare value; [n, bandit] would carry the column name
    # over into total_reward.
    reward <- reward_data[[n, bandit]]
    sums_of_rewards[bandit] <- sums_of_rewards[bandit] + reward
    total_reward <- total_reward + reward
  }

  list(
    total_reward = total_reward,
    bandit_selected = bandit_selected,
    numbers_of_selections = numbers_of_selections,
    sums_of_rewards = sums_of_rewards
  )
}
# running the UCB algorithm on our simulated dataset for 1000 rounds;
# the result list is not assigned, so it is printed when run at top level
UCB(N = 1000, reward_data = dataset)
## $total_reward
## 1
## 25835.75
##
## [[2]]
## [1] 1 2 3 4 5 6 7 8 9 10 9 9 9 9 9 10 10 10 10 9 10 10 10 10
## [25] 10 10 9 10 9 9 9 9 9 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [49] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [73] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [97] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 9 10 10 10 10 10 10 10 10
## [121] 10 10 10 10 10 10 10 10 10 9 9 9 10 10 10 10 10 10 10 10 10 10 10 10
## [145] 10 10 10 10 10 9 9 9 9 9 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [169] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [193] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [217] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [241] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [265] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [289] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [313] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [337] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [361] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [385] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [409] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [433] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [457] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [481] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [505] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [529] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [553] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [577] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [601] 10 10 10 10 10 10 10 10 7 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [625] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [649] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 9 10 10 10
## [673] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [697] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [721] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [745] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [769] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [793] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [817] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [841] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [865] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [889] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [913] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [937] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [961] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [985] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
##
## $numbers_of_selections
## [1] 1 1 1 1 1 1 2 1 23 968
##
## $sums_of_rewards
## [1] 4.790936 5.996144 8.783295 11.584010 15.660482
## [6] 20.982918 38.793841 21.139662 577.834643 25130.183185