Simulate bandit data (uniformly distributed features and Bernoulli rewards generated through a logistic model) and feed it to a generalized linear UCB (UCB-GLM) multi-armed bandit model.
## Inverse Logit Function
# Inverse logit (logistic) function: maps values on the logit scale to
# probabilities in [0, 1].
#
# p: numeric values on the logit (linear predictor) scale.
# Returns the corresponding probabilities 1 / (1 + exp(-p)), one per element
# of `p`.
invlogit <- function(p) {
  # plogis() is the logistic CDF and is numerically stable in both tails; the
  # naive exp(p) / (1 + exp(p)) evaluates to Inf / Inf = NaN once p exceeds
  # roughly 709.
  stats::plogis(p)
}
## Generate Bernoulli data
# Simulation settings.
K <- 10    # number of arms (one column of X / Y per arm)
n <- 1000  # number of rounds (one row of X / Y per round)
b0 <- 0.2  # true value for the intercept of the logistic reward model
b1 <- 0.5  # true value for the slope of the logistic reward model

# X[t, i] is the feature of arm i at round t; Y[t, i] is the Bernoulli reward
# that arm would pay at that round.
Y <- X <- matrix(0, ncol = K, nrow = n)
for (i in seq_len(K)) {
  # Seed once per arm so each arm is reproducible. The seed must NOT be reset
  # before rbinom(): doing so replays the uniform stream that produced X[, i],
  # which makes the reward a (near-)deterministic function of the feature
  # instead of an independent Bernoulli draw.
  set.seed(100 + i)
  X[, i] <- runif(n, 0, 0.5)
  # Use this arm's own feature column. Passing the whole matrix X hands
  # rbinom() n * K probabilities, of which only the first n (column 1) are
  # used, so every arm's reward would be driven by arm 1's features.
  Y[, i] <- rbinom(n, 1, invlogit(b0 + b1 * X[, i]))
}
## UCB-GLM
# UCB-GLM bandit with a logistic reward model and one scalar feature per arm.
#
# Each arm j at round t is represented by the vector z = c(feature[t, j], 1)
# (slope term, then intercept term). After `tau` rounds of uniformly random
# play, every round refits a logistic regression on the observed history and
# plays the arm maximising
#     z' theta + alpha * sqrt(z' V^{-1} z),
# where V is the sum of z z' over all arms played so far.
#
# tau:     number of initial random-exploration rounds (1 <= tau <= rounds).
#          It must be large enough for V to be invertible.
# alpha:   non-negative exploration weight on the confidence bonus.
# reward:  rounds x arms matrix; reward[t, j] is the payoff of arm j at
#          round t (0/1 for the binomial fit).
# feature: rounds x arms matrix; feature[t, j] is arm j's feature at round t.
#
# Returns the cumulative reward collected over all rounds (a single number).
# The initial arms are drawn with sample.int(), so call set.seed() beforehand
# for reproducible results.
UCBGLM <- function(tau, alpha, reward, feature) {
  reward <- as.matrix(reward)
  feature <- as.matrix(feature)
  stopifnot(
    identical(dim(reward), dim(feature)),
    length(tau) == 1, tau >= 1, tau <= nrow(reward),
    length(alpha) == 1
  )
  # Problem size comes from the arguments, not from globals.
  n_rounds <- nrow(reward)
  n_arms <- ncol(reward)

  # Preallocated history: one design row c(feature, 1) and one outcome per
  # round, filled in as arms are played.
  design <- cbind(numeric(n_rounds), 1)
  outcome <- numeric(n_rounds)

  # Initialization, t = 1, ..., tau: play uniformly random arms.
  warmup <- seq_len(tau)
  first_arms <- sample.int(n_arms, size = tau, replace = TRUE)
  played <- cbind(warmup, first_arms)
  design[warmup, 1] <- feature[played]
  outcome[warmup] <- reward[played]
  # V = sum_t z_t z_t' is the 2 x 2 Gram matrix t(Z) %*% Z.
  V <- crossprod(design[warmup, , drop = FALSE])

  # t = tau + 1, ..., n_rounds (empty when tau == n_rounds).
  for (t in seq_len(n_rounds - tau) + tau) {
    seen <- seq_len(t - 1)

    # Maximum-likelihood estimate of theta on the history; the design already
    # carries the intercept column, so no extra intercept is added.
    fit <- stats::glm.fit(
      x = design[seen, , drop = FALSE],
      y = outcome[seen],
      family = stats::binomial(link = "logit")
    )
    theta <- fit$coefficients
    theta[is.na(theta)] <- 0  # aliased (rank-deficient) terms contribute 0

    # True matrix inverse (V^(-1) would only take elementwise reciprocals).
    V_inv <- tryCatch(
      solve(V),
      error = function(e) {
        stop(
          "Design matrix V is singular; increase `tau` so the exploration ",
          "phase observes at least two distinct feature values.",
          call. = FALSE
        )
      }
    )

    # Score all arms at once: rows of `cand` are the per-arm vectors z_j.
    cand <- cbind(feature[t, ], 1)
    bonus <- sqrt(pmax(rowSums((cand %*% V_inv) * cand), 0))
    score <- drop(cand %*% theta) + alpha * bonus
    best <- which.max(score)

    # Record the chosen arm and apply the rank-one update V <- V + z z'.
    z <- cand[best, ]
    design[t, 1] <- z[1]
    outcome[t] <- reward[t, best]
    V <- V + outer(z, z)
  }

  sum(outcome)
}
# Remove every object that ls() lists in the current environment.
# NOTE(review): when run at top level in the same environment as the code
# above, this also deletes invlogit, UCBGLM and the simulated X / Y — confirm
# this reset is intended (e.g. as the start of a following section).
rm(list=ls())