R Notebook

Simulates Bernoulli rewards whose success probabilities depend on uniformly distributed features, then feeds the simulated data to a UCB-GLM (generalized linear upper-confidence-bound) multi-armed bandit model.

## Inverse Logit Function

# Inverse logit (logistic) transform: maps values on the logit scale to
# probabilities in [0, 1].
#
# Arguments:
#   p  Numeric vector (or matrix) of values on the logit scale.
# Returns:
#   Numeric values 1 / (1 + exp(-p)), one per element of `p`.
#
# plogis() is used instead of exp(p) / (1 + exp(p)) because the latter
# evaluates to Inf / Inf = NaN once exp(p) overflows (p above roughly 709).
invlogit <- function(p) {
  plogis(p)
}

## Generate Bernoulli data

K <- 10   # number of columns in X and Y (UCBGLM treats each column as one bandit)
n <- 1000 # number of rows, i.e. simulated rounds
b0 <- 0.2 # True value for intercept
b1 <- 0.5 # True value for the slope
Y <- X <- matrix(0, ncol = K, nrow = n)

for (i in seq_len(K)) {
  # One seed per column keeps each column reproducible on its own. The
  # generator is deliberately NOT re-seeded before rbinom(): restarting it with
  # the same value would replay the stream that produced X[, i], so the reward
  # draws would reuse the very random numbers behind the features.
  set.seed(100 + i)
  X[, i] <- runif(n, 0, 0.5) # uniform features on [0, 0.5]

  # Use column i only. Passing the whole matrix X hands rbinom() n * K
  # probabilities of which it uses just the first n, i.e. those of X[, 1],
  # for every column.
  Y[, i] <- rbinom(n, 1, invlogit(b0 + b1 * X[, i])) # Bernoulli rewards
}

## UCB-GLM

# UCB-GLM bandit with a logistic link and one scalar feature per arm.
#
# The first `tau` rounds pull arms uniformly at random. In every later round a
# logistic regression (feature + constant column) is fitted to all
# observations collected so far, and the arm maximising
#   z' theta + alpha * sqrt(z' V^{-1} z),   z = c(feature[t, arm], 1),
# is pulled, where V is the sum of z z' over all arms pulled so far.
#
# Arguments:
#   tau      Number of initial random rounds; a whole number between 1 and
#            nrow(reward). It must be large enough for V to be invertible,
#            otherwise solve() stops with an error.
#   alpha    Non-negative weight on the confidence width (exploration).
#   reward   Rounds x arms matrix; reward[t, k] is the 0/1 reward of arm k
#            in round t.
#   feature  Rounds x arms matrix of the same shape; feature[t, k] is the
#            scalar feature of arm k in round t.
# Returns:
#   Total reward collected over all rounds (random rounds included).
UCBGLM <- function(tau, alpha, reward, feature) {
  reward <- as.matrix(reward)
  feature <- as.matrix(feature)
  stopifnot(
    identical(dim(reward), dim(feature)),
    length(tau) == 1, tau == round(tau), tau >= 1, tau <= nrow(reward)
  )
  # Sizes come from the arguments, not from global variables.
  n_rounds <- nrow(reward)
  n_arms <- ncol(reward)

  # Preallocated history: column 1 holds the pulled arm's feature, column 2 is
  # the constant term; outcome[t] is the reward observed in round t.
  design <- cbind(numeric(n_rounds), 1)
  outcome <- numeric(n_rounds)

  # Initialization t = 1, ..., tau: pull arms uniformly at random.
  warmup <- seq_len(tau)
  pulled <- cbind(warmup, sample.int(n_arms, size = tau, replace = TRUE))
  design[warmup, 1] <- feature[pulled]
  outcome[warmup] <- reward[pulled]
  # 2 x 2 Gram matrix: sum over the pulled arms of z z'.
  V <- crossprod(design[warmup, , drop = FALSE])

  # t = tau + 1, ..., n_rounds (empty when tau == n_rounds).
  for (t in tau + seq_len(n_rounds - tau)) {
    # Maximum likelihood estimate of theta from everything observed so far.
    seen <- seq_len(t - 1)
    y_seen <- outcome[seen]
    x_seen <- design[seen, , drop = FALSE]
    model <- glm(y_seen ~ -1 + x_seen, family = binomial(link = "logit"))
    theta <- matrix(model$coefficients, ncol = 1)
    # glm() reports NA for coefficients it drops as aliased; treat them as 0
    # so the scores below stay comparable.
    theta[is.na(theta)] <- 0

    # One row z_j = (feature, 1) per arm for the current round.
    arms <- cbind(feature[t, ], 1)
    V_inv <- solve(V) # true matrix inverse, not an elementwise reciprocal
    # Confidence width sqrt(z_j' V^{-1} z_j) for every arm at once; pmax()
    # guards against tiny negative values caused by rounding.
    width <- sqrt(pmax(rowSums((arms %*% V_inv) * arms), 0))
    f <- drop(arms %*% theta) + alpha * width

    # Pull the arm with the highest upper confidence bound and record it.
    best <- which.max(f)
    outcome[t] <- reward[t, best]
    design[t, ] <- arms[best, ]
    V <- V + tcrossprod(arms[best, ])
  }

  sum(outcome)
}

# NOTE(review): this removes every object in the current environment,
# including invlogit, UCBGLM and the simulated X / Y defined above, so nothing
# from this script remains usable after this line. Confirm this is intended.
rm(list=ls())