# Load data and feature functions
set.seed(43234)
setwd("~/Documents/thesis/R/IE_reed_thesis/data")
df <- read.csv(file="train_tidy.csv")
df$obs <- as.character(df$obs)
df$state <- as.character(df$state)
df$prev_state <- as.character(df$prev_state)
setwd("~/Documents/thesis/R/IE_reed_thesis/code")
source("features.R")

Binomial Logistic Regression

# For simple logistic regression, we don't need a list of matrices for each state
# for X, let's use a simple feature function that checks if the word is uppercase
X <- ifelse((df$obs == toupper(df$obs)), 1, 0)
Y <- ifelse(df$state == "target", 1, 0)
betas <- c(0, 0)
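
The quantity minimized below is the negative of the binomial log-likelihood: with \(P(Y_i=1|X_i) = \frac{e^{\beta_0+\beta_1X_i}}{1+e^{\beta_0+\beta_1X_i}}\), the log-likelihood simplifies to \[\ell(\beta_0,\beta_1) = \sum_i\Big(Y_i(\beta_0+\beta_1X_i) - \ln\big(1+e^{\beta_0+\beta_1X_i}\big)\Big)\]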

# (Negative) log-likelihood function
nLL_simple <- function(nLL.par, nLL.Y, nLL.X){
  beta_0 <- nLL.par[1]
  beta_1 <- nLL.par[2]
  
  -sum(nLL.Y*(beta_0+beta_1*nLL.X)) +
    sum(log(1 + exp(beta_0+beta_1*nLL.X)))
}
# use optim() to compute the MLEs by minimizing nLL_simple
MLE <- optim(betas, nLL_simple, nLL.Y = Y, nLL.X = X, hessian = T)
MLE$par
## [1] -0.8462374  1.5675730
simple_df <- data.frame(cbind(Y, X))
 
# check results with glm()
glm_mle <- glm(Y ~ X, family=binomial(link="logit"), data=simple_df)
coef(glm_mle)
## (Intercept)           X 
##  -0.8462191   1.5683538
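
Since optim() was called with hessian = TRUE, the inverse of the returned Hessian (the observed information for the negative log-likelihood) gives approximate standard errors that can also be checked against glm(); a quick sketch (output not shown):

# Approximate standard errors from the inverse Hessian of the negative log-likelihood
se_optim <- sqrt(diag(solve(MLE$hessian)))
se_glm <- summary(glm_mle)$coefficients[, "Std. Error"]
round(cbind(optim = se_optim, glm = se_glm), 4)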

Multinomial Logistic Regression

In multinomial logistic regression, each observation \(x_i\) is a vector of \(d\) feature values, and it belongs to class \(k\) if its response vector \(y_i = [y_{i1},\ldots,y_{iK}]\) is zero everywhere except \(y_{ik}=1\). The parameters are stored in a \(d\times K\) matrix \(\textbf{B} = [\beta_1,\ldots,\beta_K]\), where column \(\beta_k\) corresponds to class \(k\). Then for observation \(x_i\), \[P(y_{ik}=1|x_i, \textbf{B}) = \frac{\exp(\beta_k^Tx_i)}{\sum_{k'}\exp(\beta_{k'}^Tx_i)}\] To classify \(x_i\), we assign it the class with the highest probability: \[\hat{y}_i = \text{argmax}_k\;P(y_{ik}=1|x_i, \textbf{B})\] The probability above also gives the likelihood of a single observation \((x_i,y_i)\): \[P(y_i|x_i, \textbf{B}) = \frac{\exp\big(\sum_ky_{ik}\beta_k^Tx_i\big)}{\sum_{k'}\exp(\beta_{k'}^Tx_i)}\] and hence the log-likelihood of the parameters for training data \(D\): \[\ell(\textbf{B}|D) = \sum_{i}\Big(\sum_ky_{ik}\beta_k^Tx_i - \ln\sum_{k'}\exp(\beta_{k'}^Tx_i)\Big)\]
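
As a concrete sketch of the probability and argmax rule above (the function name softmax_probs and its arguments are illustrative, not part of the thesis code):

# Softmax class probabilities for one observation x (length d),
# given the d x K parameter matrix B; the argmax is the predicted class
softmax_probs <- function(B, x){
  scores <- as.vector(t(B) %*% x)   # beta_k^T x for each class k
  exp(scores) / sum(exp(scores))
}

Once X, B, and states are built below, states[which.max(softmax_probs(B, X[1,]))] would give the predicted state for the first training observation.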

# The two states we're interested in
states <- c("target", "other")

# Create X matrix of feature values for training data
# list of d feature functions
function_list <- list(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12)
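
The actual feature functions f1 through f12 are defined in features.R and are not shown here; each is called below via mapply() with the observation, its state, and its previous state. A purely hypothetical example of a function with that signature (illustrative only, not one of the real features):

# Hypothetical feature function with the (obs, state, prev_state) signature
# expected by the mapply() call below; NOT one of f1, ..., f12
f_example <- function(obs, state, prev_state){
  # 1 if the token starts with a capital letter, 0 otherwise
  as.integer(grepl("^[A-Z]", obs))
}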

X_list <- vector("list", length(function_list))
for(d in 1:length(function_list)){
  # find the value of the function for each observation
  X_list[[d]] <- mapply(function_list[[d]],
                       df$obs, df$state, df$prev_state)
}
# combine columns into a matrix
X <- do.call(cbind, X_list)

# Create Y matrix of class indicator variables for each obs
class_list <- vector("list", length(states))
for(s in 1:length(states)){
  class_list[[s]] <- ifelse(df$state == states[s], 1, 0)
}
Y <- do.call(cbind, class_list)

# Initialize B matrix of parameters
B <- matrix(0, nrow=length(function_list), ncol=length(states))
BT <- t(B)
# n observations
# d features
# K states
# X: n x d matrix of feature values
# Y: n x K indicator matrix of each observation's state
# B: d x K matrix of parameters/weights
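
A quick consistency check on those dimensions (a minimal sketch using stopifnot(); this assumes every observation's state is one of the two states listed above):

# Verify the shapes described in the comments above
stopifnot(nrow(X) == nrow(Y),                # n observations in both
          ncol(X) == length(function_list),  # d features
          ncol(Y) == length(states),         # K states
          all(rowSums(Y) == 1))              # each obs has exactly one state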

# Negative log-likelihood function for multinomial logistic regression
nLL_multi <- function(params, nLL.X, nLL.Y, nLL.states){
  # Convert the parameter vector back into a d x K matrix.
  # optim() flattens the initial matrix B in column-major order,
  # so the reshape must use the same (default) column-major order.
  weights <- matrix(params, ncol = length(nLL.states))
  final_sum <- 0
  for(i in 1:nrow(nLL.X)){ 
    first <- 0
    second <- 0
    for(k in 1:ncol(weights)){
      first <- first + nLL.Y[i,k]*sum(weights[,k]*nLL.X[i,])
      second <- second + exp(sum(weights[,k]*nLL.X[i,]))
    }
    final_sum <- final_sum + first - log(second)
  }
  -final_sum
}
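
The double loop above can also be written with matrix operations, which is typically much faster; a sketch that computes the same value (assuming nLL.X is n x d, nLL.Y is n x K, and the reshaped weights are d x K):

# Vectorized negative log-likelihood, equivalent to nLL_multi
nLL_multi_vec <- function(params, nLL.X, nLL.Y, nLL.states){
  weights <- matrix(params, ncol = length(nLL.states))  # column-major reshape
  scores <- nLL.X %*% weights                           # n x K matrix of beta_k^T x_i
  -sum(rowSums(nLL.Y * scores) - log(rowSums(exp(scores))))
}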

mle <- optim(B, nLL_multi, nLL.X=X, nLL.Y=Y, nLL.states=states)
mle$par
##              [,1]        [,2]
##  [1,]  0.57662824  0.52213457
##  [2,] -0.56388016 -0.74549306
##  [3,]  0.55598885  0.57442260
##  [4,] -0.21384619 -0.34871681
##  [5,] -0.82131557 -0.29214702
##  [6,] -0.29685807  0.73391805
##  [7,] -0.62832822 -1.42368554
##  [8,] -0.21335090  1.61584982
##  [9,] -0.17972631 -0.01552761
## [10,] -0.12507031 -0.18475657
## [11,] -0.01882782 -0.61550595
## [12,]  1.18214637  0.42971116
newB <- mle$par
# multinom() comes from nnet; mutate() and %>% come from dplyr
library(nnet)
library(dplyr)

df_multi <- as.data.frame(X)
df_multi <- df_multi %>%
  mutate(state = df$state)

multi <- multinom(state~.+0, data=df_multi)
## # weights:  13 (12 variable)
## initial  value 1871.497388 
## iter  10 value 1046.165993
## iter  20 value 959.515978
## iter  30 value 959.500799
## iter  40 value 959.496083
## final  value 959.492405 
## converged
summary(multi)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = state ~ . + 0, data = df_multi)
## 
## Coefficients:
##           Values    Std. Err.
## V1    0.82692280   0.46134967
## V2    1.38830965   0.28393777
## V3    2.44936413   0.53726259
## V4   -0.05449003   0.22258103
## V5    0.00000000          NaN
## V6   -0.60819189   0.46243506
## V7    0.90917194   0.12254066
## V8    1.07163796   0.12050553
## V9   -1.63568950   0.48984393
## V10  -4.05116645   0.33270167
## V11 -12.27605335 391.89673502
## V12  -1.13151515   0.08579476
## 
## Residual Deviance: 1918.985 
## AIC: 1940.985
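
The fitted values above look quite different from mle$par, which is expected: the handwritten model estimates a separate parameter column for every class, so only differences between columns are identified, whereas multinom() fixes the first factor level ("other") as the reference class and reports a single coefficient vector for "target". A hedged comparison of the identified quantity (output not shown; with optim()'s default Nelder-Mead settings the handwritten fit may stop short of full convergence, so agreement is only approximate):

# Column difference of the handwritten fit vs. multinom()'s coefficients;
# both estimate beta_target - beta_other, up to optimization error
round(cbind(handwritten = newB[,1] - newB[,2],
            multinom    = coef(multi)), 3)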
# Test handwritten multinomial predictions against multinom()

# Randomly generate fake observations
test_df <- NULL
n <- 1000
for(i in 1:n){
  fake_obs <- sample(c(0, 1), 12, replace=T)
  test_df <- rbind(test_df, fake_obs)
}
test_df <- as.data.frame(test_df)
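
The row-by-row rbind() works fine for 1000 rows; an equivalent one-step alternative (not run here, so it does not affect the comparison below):

# One-call alternative to the rbind() loop above (not used below)
test_df_alt <- as.data.frame(matrix(sample(c(0, 1), n * 12, replace = TRUE),
                                    nrow = n))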

# Logical vector recording whether the two models agree on each fake observation
results <- logical(n)

for(i in 1:n){
  test <- test_df[i,]
  # plain numeric vector for the handwritten softmax calculation
  x_vec <- as.numeric(test)
  
  # Handwritten prediction: probability that the observation is a target
  numerator <- exp(sum(newB[,1]*x_vec))
  denom <- 0
  for(k in 1:ncol(newB)){
    denom <- denom + exp(sum(newB[,k]*x_vec))
  }
  pred <- ifelse((numerator/denom) > .5, "target", "other")
  
  # multinom() prediction on the same row (kept as a data frame for predict())
  results[i] <- (predict(multi, type="class", newdata=test) == pred)
}

mean(results)
## [1] 0.535
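
The per-row loop can also be checked in vectorized form; a sketch that computes the handwritten softmax probabilities for all fake observations at once and reports the same agreement rate (output not shown):

# Softmax probabilities for every fake observation under the handwritten fit
scores <- as.matrix(test_df) %*% newB            # n x K matrix of scores
probs  <- exp(scores) / rowSums(exp(scores))     # row-wise softmax
hand_pred <- ifelse(probs[, 1] > .5, "target", "other")
mean(hand_pred == predict(multi, type = "class", newdata = test_df))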