# Load data and feature functions
set.seed(43234)
setwd("~/Documents/thesis/R/IE_reed_thesis/data")
df <- read.csv(file="train_tidy.csv")
df$obs <- as.character(df$obs)
df$state <- as.character(df$state)
df$prev_state <- as.character(df$prev_state)
setwd("~/Documents/thesis/R/IE_reed_thesis/code")
source("features.R")
# For simple logistic regression, we don't need a list of matrices for each state
# for X, let's use a simple feature function that checks if the word is uppercase
X <- ifelse((df$obs == toupper(df$obs)), 1, 0)
Y <- ifelse(df$state == "target", 1, 0)
betas <- c(0, 0)
# (negative) log likelihood function
nLL_simple <- function(nLL.par, nLL.Y, nLL.X){
  beta_0 <- nLL.par[1]
  beta_1 <- nLL.par[2]
  -sum(nLL.Y*(beta_0+beta_1*nLL.X)) +
    sum(log(1 + exp(beta_0+beta_1*nLL.X)))
}
# use optim to compute the MLEs by minimizing nLL_simple
MLE <- optim(betas, nLL_simple, nLL.Y = Y, nLL.X = X, hessian = T)
MLE$par
## [1] -0.8462374 1.5675730
simple_df <- data.frame(cbind(Y, X))
# check results with glm()
glm_mle <- glm(Y ~ X, family=binomial(link="logit"), data=simple_df)
coef(glm_mle)
## (Intercept)          X
##  -0.8462191  1.5683538
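For reference, the quantity nLL_simple minimizes above is just the binary-logistic negative log-likelihood written out from the code: \[-\ell(\beta_0,\beta_1) = -\sum_i y_i(\beta_0+\beta_1x_i) + \sum_i \ln\big(1+\exp(\beta_0+\beta_1x_i)\big)\] where \(y_i\) indicates whether observation \(i\) is in the target state and \(x_i\) is the uppercase indicator.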
In multinomial logistic regression, each observation is a vector of \(d\) feature values and belongs to class \(k\) when the response vector \(y = [y_1,\ldots,y_K]\) is all zeros except for \(y_k=1\). The parameters are stored in a \(d\times K\) matrix \(\textbf{B} = [\beta_1,\ldots,\beta_K]\), where each column \(\beta_k\) corresponds to class \(k\). Then for observation \(x_i\), \[P(y_{ik}=1|x_i, \textbf{B}) = \frac{\exp(\beta_k^Tx_i)}{\sum_{k'}\exp(\beta_{k'}^Tx_i)}\] To classify \(x_i\), we assign it the class with the highest probability: \[\hat{y}_i = \text{argmax}_k\;P(y_{ik}=1|x_i, \textbf{B})\] From the probability above, the likelihood of a single observation \((x_i,y_i)\) is \[P(y_i|x_i, \textbf{B}) = \frac{\exp(\sum_ky_{ik}\beta_k^Tx_i)}{\sum_{k'}\exp(\beta_{k'}^Tx_i)}\] which gives the log-likelihood of the parameters for some training data \(D\): \[\ell(\textbf{B}|D) = \sum_{i}\Big(\sum_ky_{ik}\beta_k^Tx_i - \ln\sum_{k'}\exp(\beta_{k'}^Tx_i)\Big)\]
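As a quick numerical illustration of these formulas, the sketch below computes the softmax probabilities, the argmax prediction, and one observation's log-likelihood term; B_toy, x_i, and y_i are made-up toy values, not the thesis data or feature functions.

# Toy example (made-up numbers): softmax probabilities and one log-likelihood term
B_toy <- matrix(c(0.5, -1.0,    # first two values fill column 1 = beta_1 (class 1)
                  0.2,  0.8),   # last two values fill column 2 = beta_2 (class 2)
                nrow = 2)       # d = 2 features, K = 2 classes
x_i <- c(1, 0)                  # feature vector for one observation
y_i <- c(1, 0)                  # indicator vector: this observation is in class 1
scores <- as.vector(t(B_toy) %*% x_i)      # beta_k^T x_i for each class k
probs  <- exp(scores) / sum(exp(scores))   # P(y_ik = 1 | x_i, B)
which.max(probs)                           # predicted class = argmax_k
sum(y_i * scores) - log(sum(exp(scores)))  # this observation's log-likelihood term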
# The two states we're interested in
states <- c("target", "other")
# Create X matrix of feature values for training data
# list of d feature functions
function_list <- list(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12)
X_list <- vector("list", length(function_list))
for(d in 1:length(function_list)){
  # find the value of the function for each observation
  X_list[[d]] <- mapply(function_list[[d]],
                        df$obs, df$state, df$prev_state)
}
# combine columns into a matrix
X <- do.call(cbind, X_list)
# Create Y matrix of class indicator variables for each obs
class_list <- vector("list", length(states))
for(s in 1:length(states)){
  class_list[[s]] <- ifelse(df$state == states[s], 1, 0)
}
Y <- do.call(cbind, class_list)
# Initialize B matrix of parameters
B <- matrix(0, nrow=length(function_list), ncol=length(states))
BT <- t(B)
# n observations
# d features
# K states
# X: n x d matrix of feature values
# Y: n x K indicator matrix of each observation's state
# B: d x K matrix of parameters/weights
# Negative log-likelihood function for multinomial logistic regression
nLL_multi <- function(params, nLL.X, nLL.Y, nLL.states){
  # Convert parameters back into matrix
  weights <- matrix(params, ncol = length(nLL.states), byrow=T)
  final_sum <- 0
  for(i in 1:nrow(nLL.X)){
    first <- 0
    second <- 0
    for(k in 1:ncol(weights)){
      first <- first + nLL.Y[i,k]*sum(weights[,k]*nLL.X[i,])
      second <- second + exp(sum(weights[,k]*nLL.X[i,]))
    }
    final_sum <- final_sum + first - log(second)
  }
  -final_sum
}
mle <- optim(B, nLL_multi, nLL.X=X, nLL.Y=Y, nLL.states=states)
mle$par
##              [,1]        [,2]
##  [1,]  0.57662824  0.52213457
##  [2,] -0.56388016 -0.74549306
##  [3,]  0.55598885  0.57442260
##  [4,] -0.21384619 -0.34871681
##  [5,] -0.82131557 -0.29214702
##  [6,] -0.29685807  0.73391805
##  [7,] -0.62832822 -1.42368554
##  [8,] -0.21335090  1.61584982
##  [9,] -0.17972631 -0.01552761
## [10,] -0.12507031 -0.18475657
## [11,] -0.01882782 -0.61550595
## [12,]  1.18214637  0.42971116
newB <- mle$par
# Check results against multinom()
library(nnet)   # multinom()
library(dplyr)  # %>% and mutate()
df_multi <- as.data.frame(X)
df_multi <- df_multi %>%
  mutate(state = df$state)
multi <- multinom(state ~ . + 0, data = df_multi)
## # weights: 13 (12 variable)
## initial value 1871.497388
## iter 10 value 1046.165993
## iter 20 value 959.515978
## iter 30 value 959.500799
## iter 40 value 959.496083
## final value 959.492405
## converged
summary(multi)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = state ~ . + 0, data = df_multi)
##
## Coefficients:
##           Values    Std. Err.
## V1    0.82692280   0.46134967
## V2    1.38830965   0.28393777
## V3    2.44936413   0.53726259
## V4   -0.05449003   0.22258103
## V5    0.00000000          NaN
## V6   -0.60819189   0.46243506
## V7    0.90917194   0.12254066
## V8    1.07163796   0.12050553
## V9   -1.63568950   0.48984393
## V10  -4.05116645   0.33270167
## V11 -12.27605335 391.89673502
## V12  -1.13151515   0.08579476
##
## Residual Deviance: 1918.985
## AIC: 1940.985
# Test handwritten multinomial predictions against multinom()
# Randomly generate fake observations
test_df <- NULL
n <- 1000
for(i in 1:n){
  pred <- sample(c(0, 1), 12, replace=T)
  test_df <- rbind(test_df, pred)
}
test_df <- as.data.frame(test_df)
# Compare two models in logical vector
results <- logical(n)
for(i in 1:n){
  test <- test_df[i,]
  # Predict the probability of observation being a target
  numerator <- exp(sum(newB[,1]*test))
  denom <- 0
  for(k in 1:ncol(newB)){
    denom <- denom + exp(sum(newB[,k]*test))
  }
  pred <- ifelse((numerator/denom) > .5, "target", "other")
  results[i] <- (predict(multi, type="class", newdata=test) == pred)
}
mean(results)
## [1] 0.535