Implementation of the LDA and QDA equations found in the book “An Introduction to Statistical Learning”.
library(MASS)
## Warning: package 'MASS' was built under R version 3.3.3
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.3
# prior probability of each class k: the proportion of training observations in class k (estimate of pi_k)
pi_lda <- function(y){
  pi_est <- table(y) / length(y)
  return(as.matrix(pi_est))
}
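# e.g. for iris, pi_lda(as.character(iris$Species)) gives 1/3 for each species, since there are 50 observations per class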
# Output the group means
# these are the averages of the predictor within each class, used by LDA as the estimates of mu_k
mu_lda <- function(X, y){
  data_est <- as.data.frame(cbind(X, y))
  # cbind() coerces X to character, so convert it back to numeric
  data_est$X <- as.numeric(as.character(data_est$X))
  mu <- aggregate(data = data_est, X ~ y, FUN = "mean")
  colnames(mu) <- c("y", "X")
  return(mu)
}
# pooled variance estimate, shared across the K classes
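# sigma_hat^2 = (1 / (n - K)) * sum over k of sum over {i: y_i = k} of (x_i - mu_hat_k)^2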
var_lda <- function(X, y, mu){
  n <- length(X)
  K <- length(unique(y))
  k <- unique(y)
  var_est <- 0
  for (i in 1:K){
    var_est <- sum((X[y == k[i]] - mu$X[k[i] == mu$y])^2) + var_est
  }
  var_est <- (1 / (n - K)) * var_est
  return(var_est)
}
# discriminant function for p = 1
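# delta_k(x) = x * mu_k / sigma^2 - mu_k^2 / (2 * sigma^2) + log(pi_k)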
discriminant_lda <- function(X, pi, mu, var){
  K <- nrow(mu)
  disc <- matrix(nrow = length(X), ncol = K)
  colnames(disc) <- mu$y
  for (i in 1:K){
    disc[, i] <- X * (mu$X[i] / var) - ((mu$X[i]^2) / (2 * var)) + log(pi[i])
  }
  disc <- as.data.frame(disc)
  disc$predict <- apply(disc, 1, FUN = "which.max")
  return(disc)
}
X <- iris[ ,1]
y <- as.character(iris[ ,5])
pi_est <- pi_lda(y)
mu_est <- mu_lda(X, y)
var_est <- var_lda(X, y, mu_est)
discriminant_est <- discriminant_lda(X, pi_est, mu_est, var_est)
table(discriminant_est$predict, iris$Species) # confusion matrix
##
## setosa versicolor virginica
## 1 45 6 1
## 2 5 30 12
## 3 0 14 37
# It gives us about 74.7% accuracy (112 of the 150 observations correctly classified)
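# quick check (a sketch, not in the original): overall accuracy of the manual LDA predictions;
# the predicted column indices map to the factor levels of iris$Species in the same order
mean(discriminant_est$predict == as.numeric(iris$Species))  # ~ 0.747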
# Compared with the lda function from the MASS package
lda_fit <- lda(data = iris, Species ~ Sepal.Length)
predict_lda_fit <- predict(lda_fit, iris[, c(1,5)])
table(predict_lda_fit$class, iris$Species)
##
## setosa versicolor virginica
## setosa 45 6 1
## versicolor 5 30 12
## virginica 0 14 37
# same result
# the posterior element of the prediction gives the probability that a given observation belongs to class k (calculated from Bayes' theorem)
# for each observation, these probabilities sum to 1
head(predict_lda_fit$posterior,5)
## setosa versicolor virginica
## 1 0.7766431 0.21124550 0.0121114410
## 2 0.8775438 0.11830923 0.0041469416
## 3 0.9361050 0.06255445 0.0013405050
## 4 0.9543491 0.04489860 0.0007523042
## 5 0.8332780 0.15956874 0.0071533030
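# quick check (a sketch, not in the original): each row of the posterior matrix should sum to 1
head(rowSums(predict_lda_fit$posterior), 5)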
# example of how to calculate it by hand for the first observation
# A, B and C are the class-conditional normal densities f_k(x); pi below is the mathematical constant, not the prior
A <- (1 / sqrt(2 * pi * var_est)) * exp((-1 / (2 * var_est)) * (X[1] - mu_est[1,2])^2)
B <- (1 / sqrt(2 * pi * var_est)) * exp((-1 / (2 * var_est)) * (X[1] - mu_est[2,2])^2)
C <- (1 / sqrt(2 * pi * var_est)) * exp((-1 / (2 * var_est)) * (X[1] - mu_est[3,2])^2)
D <- pi_est[1] * A + pi_est[2] * B + pi_est[3] * C
print((pi_est[1] * A) / D)
## [1] 0.7766431
print((pi_est[2] * B) / D)
## [1] 0.2112455
print((pi_est[3] * C) / D)
## [1] 0.01211144
predict_lda_fit$posterior[1,]
## setosa versicolor virginica
## 0.77664306 0.21124550 0.01211144
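The same Bayes calculation can be wrapped in a small helper that returns the full posterior matrix. This is a minimal sketch, not part of the original code, and posterior_lda is a hypothetical name; it assumes the univariate estimates (pi_est, mu_est, var_est) computed above.
posterior_lda <- function(X, pi, mu, var){
  # class-conditional normal densities f_k(x), one column per class
  dens <- sapply(seq_len(nrow(mu)), function(k) dnorm(X, mean = mu$X[k], sd = sqrt(var)))
  # Bayes' theorem: pi_k * f_k(x) / sum_l pi_l * f_l(x)
  num <- sweep(dens, 2, as.numeric(pi), "*")
  post <- num / rowSums(num)
  colnames(post) <- as.character(mu$y)
  return(post)
}
head(posterior_lda(X, pi_est, mu_est, var_est), 5)  # should match predict_lda_fit$posterior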
# Group means for multivariate X: one row per class, one column per predictor
mu_lda <- function(X, y){
  mu <- aggregate(X ~ y, FUN = "mean")
  row.names(mu) <- mu[ ,1]
  mu <- mu[ ,-1]
  mu <- as.matrix(mu)
  return(mu)
}
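# discriminant function for p > 1: delta_k(x) = x' Sigma^{-1} mu_k - (1/2) * mu_k' Sigma^{-1} mu_k + log(pi_k)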
multi_discriminant_lda <- function(X, y, pi, mu, var){
  # note: 'var' is expected to be the inverse of the common covariance matrix
  K <- length(unique(y))
  disc <- matrix(nrow = nrow(X), ncol = K)
  # the columns follow the row order of mu, so label them accordingly
  colnames(disc) <- row.names(mu)
  for (i in 1:K){
    for (j in 1:nrow(X)){
      disc[j, i] <- X[j, ] %*% var %*% mu[i, ] - (1/2) * t(mu[i, ]) %*% var %*% mu[i, ] + log(pi[i])
    }
  }
  disc <- as.data.frame(disc)
  disc$predict <- apply(disc, 1, FUN = "which.max")
  return(disc)
}
# # other possibility (same result but vector implementation)
# A <- X %*% solve(cov(X)) %*% mu_est[1,]
# B <- - ((1/2) * t(mu_est[1,]) %*% solve(cov(X)) %*% mu_est[1,]) + log(pi_est[1])
# A + as.numeric(B)
X <- as.matrix(iris[, 1:4])
y <- as.character(iris[ ,5])
# let's try with scaled X
X <- scale(X)
pi_est <- pi_lda(y)
mu_est <- mu_lda(X, y)
var_est <- solve(cov(X))
discriminant_est <- multi_discriminant_lda(X, y, pi_est, mu_est, var_est)
table(discriminant_est$predict, iris$Species)
##
## setosa versicolor virginica
## 1 49 0 0
## 2 1 42 11
## 3 0 8 39
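Note that var_est above is solve(cov(X)), i.e. the inverse of the overall covariance of the predictors, whereas the ISLR LDA formula uses the pooled within-class covariance. A minimal sketch of that estimate, with pooled_cov as a hypothetical helper name not in the original code:
pooled_cov <- function(X, y){
  k <- unique(y)
  # center each class at its own mean, then pool the cross-products
  centered <- lapply(k, function(cl) scale(X[y == cl, , drop = FALSE], scale = FALSE))
  S <- Reduce("+", lapply(centered, function(Z) t(Z) %*% Z))
  S / (nrow(X) - length(k))
}
# var_est <- solve(pooled_cov(X, y))  # drop-in alternative to solve(cov(X))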
We will now allow the observations in each of the K classes to have their own covariance matrix (QDA). The mu and pi functions stay the same as the LDA ones.
var_qda <- function(X, y){
  # inverse covariance matrix of each class k (QDA allows a class-specific covariance)
  K <- length(unique(y))
  k <- unique(y)
  var2 <- vector("list", K)
  for (i in 1:K){
    var2[[i]] <- solve(cov(X[y == k[i], ]))
  }
  var2 <- structure(var2, names = k)
  return(var2)
}
qda_fct <- function(X, y, pi, mu, var){
  m <- length(y)
  K <- length(unique(y))
  k <- unique(y)
  qda_result <- matrix(0, nrow = m, ncol = K + 1)
  colnames(qda_result) <- c(k, "Result")
  # note: the -(1/2) * log|Sigma_k| term of the full ISLR quadratic discriminant is omitted here
  for (j in 1:K){
    for (i in 1:m){
      qda_result[i, j] <- -(1/2) * X[i, ] %*% var[[j]] %*% X[i, ] + X[i, ] %*% var[[j]] %*% mu[j, ] - (1/2) * mu[j, ] %*% var[[j]] %*% mu[j, ] + log(pi[j])
    }
  }
  qda_result <- as.data.frame(qda_result)
  qda_result$Result <- apply(qda_result[, 1:K], 1, FUN = "which.max")
  return(qda_result)
}
pi_est <- pi_lda(y)
mu_est <- mu_lda(X,y)
var_est <- var_qda(X,y)
qda_training <- qda_fct(X, y, pi_est, mu_est, var_est)
table(qda_training$Result, iris$Species) #confusion matrix
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 47 0
## 3 0 3 50
# compare with the qda function from the MASS package
qda_fit <- qda(data = iris, Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width)
table(predict(qda_fit, iris[, 1:4])$class, iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 1
## virginica 0 2 49
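The small difference from MASS in the versicolor/virginica counts may be due, at least in part, to the omitted -(1/2) * log|Sigma_k| term in qda_fct. A minimal sketch of the full ISLR quadratic discriminant, with qda_full as a hypothetical name not in the original code:
qda_full <- function(X, y, pi, mu, var){
  m <- nrow(X)
  k <- unique(y)
  K <- length(k)
  res <- matrix(0, nrow = m, ncol = K, dimnames = list(NULL, k))
  for (j in 1:K){
    # var[[j]] is Sigma_k^{-1}, so -(1/2) * log|Sigma_k| equals +(1/2) * log|Sigma_k^{-1}|
    logdet <- 0.5 * as.numeric(determinant(var[[j]], logarithm = TRUE)$modulus)
    for (i in 1:m){
      res[i, j] <- -(1/2) * X[i, ] %*% var[[j]] %*% X[i, ] +
        X[i, ] %*% var[[j]] %*% mu[j, ] -
        (1/2) * mu[j, ] %*% var[[j]] %*% mu[j, ] +
        logdet + log(pi[j])
    }
  }
  res <- as.data.frame(res)
  res$Result <- apply(res[, 1:K], 1, FUN = "which.max")
  return(res)
}
# table(qda_full(X, y, pi_est, mu_est, var_est)$Result, iris$Species)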