Lecture 272 Intuition https://www.udemy.com/machinelearning/learn/lecture/10628136
Lecture 274 https://www.udemy.com/machinelearning/learn/lecture/6453304
knitr::include_graphics("LDAvPCA.png") # embed the image file LDAvPCA.png (path is relative to the document) in the rendered output
Python view of dataset
Both Linear Discriminant Analysis (LDA) and Principal Component Analysis (PCA) are
linear transformation techniques that are commonly used for dimensionality reduction. PCA can be described as an “unsupervised” algorithm, since it “ignores” class labels and its goal is to find the directions (the so-called principal components) that maximize the variance in a dataset. In contrast to PCA, LDA is “supervised” and computes the directions (“linear discriminants”) that will represent the axes that maximize the separation between multiple classes.
The article at https://sebastianraschka.com/Articles/2014_python_lda.html uses Python, but its explanation of LDA is good.
Check the working directory with getwd() before reading the data file.
# Read the wine data; the file name is resolved against the working directory.
dataset <- read.csv("Wine.csv")
# Embed the image file Datasetinformation.png in the rendered output.
knitr::include_graphics("Datasetinformation.png")
Python view of dataset
# install.packages('caTools')
library(caTools)

# 80/20 train/test split driven by the Customer_Segment column; the fixed seed
# makes the split reproducible.
set.seed(123)
split <- sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature scaling of every column except column 14 (presumably the
# Customer_Segment label -- confirm against Wine.csv).
# The centring and scaling parameters are estimated on the training set only
# and then reused for the test set. Standardising the test set with its own
# mean/sd would put the two sets on different scales and leak test-set
# statistics into the evaluation.
# Note: test-set results printed elsewhere in this document were produced with
# per-set scaling and may differ slightly after this change.
scaled_training <- scale(training_set[-14])
training_set[-14] <- scaled_training
test_set[-14] <- scale(
  test_set[-14],
  center = attr(scaled_training, "scaled:center"),
  scale = attr(scaled_training, "scaled:scale")
)
library(MASS)
# Fit LDA on the training data, using Customer_Segment as the grouping variable
# and every other column as a predictor.
lda <- lda(formula = Customer_Segment ~ ., data = training_set)
# predict() on the fitted model is flattened by as.data.frame() into the
# columns printed below: the predicted class, one posterior column per class,
# and the two discriminant scores (x.LD1, x.LD2).
training_set <- as.data.frame(predict(lda, newdata = training_set))
head(training_set)
## class posterior.1 posterior.2 posterior.3 x.LD1 x.LD2
## 1 1 1.0000000 1.402325e-09 5.656888e-17 -4.656187 2.081444
## 2 1 0.9999999 1.142655e-07 5.546095e-16 -4.336729 1.267238
## 3 1 0.9999936 6.408747e-06 1.321058e-12 -3.292202 1.167575
## 6 1 1.0000000 1.901419e-11 1.858141e-16 -4.515987 3.265418
## 7 1 1.0000000 8.480654e-12 8.206972e-17 -4.627289 3.369602
## 9 1 1.0000000 2.703595e-08 1.193997e-14 -3.936469 1.967177
Let's clean up the columns and put them in the right order.
# Keep only the two discriminant scores followed by the class column, i.e.
# columns 5, 6 and 1 of the flattened predict() output.
training_set <- training_set[c("x.LD1", "x.LD2", "class")]
# Project the test set with the same fitted LDA model and arrange its columns
# the same way.
test_set <- as.data.frame(predict(lda, newdata = test_set))
test_set <- test_set[c("x.LD1", "x.LD2", "class")]
head(training_set)
## x.LD1 x.LD2 class
## 1 -4.656187 2.081444 1
## 2 -4.336729 1.267238 1
## 3 -3.292202 1.167575 1
## 6 -4.515987 3.265418 1
## 7 -4.627289 3.369602 1
## 9 -3.936469 1.967177 1
# install.packages('e1071')
library(e1071)
# Linear-kernel SVM classifier trained on the two discriminant scores.
# The label column is called `class` at this point, not Customer_Segment.
classifier <- svm(
  formula = class ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "linear"
)
# Predict from the feature columns only; column 3 holds the label.
y_pred <- predict(classifier, newdata = test_set[-3])
# Cross-tabulate the label column of test_set (rows) against the SVM
# predictions (columns).
# NOTE(review): test_set$class comes from predict(lda, test_set), i.e. it is
# the class predicted by LDA rather than the original Customer_Segment, so
# this table compares SVM output with LDA output -- confirm this is intended.
cm <- table(test_set[["class"]], y_pred)
cm
## y_pred
## 1 2 3
## 1 12 0 0
## 2 1 13 0
## 3 0 0 10
# NOTE(review): no ElemStatLearn function or dataset is referenced in this
# block; confirm the package is still needed.
library(ElemStatLearn)
set <- training_set
# Axis grids spanning the observed range of each discriminant, padded by 1 on
# both sides, with a step of 0.01.
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# The grid columns must carry the feature names the classifier was trained on.
names(grid_set) <- c("x.LD1", "x.LD2")
y_grid <- predict(classifier, newdata = grid_set)
# Empty frame with the observations, sized to the padded grid.
plot(
  set[-3],
  main = "Linear Discriminant Analysis (LDA) (Training set)",
  xlab = "LD1", ylab = "LD2",
  xlim = range(X1), ylim = range(X2)
)
# Boundaries between the predicted regions.
contour(
  X1, X2,
  matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
  add = TRUE
)
# Shade every grid point by its predicted class (1 green, 2 blue, other red).
points(
  grid_set,
  pch = ".",
  col = ifelse(
    y_grid == 1, "springgreen3",
    ifelse(y_grid == 2, "deepskyblue", "tomato")
  )
)
# Overlay the observations, filled by the value of their class column.
points(
  set,
  pch = 21,
  bg = ifelse(
    set[, 3] == 1, "green4",
    ifelse(set[, 3] == 2, "blue3", "red3")
  )
)
# NOTE(review): no ElemStatLearn function or dataset is referenced in this
# block; confirm the package is still needed.
library(ElemStatLearn)
set <- test_set
# Axis grids spanning the observed range of each discriminant, padded by 1 on
# both sides, with a step of 0.01.
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# The grid columns must carry the feature names the classifier was trained on.
names(grid_set) <- c("x.LD1", "x.LD2")
y_grid <- predict(classifier, newdata = grid_set)
# Empty frame with the observations, sized to the padded grid.
plot(
  set[-3],
  main = "Linear Discriminant Analysis (LDA) (Test set)",
  xlab = "LD1", ylab = "LD2",
  xlim = range(X1), ylim = range(X2)
)
# Boundaries between the predicted regions.
contour(
  X1, X2,
  matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
  add = TRUE
)
# Shade every grid point by its predicted class (1 green, 2 blue, other red).
points(
  grid_set,
  pch = ".",
  col = ifelse(
    y_grid == 1, "springgreen3",
    ifelse(y_grid == 2, "deepskyblue", "tomato")
  )
)
# Overlay the observations, filled by the value of their class column.
points(
  set,
  pch = 21,
  bg = ifelse(
    set[, 3] == 1, "green4",
    ifelse(set[, 3] == 2, "blue3", "red3")
  )
)