Introduction

In this paper we analyse a list of 178 wines split into 3 customer segments with different taste preferences. The dataset describes each wine through 13 chemical characteristics, and we will build a model that predicts the customer segment to which a given wine belongs.

To do so we will use Principal Component Analysis (PCA) and Linear Discriminant Analysis (LDA). Both are linear transformation techniques, but PCA is unsupervised while LDA is supervised. We will use 80% of the data (about 142 wines) to train the model and the remaining 20% (36 wines) to test it.

The dataset was provided in the course "Machine Learning A-Z™: Hands-On Python & R In Data Science" and is available here: https://db.tt/yiP3SR7Mi1.

Dataset and libraries

The dataset is a list of 178 wines split into 3 customer segments.

library(caTools) # for sample.split (train/test split)
library(caret) # for the preProcess function
library(e1071) # for SVM
library(ElemStatLearn) # loaded in the original course for plotting; the base-graphics code below does not strictly need it
library(MASS) # to apply LDA
dataset = read.csv('Wine.csv')

Here is a preview of the different variables available to categorise the wines:

head(dataset,1)
##   Alcohol Malic_Acid  Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## 1   14.23       1.71 2.43         15.6       127           2.8       3.06
##   Nonflavanoid_Phenols Proanthocyanins Color_Intensity  Hue OD280 Proline
## 1                 0.28            2.29            5.64 1.04  3.92    1065
##   Customer_Segment
## 1                1
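Before modelling, it is worth checking how the 178 wines are spread across the three segments; a one-line count tells us whether the classes are roughly balanced:

table(dataset$Customer_Segment)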

Splitting the dataset into the Training set and Test set

set.seed(123) # set seed for reproducibility
split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
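As a quick optional sanity check, we can confirm the split sizes and that sample.split preserved the segment proportions in both sets:

nrow(training_set) # about 142 wines
nrow(test_set)     # 36 wines
prop.table(table(training_set$Customer_Segment))
prop.table(table(test_set$Customer_Segment))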

Feature Scaling

# Standardise all features; column 14 is the label and is left untouched
training_set[-14] = scale(training_set[-14])
test_set[-14] = scale(test_set[-14])
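Note that the code above scales the test set with its own mean and standard deviation. A stricter alternative, sketched below with caret's preProcess, learns the centring and scaling parameters on the training set only and reuses them on the test set, so unseen data are treated exactly like future observations would be:

# Alternative to the two scale() calls above: fit the scaler on the
# training features only, then apply it unchanged to the test features
scaler = preProcess(training_set[-14], method = c('center', 'scale'))
training_set[-14] = predict(scaler, training_set[-14])
test_set[-14] = predict(scaler, test_set[-14])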

Applying PCA

pca = preProcess(x = training_set[-14], method = 'pca', pcaComp = 2)
training_set_pca = predict(pca, training_set)
training_set_pca = training_set_pca[c(2, 3, 1)] # reorder to PC1, PC2, Customer_Segment
test_set_pca = predict(pca, test_set)
test_set_pca = test_set_pca[c(2, 3, 1)]
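# pcaComp = 2 keeps only the first two components, which is convenient for
# 2-D plots but discards some variance. To check how much those two
# components actually capture, one can run a plain PCA on the scaled
# training features (an optional sketch using base R's prcomp):
pca_check = prcomp(training_set[-14])
summary(pca_check)$importance[2:3, 1:2] # variance share and cumulative share for PC1, PC2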
# Fitting SVM to the Training set
classifier_pca = svm(formula = Customer_Segment ~ .,
                 data = training_set_pca,
                 type = 'C-classification',
                 kernel = 'linear')
# Predicting the Test set results
y_pred_pca = predict(classifier_pca, newdata = test_set_pca[-3])
# Create the Confusion Matrix
cm_pca = table(test_set_pca[, 3], y_pred_pca)
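Printing the matrix itself is more informative than a single summary score; rows are the true segments and columns the predicted ones:

cm_pca # rows: actual segment, columns: predicted segment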

Applying LDA

lda_model = lda(formula = Customer_Segment ~ ., data = training_set) # lda() comes from MASS
# predict() returns a list; coercing it to a data frame yields the columns
# class, posterior.1..3, x.LD1 and x.LD2
training_set_lda = as.data.frame(predict(lda_model, training_set))
training_set_lda = training_set_lda[c(5, 6, 1)] # keep x.LD1, x.LD2 and class
test_set_lda = as.data.frame(predict(lda_model, test_set))
test_set_lda = test_set_lda[c(5, 6, 1)]
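# With three classes, LDA yields at most two linear discriminants, which is
# why the projected data are two-dimensional. The singular values stored in
# the fitted model give the "proportion of trace", i.e. the share of
# between-class variance captured by each discriminant (optional check):
lda_model$svd^2 / sum(lda_model$svd^2)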
# Fitting SVM to the Training set
classifier_lda = svm(formula = class ~ ., # the label column is now called "class" after the data-frame conversion
                 data = training_set_lda,
                 type = 'C-classification',
                 kernel = 'linear')
# Predicting the Test set results
y_pred_lda = predict(classifier_lda, newdata = test_set_lda[-3])
# Making the Confusion Matrix
cm_lda = table(test_set_lda[, 3], y_pred_lda)
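For more than a raw table, caret's confusionMatrix reports the overall accuracy together with per-class statistics; an optional closer look:

# Both arguments must be factors with the same levels
confusionMatrix(y_pred_lda, test_set_lda[, 3])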

Plot PCA results

# Visualising the Training set results
par(mfrow=c(1,2))
set = training_set_pca
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier_pca, newdata = grid_set)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))
# Visualising the Test set results
set = test_set_pca
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier_pca, newdata = grid_set)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))

Plot LDA results

# Visualising the Training set results
par(mfrow=c(1,2))
set = training_set_lda
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('x.LD1', 'x.LD2')
y_grid = predict(classifier_lda, newdata = grid_set)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'LD1', ylab = 'LD2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))
# Visualising the Test set results
set = test_set_lda
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('x.LD1', 'x.LD2')
y_grid = predict(classifier_lda, newdata = grid_set)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'LD1', ylab = 'LD2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))
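The four plotting blocks above differ only in the data set, the classifier and the axis labels. If the maps need to be redrawn often, a small helper function (a sketch; the name and arguments are my own) keeps the logic in one place:

# Draw an SVM decision map for a 2-D projected set. `set` must hold the two
# features in columns 1-2 and the label in column 3; `grid_names` are the
# column names the classifier expects for the prediction grid.
plot_decision_map = function(set, classifier, grid_names, main, xlab, ylab) {
  X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set = expand.grid(X1, X2)
  colnames(grid_set) = grid_names
  y_grid = predict(classifier, newdata = grid_set)
  plot(set[, -3], main = main, xlab = xlab, ylab = ylab,
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
  points(grid_set, pch = '.',
         col = ifelse(y_grid == 2, 'deepskyblue',
                      ifelse(y_grid == 1, 'springgreen3', 'tomato')))
  points(set, pch = 21,
         bg = ifelse(set[, 3] == 2, 'blue3',
                     ifelse(set[, 3] == 1, 'green4', 'red3')))
}
# Example: redraw the LDA test-set map
plot_decision_map(test_set_lda, classifier_lda, c('x.LD1', 'x.LD2'),
                  'SVM (Test set)', 'LD1', 'LD2')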

Conclusion

pca_success = sum(diag(cm_pca)) / sum(cm_pca) * 100
lda_success = sum(diag(cm_lda)) / sum(cm_lda) * 100

The success rate of the PCA pipeline is 100% and that of the LDA pipeline is 97.22%. While PCA performed better here, both results are satisfactory: on a 36-wine test set, 97.22% corresponds to a single misclassified wine.