In this paper we analyse a list of 178 wines split into 3 customer segments, each with a different taste in wine. The dataset describes each wine through a number of characteristics, and we will build a model that determines into which customer segment a given wine should be categorised.
To do so we will use Principal Component Analysis (PCA) and Linear Discriminant Analysis (LDA). Both are linear transformation techniques; PCA is unsupervised, whereas LDA is supervised. We will use 80% of the data to train the model and 20% to test it.
The dataset was provided in the course "Machine Learning A-Z™: Hands-On Python & R In Data Science" and is available here: https://db.tt/yiP3SR7Mi1.
The dataset is a list of 178 wines split into 3 customer segments.
# Packages used by this analysis. The load order is kept as-is because it
# determines which package masks which when names collide.
library(caTools)        # sample.split() for the train/test split
library(caret)          # preProcess() for the PCA projection
library(e1071)          # svm() classifier
library(ElemStatLearn)  # NOTE(review): no function from this package is called in the visible code -- confirm it is still needed
library(MASS)           # lda() for Linear Discriminant Analysis

# Load the wine data; the path is resolved against the working directory.
dataset <- read.csv("Wine.csv")
Here is a description of the different variables available to categorise the wines.
# Print the first observation so that every column name is visible.
head(dataset, n = 1)
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## 1 14.23 1.71 2.43 15.6 127 2.8 3.06
## Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline
## 1 0.28 2.29 5.64 1.04 3.92 1065
## Customer_Segment
## 1 1
# Train/test split ----
# Fix the RNG seed so the split is reproducible, then draw an 80/20 split on
# the class label (column 14, Customer_Segment).
set.seed(123)
split <- sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature scaling ----
# Standardise the 13 feature columns (everything except column 14).
# The centring/scaling statistics are estimated on the training set ONLY and
# then re-used for the test set. Scaling the test set with its own mean/sd
# would put the two sets on different scales and leak test-set statistics into
# the preprocessing.
# NOTE(review): any accuracy figures quoted in the accompanying text were
# produced with the previous (independent) scaling and should be regenerated.
scaled_training <- scale(training_set[-14])
training_set[-14] <- scaled_training
test_set[-14] <- scale(
  test_set[-14],
  center = attr(scaled_training, "scaled:center"),
  scale = attr(scaled_training, "scaled:scale")
)
# PCA + SVM ----
# Learn a two-component PCA projection from the training features only
# (column 14, the label, is excluded).
pca <- preProcess(x = training_set[-14], method = "pca", pcaComp = 2)

# Apply the projection to both sets. The result holds the label column plus
# PC1 and PC2; select them so the two components come first and the label is
# the third column (the layout the code below and the plots rely on).
training_set_pca <- predict(pca, training_set)[
  c("PC1", "PC2", "Customer_Segment")
]
test_set_pca <- predict(pca, test_set)[
  c("PC1", "PC2", "Customer_Segment")
]

# Linear-kernel SVM classifier trained on the two principal components.
classifier_pca <- svm(
  formula = Customer_Segment ~ .,
  data = training_set_pca,
  type = "C-classification",
  kernel = "linear"
)

# Predict the segment of every test wine from its two components.
y_pred_pca <- predict(classifier_pca, newdata = test_set_pca[c("PC1", "PC2")])

# Confusion matrix: rows = actual segment, columns = predicted segment.
cm_pca <- table(test_set_pca[["Customer_Segment"]], y_pred_pca)
# LDA + SVM ----
# Fit the discriminant model on the training set.
# NOTE: the fitted model is stored in a variable that is also called `lda`
# (name kept for compatibility). Later lda(...) calls still reach MASS::lda
# because R skips non-function objects when looking up a function.
lda <- lda(formula = Customer_Segment ~ ., data = training_set)

# as.data.frame(predict(lda, ...)) yields the columns
#   class, posterior.*, x.LD1, x.LD2
# Keep the two discriminant scores first and `class` third.
# `class` as returned by predict() is LDA's own PREDICTED label, not the true
# segment. Training and scoring the SVM against it would only measure how well
# the SVM agrees with LDA, so it is overwritten with the true Customer_Segment
# (same column name, position and factor levels, so downstream code still
# works).
# NOTE(review): accuracy figures quoted in the accompanying text were computed
# against the LDA-predicted labels and should be regenerated.
training_set_lda <- as.data.frame(predict(lda, training_set))[
  c("x.LD1", "x.LD2", "class")
]
training_set_lda$class <- factor(
  training_set$Customer_Segment,
  levels = lda$lev
)

test_set_lda <- as.data.frame(predict(lda, test_set))[
  c("x.LD1", "x.LD2", "class")
]
test_set_lda$class <- factor(test_set$Customer_Segment, levels = lda$lev)

# Linear-kernel SVM classifier; the response column is named `class` because
# of the data-frame conversion above.
classifier_lda <- svm(
  formula = class ~ .,
  data = training_set_lda,
  type = "C-classification",
  kernel = "linear"
)

# Predict the segment of every test wine from its two discriminant scores.
y_pred_lda <- predict(classifier_lda, newdata = test_set_lda[-3])

# Confusion matrix: rows = actual segment, columns = predicted segment.
cm_lda <- table(test_set_lda[, 3], y_pred_lda)
# Visualisation ----

# Draw the decision regions of a fitted two-feature classifier together with
# the observations of `set`.
#
# Arguments:
#   set            data frame; columns 1 and 2 are the two features, column 3
#                  is the class label (compared against 1 and 2 for colouring,
#                  anything else is drawn in red).
#   classifier     fitted model accepted by predict(); called with a data
#                  frame whose columns are named `feature_names`.
#   feature_names  length-2 character vector: column names the classifier
#                  expects for the background grid.
#   main           plot title.
#   axis_labels    length-2 character vector of x/y axis labels
#                  (defaults to `feature_names`).
#   step           spacing of the background grid in feature units.
#
# Returns the grid predictions invisibly; called for its plotting side effect.
plot_svm_regions <- function(set, classifier, feature_names, main,
                             axis_labels = feature_names, step = 0.01) {
  # Background grid covering the data range plus a margin of 1 on every side.
  x1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = step)
  x2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = step)
  grid_set <- expand.grid(x1, x2)
  colnames(grid_set) <- feature_names

  # Predicted class for every grid point.
  y_grid <- predict(classifier, newdata = grid_set)

  # Empty-ish scatter first to set up the axes, then the class boundaries.
  plot(set[, -3],
    main = main,
    xlab = axis_labels[1], ylab = axis_labels[2],
    xlim = range(x1), ylim = range(x2)
  )
  contour(x1, x2,
    matrix(as.numeric(y_grid), length(x1), length(x2)),
    add = TRUE
  )

  # Shade each grid point by predicted class: 2 = blue, 1 = green, else red.
  points(grid_set,
    pch = ".",
    col = ifelse(y_grid == 2, "deepskyblue",
      ifelse(y_grid == 1, "springgreen3", "tomato")
    )
  )

  # Overlay the observations, filled by the label in column 3.
  points(set,
    pch = 21,
    bg = ifelse(set[, 3] == 2, "blue3",
      ifelse(set[, 3] == 1, "green4", "red3")
    )
  )

  invisible(y_grid)
}

# PCA features: training set (left panel) and test set (right panel).
# par() returns the previous settings, which are restored afterwards so the
# two-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
plot_svm_regions(training_set_pca, classifier_pca,
  feature_names = c("PC1", "PC2"),
  main = "SVM (Training set)"
)
plot_svm_regions(test_set_pca, classifier_pca,
  feature_names = c("PC1", "PC2"),
  main = "SVM (Test set)"
)
par(old_par)

# LDA features: training set (left panel) and test set (right panel).
# The classifier expects the columns x.LD1 / x.LD2; the axes are labelled
# LD1 / LD2.
old_par <- par(mfrow = c(1, 2))
plot_svm_regions(training_set_lda, classifier_lda,
  feature_names = c("x.LD1", "x.LD2"),
  main = "SVM (Training set)",
  axis_labels = c("LD1", "LD2")
)
plot_svm_regions(test_set_lda, classifier_lda,
  feature_names = c("x.LD1", "x.LD2"),
  main = "SVM (Test set)",
  axis_labels = c("LD1", "LD2")
)
par(old_par)
# Success rate in percent: the diagonal of each confusion matrix (cases where
# the row and column class coincide) divided by the total number of test
# wines. This assumes rows and columns list the classes in the same order.
# NOTE: the spelling `ldasucess` (sic) is kept so existing references to the
# variable keep working.
pcasuccess <- sum(diag(cm_pca)) / sum(cm_pca) * 100
ldasucess <- sum(diag(cm_lda)) / sum(cm_lda) * 100
The success rate of the PCA is 100% and the success rate of the LDA is 97.22%. While the PCA has shown better results, both results are satisfactory.