Section 34: Principal Component Analysis
df <- read.csv("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 9 - Dimensionality Reduction\\Section 43 - Principal Component Analysis (PCA)\\PCA\\Wine.csv")
head(df)
Split dataset into training and test set (300 training, 100 test)
library(caTools)
set.seed(1234)
split <- sample.split(df$Customer_Segment, SplitRatio = 0.80)
training_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
For PCA ,it is better to do feature scaling (normalization)
# Feature Scaling 1 age, 2 is salary
training_set[,1:13] <- scale(training_set[,1:13])
test_set[,1:13] <- scale(test_set[,1:13])
Applying PCA to the data
# install.packages("caret")
library(caret)
library(e1071)
pca = preProcess(x = training_set[-14], method = "pca", pcaComp = 2)
training_set <- predict(pca, training_set)
# put customer segment in to the last postion or column
training_set <- training_set[c(2,3,1)]
# do the same for the test dataset
test_set <- predict(pca, test_set)
test_set <- test_set[c(2,3,1)]
Building the classification model
classifier <- svm(formula = Customer_Segment ~ .,
data = training_set,
type = "C-classification",
kernel = "linear")
summary(classifier)
Call:
svm(formula = Customer_Segment ~ ., data = training_set, type = "C-classification", kernel = "linear")
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 1
gamma: 0.5
Number of Support Vectors: 25
( 9 11 5 )
Number of Classes: 3
Levels:
1 2 3
Predicting the test set results
y_pred <- predict(classifier, newdata = test_set[-3]
)
y_pred
5 14 16 26 28 29 36 39 40 50 53 58 60 61 72 81 86 90 92 113 116 117 120 122 123 124 131
1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 3
135 137 140 142 149 154 156 158 169
2 3 3 3 3 3 3 3 3
Levels: 1 2 3
Evaluate the prediction using confusion Matrix.
# Making the confusion matrix
# [3] refers to the outcome
cm <- table(test_set[,3], y_pred)
cm
y_pred
1 2 3
1 12 0 0
2 1 13 0
3 0 1 9
Plot
# install.packages("ElemStatLearn")
library(ElemStatLearn)
set <- training_set
X1 <- seq(min(set[,1]) - 1, max(set[,1]) + 1, by = 0.01)
X2 <- seq(min(set[,2]) - 1, max(set[,2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) = c('PC1','PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[,-3],
main = 'SVM (Training Set)',
xlab = 'PC1', ylab = 'PC2',
xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[,3] == 2 , "blue3", ifelse(set[,3] == 1,'green4', 'red3')))

Plot description
The red region is predicted by the classifier as “customer segment 3” The green region is predicted by the classifier as “customer segment 2” The blue region is predicted by the svm classifier as “customer segment 1”
Now we see the result of the test set
set <- test_set
X1 <- seq(min(set[,1]) - 1, max(set[,1]) + 1, by = 0.01)
X2 <- seq(min(set[,2]) - 1, max(set[,2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) = c('PC1','PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[,-3],
main = 'Logistic Regression(Test Set)',
xlab = 'PC1', ylab = 'PC2',
xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg= ifelse(set[,3] == 2 , "blue3", ifelse(set[,3] == 1,'green4', 'red3')))

LS0tDQp0aXRsZTogIk1MIFVzaW5nIFIgU2VjdGlvbjM0IFBDQSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMgU2VjdGlvbiAzNDogUHJpbmNpcGFsIENvbXBvbmVudCBBbmFseXNpcw0KDQpgYGB7cn0NCmRmIDwtICByZWFkLmNzdigiRzpcXFJTdHVkaW9cXHVkZW15XFxtbFxcTWFjaGluZSBMZWFybmluZyBBWlxcUGFydCA5IC0gRGltZW5zaW9uYWxpdHkgUmVkdWN0aW9uXFxTZWN0aW9uIDQzIC0gUHJpbmNpcGFsIENvbXBvbmVudCBBbmFseXNpcyAoUENBKVxcUENBXFxXaW5lLmNzdiIpDQpoZWFkKGRmKQ0KYGBgDQoNCg0KIyBTcGxpdCBkYXRhc2V0IGludG8gdHJhaW5pbmcgYW5kIHRlc3Qgc2V0ICgzMDAgdHJhaW5pbmcsIDEwMCB0ZXN0KQ0KYGBge3J9DQpsaWJyYXJ5KGNhVG9vbHMpDQpzZXQuc2VlZCgxMjM0KQ0Kc3BsaXQgPC0gc2FtcGxlLnNwbGl0KGRmJEN1c3RvbWVyX1NlZ21lbnQsIFNwbGl0UmF0aW8gPSAwLjgwKQ0KdHJhaW5pbmdfc2V0IDwtIHN1YnNldChkZiwgc3BsaXQgPT0gVFJVRSkNCnRlc3Rfc2V0IDwtIHN1YnNldChkZiwgc3BsaXQgPT0gRkFMU0UpDQoNCmBgYA0KDQojIEZvciBQQ0EgLGl0IGlzIGJldHRlciB0byBkbyBmZWF0dXJlIHNjYWxpbmcgKG5vcm1hbGl6YXRpb24pDQoNCmBgYHtyfQ0KIyBGZWF0dXJlIFNjYWxpbmcgMSBhZ2UsIDIgaXMgc2FsYXJ5DQp0cmFpbmluZ19zZXRbLDE6MTNdIDwtICBzY2FsZSh0cmFpbmluZ19zZXRbLDE6MTNdKQ0KdGVzdF9zZXRbLDE6MTNdIDwtICBzY2FsZSh0ZXN0X3NldFssMToxM10pDQpgYGANCg0KIyBBcHBseWluZyBQQ0EgdG8gdGhlIGRhdGEgDQoNCmBgYHtyfQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJjYXJldCIpDQpsaWJyYXJ5KGNhcmV0KQ0KbGlicmFyeShlMTA3MSkNCnBjYSA9IHByZVByb2Nlc3MoeCA9IHRyYWluaW5nX3NldFstMTRdLCBtZXRob2QgPSAicGNhIiwgcGNhQ29tcCA9IDIpDQp0cmFpbmluZ19zZXQgPC0gcHJlZGljdChwY2EsIHRyYWluaW5nX3NldCkNCg0KIyBwdXQgY3VzdG9tZXIgc2VnbWVudCBpbiB0byB0aGUgbGFzdCBwb3N0aW9uIG9yIGNvbHVtbg0KdHJhaW5pbmdfc2V0IDwtIHRyYWluaW5nX3NldFtjKDIsMywxKV0NCg0KDQpgYGANCg0KYGBge3J9DQojIGRvIHRoZSBzYW1lIGZvciB0aGUgdGVzdCBkYXRhc2V0DQp0ZXN0X3NldCA8LSBwcmVkaWN0KHBjYSwgdGVzdF9zZXQpDQp0ZXN0X3NldCA8LSB0ZXN0X3NldFtjKDIsMywxKV0NCg0KYGBgDQoNCiMgQnVpbGRpbmcgdGhlIGNsYXNzaWZpY2F0aW9uIG1vZGVsIA0KDQpgYGB7cn0NCmNsYXNzaWZpZXIgPC0gc3ZtKGZvcm11bGEgPSBDdXN0b21lcl9TZWdtZW50IH4gLiwNCiAgICAgICAgICAgICAgICAgIGRhdGEgPSB0cmFpbmluZ19zZXQsDQogICAgICAgICAgICAgICAgICB0eXBlID0gIkMtY2xhc3NpZmljYXRpb24iLA0KICAgICAgICAgICAgICAgICAga2VybmVsID0gImxpbmVhciIpDQoNCnN1bW1hcnkoY2xhc3NpZmllcikNCg0KYGBgDQoNCiMgUHJlZGljdGluZyB0aGUgdGVzdCBzZXQgcmVzdWx0cw0KDQpgYGB7cn0NCnlfcHJlZCA8LSAgcHJlZGljdChjbGFzc2lmaWVyLCBuZXdkYXRhID0gdGVzdF9zZXRbLTNdDQogICAgICAgICAgICAgICAgICAgICAgKQ0KeV9wcmVkDQoNCmBgYA0KDQojIEV2YWx1YXRlIHRoZSBwcmVkaWN0aW9uIHVzaW5nIGNvbmZ1c2lvbiBNYXRyaXguDQoNCmBgYHtyfQ0KIyBNYWtpbmcgdGhlIGNvbmZ1c2lvbiBtYXRyaXgNCiMgWzNdIHJlZmVycyB0byB0aGUgb3V0Y29tZQ0KDQpjbSA8LSB0YWJsZSh0ZXN0X3NldFssM10sIHlfcHJlZCkNCmNtDQpgYGANCg0KIyBQbG90DQoNCmBgYHtyfQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJFbGVtU3RhdExlYXJuIikNCmxpYnJhcnkoRWxlbVN0YXRMZWFybikNCnNldCA8LSB0cmFpbmluZ19zZXQNClgxIDwtIHNlcShtaW4oc2V0WywxXSkgLSAxLCBtYXgoc2V0WywxXSkgKyAxLCBieSA9IDAuMDEpDQpYMiA8LSBzZXEobWluKHNldFssMl0pIC0gMSwgbWF4KHNldFssMl0pICsgMSwgYnkgPSAwLjAxKQ0KZ3JpZF9zZXQgPC0gZXhwYW5kLmdyaWQoWDEsIFgyKQ0KY29sbmFtZXMoZ3JpZF9zZXQpID0gYygnUEMxJywnUEMyJykNCnlfZ3JpZCA9IHByZWRpY3QoY2xhc3NpZmllciwgbmV3ZGF0YSA9IGdyaWRfc2V0KQ0KcGxvdChzZXRbLC0zXSwNCiAgICAgbWFpbiA9ICdTVk0gKFRyYWluaW5nIFNldCknLA0KICAgICB4bGFiID0gICdQQzEnLCB5bGFiID0gJ1BDMicsDQogICAgIHhsaW0gPSByYW5nZShYMSksIHlsaW0gPSByYW5nZShYMikpDQpjb250b3VyKFgxLCBYMiwgbWF0cml4KGFzLm51bWVyaWMoeV9ncmlkKSwgbGVuZ3RoKFgxKSwgbGVuZ3RoKFgyKSksIGFkZCA9IFRSVUUpDQpwb2ludHMoZ3JpZF9zZXQsIHBjaCA9ICcuJywgY29sID0gaWZlbHNlKHlfZ3JpZCA9PSAyLCAnZGVlcHNreWJsdWUnLCBpZmVsc2UoeV9ncmlkID09IDEsICdzcHJpbmdncmVlbjMnLCAndG9tYXRvJykpKQ0KcG9pbnRzKHNldCwgcGNoID0gMjEsIGJnID0gaWZlbHNlKHNldFssM10gPT0gMiAsICJibHVlMyIsIGlmZWxzZShzZXRbLDNdID09IDEsJ2dyZWVuNCcsICdyZWQzJykpKQ0KDQpgYGANCg0KIyBQbG90IGRlc2NyaXB0aW9uDQpUaGUgcmVkIHJlZ2lvbiBpcyBwcmVkaWN0ZWQgYnkgdGhlIGNsYXNzaWZpZXIgYXMgImN1c3RvbWVyIHNlZ21lbnQgMyIgPC9icj4NClRoZSBncmVlbiByZWdpb24gaXMgcHJlZGljdGVkIGJ5IHRoZSBjbGFzc2lmaWVyIGFzICJjdXN0b21lciBzZWdtZW50IDIiIDwvYnI+DQpUaGUgYmx1ZSByZWdpb24gaXMgcHJlZGljdGVkIGJ5IHRoZSBzdm0gY2xhc3NpZmllciBhcyAiY3VzdG9tZXIgc2VnbWVudCAxIiA8L3A+DQoNCg0KIyBOb3cgd2Ugc2VlIHRoZSByZXN1bHQgb2YgdGhlIHRlc3Qgc2V0DQoNCmBgYHtyfQ0Kc2V0IDwtIHRlc3Rfc2V0DQpYMSA8LSBzZXEobWluKHNldFssMV0pIC0gMSwgbWF4KHNldFssMV0pICsgMSwgYnkgPSAwLjAxKQ0KWDIgPC0gc2VxKG1pbihzZXRbLDJdKSAtIDEsIG1heChzZXRbLDJdKSArIDEsIGJ5ID0gMC4wMSkNCmdyaWRfc2V0IDwtIGV4cGFuZC5ncmlkKFgxLCBYMikNCmNvbG5hbWVzKGdyaWRfc2V0KSA9IGMoJ1BDMScsJ1BDMicpDQp5X2dyaWQgPSBwcmVkaWN0KGNsYXNzaWZpZXIsICBuZXdkYXRhID0gZ3JpZF9zZXQpDQpwbG90KHNldFssLTNdLA0KICAgICBtYWluID0gJ0xvZ2lzdGljIFJlZ3Jlc3Npb24oVGVzdCBTZXQpJywNCiAgICAgeGxhYiA9ICAnUEMxJywgeWxhYiA9ICdQQzInLA0KICAgICB4bGltID0gcmFuZ2UoWDEpLCB5bGltID0gcmFuZ2UoWDIpKQ0KY29udG91cihYMSwgWDIsIG1hdHJpeChhcy5udW1lcmljKHlfZ3JpZCksIGxlbmd0aChYMSksIGxlbmd0aChYMikpLCBhZGQgPSBUUlVFKQ0KcG9pbnRzKGdyaWRfc2V0LCBwY2ggPSAnLicsIGNvbCA9IGlmZWxzZSh5X2dyaWQgPT0gMiwgJ2RlZXBza3libHVlJywgaWZlbHNlKHlfZ3JpZCA9PSAxLCAnc3ByaW5nZ3JlZW4zJywgJ3RvbWF0bycpKSkNCnBvaW50cyhzZXQsIHBjaCA9IDIxLCBiZz0gaWZlbHNlKHNldFssM10gPT0gMiAsICJibHVlMyIsIGlmZWxzZShzZXRbLDNdID09IDEsJ2dyZWVuNCcsICdyZWQzJykpKQ0KDQpgYGANCg==