Lecture 264 PCA Intuition
https://www.udemy.com/machinelearning/learn/lecture/10628128

Great presentation and tutorial
https://plot.ly/ipython-notebooks/principal-component-analysis/

Another explanation of topic
http://setosa.io/ev/principal-component-analysis/
home page of this stuff http://setosa.io/ev/

PCA Wikipedia https://en.wikipedia.org/wiki/Principal_component_analysis

PCA in a nutshell

What we are doing is taking a large number of independent variables and reducing them to a small core group of new independent variables (the principal components) that explain the most variance in the dataset. Because this extraction is done without knowledge of the dependent variable, PCA is considered an unsupervised dimensionality reduction technique.

Both PCA and LDA are applicable to data that is linearly separable.

check working directory getwd()

Importing the dataset

# Load the wine data from the working directory, then embed the image that
# describes the dataset in the rendered document.
dataset <- read.csv(file = "Wine.csv")
knitr::include_graphics(path = "Datasetinformation.png")
Python view of dataset

Python view of dataset

Splitting the dataset into the Training set and Test set

# install.packages('caTools')
library(caTools)

# Fix the RNG seed so the split is reproducible.
set.seed(123)

# sample.split() is driven by the Customer_Segment column; rows flagged TRUE
# (SplitRatio = 0.8) go to the training set, the rest to the test set.
split <- sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]

Feature Scaling

We’ll scale all the Independent variables, not the customer segment (Dependent variable).

# Standardise the independent variables (every column except column 14, the
# dependent variable). The centring and scaling constants are estimated on the
# training set only and then reused for the test set, so both sets are
# expressed on the same scale and no test-set statistics leak into the
# preprocessing.
# NOTE(review): the printed results later in this document were produced with
# the test set scaled by its own mean/sd, so re-running may change them.
scaled_training <- scale(training_set[-14])
training_set[-14] <- scaled_training
test_set[-14] <- scale(test_set[-14],
                       center = attr(scaled_training, "scaled:center"),
                       scale = attr(scaled_training, "scaled:scale"))

Applying PCA

`thresh` is a cutoff for the cumulative percent of variance to be retained by PCA. We won't use it here, but if we wanted our extracted features to explain a particular share of the variance, this is the parameter we'd set. `pcaComp` is the specific number of PCA components to keep. If specified, it overrides `thresh`; we'll go with 2.

# install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# install.packages('e1071')
library(e1071)

# PCA is unsupervised, so the transform is fitted on the predictors only
# (column 14, the dependent variable, is left out) and keeps two components.
pca <- preProcess(training_set[-14], method = "pca", pcaComp = 2)

# Apply the fitted transform to the training set, then arrange the columns
# as PC1, PC2, Customer_Segment.
training_set <- predict(pca, training_set)
training_set <- training_set[c("PC1", "PC2", "Customer_Segment")]

# Same transform and the same column order for the test set.
test_set <- predict(pca, test_set)
test_set <- test_set[c("PC1", "PC2", "Customer_Segment")]

PC1 and PC2

Principal Components 1 and 2 are our new variables.

# Show the first six rows of the transformed training set.
head(training_set, n = 6)
##         PC1        PC2 Customer_Segment
## 1 -3.249569  1.5661160                1
## 2 -2.165889 -0.3186768                1
## 3 -2.501192  1.2353892                1
## 6 -2.941040  2.2999654                1
## 7 -2.393131  1.3228050                1
## 9 -2.418465  1.0367916                1

Fitting SVM to the Training set

# install.packages('e1071')
library(e1071)

# Linear-kernel SVM classifier: Customer_Segment is modelled from every other
# column of the training set.
classifier <- svm(
  Customer_Segment ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "linear"
)

Predicting the Test set results

# Predict the class of each test observation from its two predictor columns
# (the third column, the label, is dropped), then display the predictions.
y_pred <- predict(classifier, newdata = test_set[, -3])
print(y_pred)
##   4   5   8  11  16  20  21  24  31  32  50  59  65  67  68  69  87  88 
##   1   1   1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2 
##  89 104 106 107 111 114 118 126 132 134 137 138 139 145 151 167 173 174 
##   2   2   2   2   2   2   2   2   3   3   3   3   3   3   3   3   3   3 
## Levels: 1 2 3

Making the Confusion Matrix

Perfect results

# Cross-tabulate the labels in the third column of the test set (rows)
# against the predicted classes (columns).
cm <- table(test_set[[3]], y_pred)
print(cm)
##    y_pred
##      1  2  3
##   1 12  0  0
##   2  0 14  0
##   3  0  0 10

Predicting the Training set results

# Predict the class of each training observation from its two predictor
# columns (the third column, the label, is dropped), then display them.
y_predTR <- predict(classifier, newdata = training_set[, -3])
print(y_predTR)
##   1   2   3   6   7   9  10  12  13  14  15  17  18  19  22  23  25  26 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  27  28  29  30  33  34  35  36  37  38  39  40  41  42  43  44  45  46 
##   1   1   1   1   1   1   1   1   1   1   2   1   1   1   1   1   1   1 
##  47  48  49  51  52  53  54  55  56  57  58  60  61  62  63  64  66  70 
##   1   1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2 
##  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  90  91 
##   2   2   2   1   2   2   2   2   2   2   2   2   2   3   2   2   2   2 
##  92  93  94  95  96  97  98  99 100 101 102 103 105 108 109 110 112 113 
##   2   2   2   2   1   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 115 116 117 119 120 121 122 123 124 125 127 128 129 130 131 133 135 136 
##   2   2   2   2   2   2   1   2   2   2   2   2   2   2   2   3   3   3 
## 140 141 142 143 144 146 147 148 149 150 152 153 154 155 156 157 158 159 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 160 161 162 163 164 165 166 168 169 170 171 172 175 176 177 178 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## Levels: 1 2 3

Making the Confusion Matrix - Training set

Not bad, just a few mistakes (6 of the 142 training observations are misclassified)

# Cross-tabulate the labels in the third column of the training set (rows)
# against the predicted classes (columns).
cmTR <- table(training_set[[3]], y_predTR)
print(cmTR)
##    y_predTR
##      1  2  3
##   1 46  1  0
##   2  3 53  1
##   3  0  1 37

Visualising the Training set results

# ElemStatLearn has been archived on CRAN and nothing in this chunk calls into
# it, so attach it only when it is installed instead of aborting the chunk.
if (requireNamespace("ElemStatLearn", quietly = TRUE)) {
  library(ElemStatLearn)
}

set <- training_set

# Grid over the first two columns of `set`, padded by 1 on every side and
# sampled every 0.01, on which the classifier is evaluated.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# The grid columns need the predictor names the classifier was trained on.
colnames(grid_set) <- c('PC1', 'PC2')
y_grid <- predict(classifier, newdata = grid_set)

# Scatter plot of the observations, sized to the grid extent.
plot(set[, -3],
     main = 'Principal Component Analysis (PCA) (Training set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
# Contour lines drawn from the predicted class codes over the grid.
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Colour every grid point by its predicted class ...
points(grid_set, pch = '.',
       col = ifelse(y_grid == 2, 'deepskyblue',
                    ifelse(y_grid == 1, 'springgreen3', 'tomato')))
# ... and draw the observations on top, filled by the class in column 3.
points(set, pch = 21,
       bg = ifelse(set[, 3] == 2, 'blue3',
                   ifelse(set[, 3] == 1, 'green4', 'red3')))

Visualising the Test set results

# ElemStatLearn has been archived on CRAN and nothing in this chunk calls into
# it, so attach it only when it is installed instead of aborting the chunk.
if (requireNamespace("ElemStatLearn", quietly = TRUE)) {
  library(ElemStatLearn)
}

set <- test_set

# Grid over the first two columns of `set`, padded by 1 on every side and
# sampled every 0.01, on which the classifier is evaluated.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# The grid columns need the predictor names the classifier was trained on.
colnames(grid_set) <- c('PC1', 'PC2')
y_grid <- predict(classifier, newdata = grid_set)

# Scatter plot of the observations, sized to the grid extent.
plot(set[, -3], main = 'Principal Component Analysis (PCA) (Test set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
# Contour lines drawn from the predicted class codes over the grid.
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Colour every grid point by its predicted class ...
points(grid_set, pch = '.',
       col = ifelse(y_grid == 2, 'deepskyblue',
                    ifelse(y_grid == 1, 'springgreen3', 'tomato')))
# ... and draw the observations on top, filled by the class in column 3.
points(set, pch = 21,
       bg = ifelse(set[, 3] == 2, 'blue3',
                   ifelse(set[, 3] == 1, 'green4', 'red3')))