Lecture 264 PCA Intuition
https://www.udemy.com/machinelearning/learn/lecture/10628128
Great presentation and tutorial
https://plot.ly/ipython-notebooks/principal-component-analysis/
Another explanation of the topic
http://setosa.io/ev/principal-component-analysis/
Home page of the site: http://setosa.io/ev/
PCA Wikipedia https://en.wikipedia.org/wiki/Principal_component_analysis
What we are doing is taking a large number of independent variables and extracting from them a smaller core group of new independent variables that best describe the structure of the data (capture the most variance) in the dataset. Because this extraction is done without any knowledge of the dependent variable, PCA is considered an unsupervised dimensionality reduction technique.
Both PCA and LDA are applicable to data that is linearly separable.
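As a quick illustration of what "capturing the most variance" means, base R’s prcomp() reports the proportion of variance each principal component explains. A minimal sketch on the built-in iris measurements, purely a stand-in for our wine features:
# illustration only: how much variance each principal component captures
pr = prcomp(iris[, 1:4], center = TRUE, scale. = TRUE)
summary(pr)  # the 'Proportion of Variance' row is what PCA maximises, component by component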
Check the working directory with getwd().
dataset = read.csv('Wine.csv')
knitr::include_graphics("Datasetinformation.png")
Python view of the dataset
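We can get the same overview directly in R:
dim(dataset)  # 178 rows, 14 columns (13 features plus Customer_Segment)
str(dataset)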
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
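A quick sanity check on the split sizes; the confusion matrices later in the section confirm 142 training and 36 test rows:
nrow(training_set)  # 142, roughly 80% of the 178 wines
nrow(test_set)      # 36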
We’ll scale all the independent variables, but not Customer_Segment (the dependent variable, column 14).
training_set[-14] = scale(training_set[-14])
test_set[-14] = scale(test_set[-14])
The thresh argument is a cutoff for the cumulative percentage of variance to be retained by PCA. We won’t use it here, but if we wanted our extracted features to explain a specific share of the variance, this is the parameter we’d set. pcaComp is the exact number of principal components to keep; if specified, it overrides thresh. We’ll go with 2.
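For reference, the variance-based cutoff would look like the following sketch (we stick with pcaComp = 2 below):
# keep however many components are needed to explain 80% of the variance
pca_var = preProcess(x = training_set[-14], method = 'pca', thresh = 0.8)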
# install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# install.packages('e1071')
library(e1071)
# remove the dependent variable from the training_set, since PCA is an unsupervised dimensionality reduction technique
pca = preProcess(x = training_set[-14], method = 'pca', pcaComp = 2)
# we'll use predict to apply our pca object to our training_set
training_set = predict(pca, training_set)
# put the columns back in the order we want: PC1, PC2, Customer_Segment
training_set = training_set[c(2, 3, 1)]
test_set = predict(pca, test_set)
# put the test set columns back in the same order: PC1, PC2, Customer_Segment
test_set = test_set[c(2, 3, 1)]
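To see how strongly each of the 13 original features loads on the two retained components, we can inspect the rotation matrix that caret stores on the preProcess object:
# loadings of the original features on PC1 and PC2
pca$rotation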
Principal Components 1 and 2 are our new independent variables.
head(training_set)
##         PC1        PC2 Customer_Segment
## 1 -3.249569  1.5661160                1
## 2 -2.165889 -0.3186768                1
## 3 -2.501192  1.2353892                1
## 6 -2.941040  2.2999654                1
## 7 -2.393131  1.3228050                1
## 9 -2.418465  1.0367916                1
# install.packages('e1071')
library(e1071)
classifier = svm(formula = Customer_Segment ~ .,
                 data = training_set,
                 type = 'C-classification',
                 kernel = 'linear')
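Before predicting we can inspect the fitted model (kernel, cost, number of support vectors):
summary(classifier)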
y_pred = predict(classifier, newdata = test_set[-3])
y_pred
## 4 5 8 11 16 20 21 24 31 32 50 59 65 67 68 69 87 88
## 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2
## 89 104 106 107 111 114 118 126 132 134 137 138 139 145 151 167 173 174
## 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3
Perfect results on the test set.
cm = table(test_set[, 3], y_pred)
cm
##    y_pred
##      1  2  3
##   1 12  0  0
##   2  0 14  0
##   3  0  0 10
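All 36 test observations sit on the diagonal; as a single number:
# test accuracy: correct predictions / total predictions
sum(diag(cm)) / sum(cm)  # (12 + 14 + 10) / 36 = 1
Let’s check the same predictions on the training set.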
y_predTR = predict(classifier, newdata = training_set[-3])
y_predTR
## 1 2 3 6 7 9 10 12 13 14 15 17 18 19 22 23 25 26
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 27 28 29 30 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1
## 47 48 49 51 52 53 54 55 56 57 58 60 61 62 63 64 66 70
## 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
## 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 90 91
## 2 2 2 1 2 2 2 2 2 2 2 2 2 3 2 2 2 2
## 92 93 94 95 96 97 98 99 100 101 102 103 105 108 109 110 112 113
## 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## 115 116 117 119 120 121 122 123 124 125 127 128 129 130 131 133 135 136
## 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 3 3 3
## 140 141 142 143 144 146 147 148 149 150 152 153 154 155 156 157 158 159
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 160 161 162 163 164 165 166 168 169 170 171 172 175 176 177 178
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3
Not bad, just a couple of mistakes.
cmTR = table(training_set[, 3], y_predTR)
cmTR
##    y_predTR
##      1  2  3
##   1 46  1  0
##   2  3 53  1
##   3  0  1 37
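Six points sit off the diagonal, so the training accuracy works out to roughly 95.8%:
# training accuracy: 136 of 142 correct
sum(diag(cmTR)) / sum(cmTR)  # (46 + 53 + 37) / 142
Finally, we visualise the decision regions in the two-dimensional PC space, first for the training set and then for the test set.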
library(ElemStatLearn)
# note: ElemStatLearn has been archived from CRAN, so it may need to be installed from the CRAN archive
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3],
     main = 'Principal Component Analysis (PCA) (Training set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3], main = 'Principal Component Analysis (PCA) (Test set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))
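The two visualisation chunks differ only in the data set and the title, so a small helper tidies this up. plot_pca_regions is a name of our own, not from the course; the body just repackages the code above:
plot_pca_regions = function(set, classifier, title) {
  # build a fine grid over the two principal components
  X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set = expand.grid(PC1 = X1, PC2 = X2)
  # classify every grid point to colour the decision regions
  y_grid = predict(classifier, newdata = grid_set)
  plot(set[, -3], main = title, xlab = 'PC1', ylab = 'PC2',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue',
                                           ifelse(y_grid == 1, 'springgreen3', 'tomato')))
  points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3',
                                    ifelse(set[, 3] == 1, 'green4', 'red3')))
}
plot_pca_regions(training_set, classifier, 'PCA (Training set)')
plot_pca_regions(test_set, classifier, 'PCA (Test set)')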