Lecture 277 https://www.udemy.com/machinelearning/learn/lecture/6270782
Remember that we have non-linearly separable data here, so we don’t want to use LDA or PCA (alone). Let’s recall our Logistic Regression model example from the classification part of the course; it lets us see that our data is not linearly separable.
# Embed the explanatory figure for the earlier logistic regression example in
# the rendered output.
knitr::include_graphics("LogisticRegression_dataExplained.png")
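Basically, what we’ll be doing here is applying Kernel PCA to our data before we apply a linear model. Kernel PCA relies on the Kernel Trick: the kernel implicitly maps the data into a higher-dimensional feature space, and the components extracted there can make the classes linearly separable.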
# Check the working directory: the CSV file read below is referenced by a
# relative path, so it must resolve from here.
getwd()
# Load the ads data and keep only columns 3 to 5: the two predictors followed
# by the Purchased label that the rest of the script works with.
dataset <- read.csv("Social_Network_Ads.csv")[, 3:5]
# install.packages("caTools")
library(caTools)

# Fix the RNG seed so the split below is reproducible.
set.seed(123)

# Logical flag per row, derived from the Purchased labels with SplitRatio 0.75.
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)

# Rows flagged TRUE form the training set; the remaining rows the test set.
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)
# Feature scaling: standardise the two predictor columns.
#
# The centring and scaling constants are estimated on the training set only
# and then reused for the test set, so both sets go through the same
# transformation as the data the kernel PCA and the classifier are fitted on.
# Scaling the test set with its own mean/sd would place it on a different
# scale from the one the model was trained on.
# NOTE(review): test-set results recorded with the previous per-set scaling
# (e.g. an echoed confusion matrix) may change slightly after re-running.
scaled_training <- scale(training_set[, 1:2])
training_set[, 1:2] <- scaled_training
test_set[, 1:2] <- scale(
  test_set[, 1:2],
  center = attr(scaled_training, "scaled:center"),
  scale = attr(scaled_training, "scaled:scale")
)
This is the new bit, the Kernel Trick :~)
# install.packages("kernlab")
library(kernlab)

# Fit the kernel PCA on the predictors only.
#   ~ .       : use every column of `data` as input (our X)
#   data      : the training set with column 3, the dependent variable, dropped
#   kernel    : "rbfdot", our Gaussian kernel
#   features  : number of components to extract; 2 so we can visualise them
# The fitted object is stored under the name `kpca`, which later code uses.
kpca <- kpca(
  ~ .,
  data = training_set[-3],
  kernel = "rbfdot",
  features = 2
)
Now let’s build our datasets with our new object kpca.
# Project the training rows onto the extracted kernel principal components and
# store the result as a data frame.
training_set_pca <- as.data.frame(
  predict(kpca, training_set)
)
Let’s look; we removed the dependent variable above to apply the dimensionality reduction, but we’ll need it, so let’s move it back in.
# Preview the first rows of the projected training data (components only).
head(training_set_pca)
## V1 V2
## 1 -10.069428 -1.4873996
## 3 -8.293841 -1.6232411
## 6 -6.498572 -3.1438574
## 7 -2.933372 -6.4951512
## 8 7.088254 -9.3473575
## 10 -2.317751 -0.3716837
# Re-attach the dependent variable to the projected training data.
training_set_pca[["Purchased"]] <- training_set[["Purchased"]]
And now we have our data ready for fitting to our regression.
# Preview again, now with the Purchased column attached.
head(training_set_pca)
## V1 V2 Purchased
## 1 -10.069428 -1.4873996 0
## 3 -8.293841 -1.6232411 0
## 6 -6.498572 -3.1438574 0
## 7 -2.933372 -6.4951512 0
## 8 7.088254 -9.3473575 1
## 10 -2.317751 -0.3716837 0
Let’s fix the test set.
# Apply the kernel PCA fitted on the training data to the test rows, then
# re-attach the dependent variable.
test_set_pca <- as.data.frame(
  predict(kpca, test_set)
)
test_set_pca[["Purchased"]] <- test_set[["Purchased"]]
# Fit a logistic regression (binomial GLM) of Purchased on all remaining
# columns of the projected training data.
classifier <- glm(
  formula = Purchased ~ .,
  family = binomial,
  data = training_set_pca
)
# Predicted probabilities for the test rows; column 3 (the label) is dropped
# from the data handed to predict().
prob_pred <- predict(
  classifier,
  type = "response",
  newdata = test_set_pca[-3]
)

# Turn probabilities into class labels: 1 above 0.5, otherwise 0.
y_pred <- ifelse(prob_pred > 0.5, 1, 0)

# Confusion matrix: rows are the actual labels, columns the predictions.
cm <- table(test_set_pca[, 3], y_pred)
cm
## y_pred
## 0 1
## 0 57 7
## 1 10 26
# Embed the figure that explains how to read the confusion matrix.
knitr::include_graphics("Confusion_Matrix_Explained.png")
# install.packages('ElemStatLearn')
# NOTE(review): nothing in this plotting code references a function or data
# set from ElemStatLearn; the call is kept in case other parts of the file
# need it -- confirm whether it can be removed.
library(ElemStatLearn)

# Plot the decision regions of a fitted classifier in the plane of the two
# kernel principal components and overlay the observations.
#
# Arguments:
#   set        data frame with the two components in columns 1 and 2 and the
#              0/1 label in column 3
#   classifier fitted model; predict(classifier, type = "response", newdata)
#              must accept a data frame with columns V1 and V2
#   main       plot title
#   xlab, ylab axis labels
#   step       spacing of the prediction grid (defaults to 0.01)
#
# Returns NULL invisibly; the function is called for its plotting side effect.
plot_decision_regions <- function(set, classifier, main, xlab, ylab,
                                  step = 0.01) {
  # Grid covering the observed range of each component, padded by 1.
  x1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = step)
  x2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = step)
  grid_set <- expand.grid(x1, x2)
  # Column names must match the predictors the classifier was trained on.
  colnames(grid_set) <- c("V1", "V2")

  # Predicted class for every grid point (1 above 0.5, otherwise 0).
  prob_grid <- predict(classifier, type = "response", newdata = grid_set)
  y_grid <- ifelse(prob_grid > 0.5, 1, 0)

  plot(set[, -3],
       main = main,
       xlab = xlab, ylab = ylab,
       xlim = range(x1), ylim = range(x2))
  # expand.grid varies x1 fastest, so the predictions fill a
  # length(x1) x length(x2) matrix column by column.
  contour(x1, x2,
          matrix(as.numeric(y_grid), length(x1), length(x2)),
          add = TRUE)
  # Background: predicted region; foreground: observations by actual label.
  points(grid_set, pch = ".",
         col = ifelse(y_grid == 1, "springgreen3", "tomato"))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))
  invisible(NULL)
}

# Visualising the Training set results
plot_decision_regions(
  training_set_pca, classifier,
  main = "Logistic Regression with Kernel PCA (Training set)",
  xlab = "PC1", ylab = "PC2"
)

# Visualising the Test set results
plot_decision_regions(
  test_set_pca, classifier,
  main = "Logistic Regression with Kernel PCA (Test set)",
  xlab = "V1 (PC1)", ylab = "V2 (PC2)"
)
# Embed the before/after comparison figure for logistic regression with
# kernel PCA.
knitr::include_graphics("LogReg_BeforeAfterKernelPCA.png")