Support Vector Machine (SVM)

Lecture 107 https://www.udemy.com/machinelearning/learn/lecture/5739456

Check the working directory with getwd() so you always know where you are working.

Importing the dataset

We are after the age, the estimated salary and the yes/no purchased flag, so in R that’s columns 3 to 5.

# Read the raw CSV and keep only columns 3 to 5
# (Age, EstimatedSalary, Purchased).
dataset <- read.csv("Social_Network_Ads.csv")[3:5]

Have a look at data

# Per-column summary statistics (min, quartiles, mean, max) of the dataset.
summary(dataset)
##       Age        EstimatedSalary    Purchased     
##  Min.   :18.00   Min.   : 15000   Min.   :0.0000  
##  1st Qu.:29.75   1st Qu.: 43000   1st Qu.:0.0000  
##  Median :37.00   Median : 70000   Median :0.0000  
##  Mean   :37.66   Mean   : 69742   Mean   :0.3575  
##  3rd Qu.:46.00   3rd Qu.: 88000   3rd Qu.:1.0000  
##  Max.   :60.00   Max.   :150000   Max.   :1.0000
# Show the first six rows of the dataset.
head(dataset)
##   Age EstimatedSalary Purchased
## 1  19           19000         0
## 2  35           20000         0
## 3  26           43000         0
## 4  27           57000         0
## 5  19           76000         0
## 6  27           58000         0

Encoding the target feature as factor

# Turn the numeric 0/1 target column into a two-level factor.
dataset[["Purchased"]] <- factor(dataset[["Purchased"]], levels = c(0, 1))

Let’s look again

# Summarise again: Purchased is now a factor, so it is reported as counts per level.
summary(dataset)
##       Age        EstimatedSalary  Purchased
##  Min.   :18.00   Min.   : 15000   0:257    
##  1st Qu.:29.75   1st Qu.: 43000   1:143    
##  Median :37.00   Median : 70000            
##  Mean   :37.66   Mean   : 69742            
##  3rd Qu.:46.00   3rd Qu.: 88000            
##  Max.   :60.00   Max.   :150000

Splitting the dataset into the Training set and Test set

# install.packages('caTools')
library(caTools)
# Fix the RNG seed so the split is reproducible.
set.seed(123)
# Logical vector, one entry per row: TRUE marks the rows (75%) that go to the
# training set; sample.split is given the target column to split on.
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]

Feature Scaling

Feature scaling: for classification it’s better to scale the features. Additionally, our variables are not measured in the same units.

# Standardise the two feature columns (everything except column 3, the target).
# The centring and scaling values are estimated on the training set only and
# then reused for the test set, so both sets are expressed on the same scale
# as the data the classifier is trained on. Scaling the test set with its own
# mean/sd would shift it relative to the training data.
# Note: printed results further down were produced with separate per-set
# scaling and may differ slightly after this change.
scaled_training <- scale(training_set[-3])
test_set[-3] <- scale(
  test_set[-3],
  center = attr(scaled_training, "scaled:center"),
  scale = attr(scaled_training, "scaled:scale")
)
training_set[-3] <- scaled_training

Let’s have a look.

# Show the first six rows of the training set after scaling.
head(training_set)
##           Age EstimatedSalary Purchased
## 1  -1.7655475      -1.4733414         0
## 3  -1.0962966      -0.7883761         0
## 6  -1.0006894      -0.3602727         0
## 7  -1.0006894       0.3817730         0
## 8  -0.5226531       2.2654277         1
## 10 -0.2358313      -0.1604912         0

Fitting SVM to the Training set

We’ll use kernel = 'linear' as our example. The kernel argument sets the kernel used in training and predicting; the options are linear, polynomial, radial basis and sigmoid. Depending on the kernel type, you might consider changing some of the other parameters.

formula = the dependent variable ~ the other variables you want to use as predictors; in this case ‘.’ means all of them. data = the data you want to train on.

# install.packages('e1071')
library(e1071)
# Fit a C-classification SVM with a linear kernel, using every other column of
# the training set as a predictor of Purchased.
classifierL <- svm(
  formula = Purchased ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "linear"
)

Predict the Test set results

# Predict the class of each test observation from its features only
# (the third column, the target, is dropped).
y_predL <- predict(object = classifierL, newdata = test_set[, -3])

Making the Confusion Matrix

# Cross-tabulate the actual classes (rows) against the predicted classes (columns).
cmL <- table(test_set$Purchased, y_predL)
print(cmL)
##    y_predL
##      0  1
##   0 57  7
##   1 13 23
A caption

A caption

Visualising the Training set results

# Plot the decision regions of classifierL together with the training points.
# Only base graphics are used; the former library(ElemStatLearn) call was
# dropped because nothing below uses that package, so the script no longer
# depends on it being installed.
# declare set as the training set
set <- training_set
# Build a fine grid over the feature space. Each grid point becomes one
# background "pixel" (step 0.01 on both axes); the -1 / +1 padding leaves a
# margin so the data points are not jammed against the plot edges.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# name the grid columns like the features the classifier was trained on
colnames(grid_set) <- c('Age', 'EstimatedSalary')
# this is the MAGIC of the background coloring:
# the classifier predicts a class for every grid point built above
y_gridL <- predict(classifierL, newdata = grid_set)
# that's the end of the background
# now we plot the actual data; xlim / ylim pin the axes to the grid extent
plot(set[, -3],
     main = 'SVM Linear Kernel (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
# contour of the predicted classes: this is what traces the line between the
# green and red regions
contour(X1, X2, matrix(as.numeric(y_gridL), length(X1), length(X2)), add = TRUE)
# colour every grid point by its predicted class (the background) ...
points(grid_set, pch = '.', col = ifelse(y_gridL == 1, 'springgreen3', 'tomato'))
# ... then draw the real observations on top, filled by their actual class
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results

# Plot the decision regions of classifierL together with the test points.
# Only base graphics are used; the former library(ElemStatLearn) call was
# dropped because nothing below uses that package, so the script no longer
# depends on it being installed.
set <- test_set
# Fine grid over the feature space (step 0.01), padded by 1 on every side so
# the points are not drawn on the plot edges.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# name the grid columns like the features the classifier was trained on
colnames(grid_set) <- c('Age', 'EstimatedSalary')
# predicted class for every grid point; this colours the background
y_gridL <- predict(classifierL, newdata = grid_set)
# scatter of the test observations, with the axes pinned to the grid extent
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
# contour of the predicted classes traces the boundary between the regions
contour(X1, X2, matrix(as.numeric(y_gridL), length(X1), length(X2)), add = TRUE)
# background: every grid point coloured by its predicted class
points(grid_set, pch = '.', col = ifelse(y_gridL == 1, 'springgreen3', 'tomato'))
# foreground: the real observations, filled by their actual class
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

=========================

GitHub files: https://github.com/ghettocounselor

Useful PDF for common questions in the lectures:

https://github.com/ghettocounselor/Machine_Learning/blob/master/Machine-Learning-A-Z-Q-A.pdf