Lecture 107 https://www.udemy.com/machinelearning/learn/lecture/5739456
Check the working directory with getwd() so you always know where you are working.
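A minimal sketch (the path is a placeholder; point it at the folder containing Social_Network_Ads.csv):
getwd() # print the current working directory
# setwd('~/path/to/project') # placeholder path, substitute your own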
We want Age, EstimatedSalary, and the yes/no Purchased column, which in this dataset are columns 3-5.
dataset = read.csv('Social_Network_Ads.csv')
dataset = dataset[3:5]
Have a look at the data.
summary(dataset)
## Age EstimatedSalary Purchased
## Min. :18.00 Min. : 15000 Min. :0.0000
## 1st Qu.:29.75 1st Qu.: 43000 1st Qu.:0.0000
## Median :37.00 Median : 70000 Median :0.0000
## Mean :37.66 Mean : 69742 Mean :0.3575
## 3rd Qu.:46.00 3rd Qu.: 88000 3rd Qu.:1.0000
## Max. :60.00 Max. :150000 Max. :1.0000
head(dataset)
## Age EstimatedSalary Purchased
## 1 19 19000 0
## 2 35 20000 0
## 3 26 43000 0
## 4 27 57000 0
## 5 19 76000 0
## 6 27 58000 0
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))
Let’s look again; Purchased is now a factor, so summary() reports class counts (257 no, 143 yes) instead of numeric statistics.
summary(dataset)
## Age EstimatedSalary Purchased
## Min. :18.00 Min. : 15000 0:257
## 1st Qu.:29.75 1st Qu.: 43000 1:143
## Median :37.00 Median : 70000
## Mean :37.66 Mean : 69742
## 3rd Qu.:46.00 3rd Qu.: 88000
## Max. :60.00 Max. :150000
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
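A quick sanity check (a sketch; the counts assume the 400-row dataset summarised above). sample.split is stratified on the label, so both sets keep roughly the full data's 257:143 class balance:
nrow(training_set) # 300 rows
nrow(test_set) # 100 rows
table(training_set$Purchased) # roughly 193 zeros and 107 ones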
Feature scaling: for classification it’s better to scale the features, especially here where the variables are on very different scales (age in years vs. salary in dollars). The [-3] below excludes column 3, the Purchased factor, from scaling.
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])
Let’s have a look.
head(training_set)
## Age EstimatedSalary Purchased
## 1 -1.7655475 -1.4733414 0
## 3 -1.0962966 -0.7883761 0
## 6 -1.0006894 -0.3602727 0
## 7 -1.0006894 0.3817730 0
## 8 -0.5226531 2.2654277 1
## 10 -0.2358313 -0.1604912 0
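One caveat worth flagging: scale() above standardises the test set with its own mean and standard deviation. A stricter variant (a sketch, not the course’s code) estimates the centring and spread on the training set only and reuses those values for the test set:
# redo the split on the unscaled data, then reuse the training-set parameters
train_raw = subset(dataset, split == TRUE)
test_raw = subset(dataset, split == FALSE)
sc = scale(train_raw[-3]) # centre and spread estimated from the training set only
train_raw[-3] = sc
test_raw[-3] = scale(test_raw[-3],
                     center = attr(sc, 'scaled:center'),
                     scale = attr(sc, 'scaled:scale'))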
We’ll use kernel = 'linear' as our example. The kernel argument sets the kernel used in training and predicting; depending on the kernel you choose, you may also want to adjust its tuning parameters. Options: linear, polynomial, radial basis, sigmoid.
The formula puts the dependent variable on the left of ~ and the predictors you want to test on the right; here '.' means all remaining columns. data is the data set you want to train on.
# install.packages('e1071')
library(e1071)
classifierL = svm(formula = Purchased ~ .,
                  data = training_set,
                  type = 'C-classification',
                  kernel = 'linear')
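For comparison, here is the same call with one of the other kernels listed above; only the kernel argument changes. A sketch, not run here, and the rest of the walkthrough sticks with the linear classifier:
classifierR = svm(formula = Purchased ~ .,
                  data = training_set,
                  type = 'C-classification',
                  kernel = 'radial') # or 'polynomial' / 'sigmoid'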
y_predL = predict(classifierL, newdata = test_set[-3])
cmL = table(test_set[, 3], y_predL)
cmL
## y_predL
## 0 1
## 0 57 7
## 1 13 23
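From the matrix above, 57 + 23 = 80 of the 100 test observations are classified correctly, so accuracy is 0.8, with 7 false positives and 13 false negatives. The same figure in code:
accuracyL = sum(diag(cmL)) / sum(cmL) # (57 + 23) / 100 = 0.8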
# Visualising the Training set results
library(ElemStatLearn)
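# note: ElemStatLearn has since been archived on CRAN; the plotting below uses only base
# graphics (seq, expand.grid, plot, contour, points), so this library call can be skipped
# if the package won't install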
# declare set as the training set
set = training_set
# this section creates the red/green background region. 'by' acts like a step size (think
# of the step in a Python range): the classifier is evaluated every 0.01 across the grid
# and each point comes back 0 or 1, i.e. red or green. The -1 and +1 add padding around
# the edges so the dots are not jammed against the border
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
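# note: with scaled features each axis spans roughly -3 to 3, so a 0.01 step yields a grid
# of a few hundred thousand points; a coarser step (e.g. by = 0.05) renders noticeably faster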
grid_set = expand.grid(X1, X2)
# just giving a name to the X and Y
colnames(grid_set) = c('Age', 'EstimatedSalary')
# this is the MAGIC of the background coloring
# here we use the classifier to predict the result for each of the pixel-sized grid points built above
y_gridL = predict(classifierL, newdata = grid_set)
# that's the end of the background
# now we plot the actual data
plot(set[, -3],
     main = 'SVM Linear Kernel (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2)) # match the axis limits to the grid so the coloured background fills the plot; the contour() call below draws the boundary line between green and red
contour(X1, X2, matrix(as.numeric(y_gridL), length(X1), length(X2)), add = TRUE)
# first, run through the grid predictions and use ifelse to paint every grid point as a tiny '.'
# note the dots are the real data; the background is the pixel-by-pixel yes/no determination
# plotting the dots on top of the background gives the finished image
points(grid_set, pch = '.', col = ifelse(y_gridL == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_gridL = predict(classifierL, newdata = grid_set)
plot(set[, -3], main = 'SVM Linear Kernel (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_gridL), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_gridL == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
=========================
GitHub files: https://github.com/ghettocounselor
Useful PDF for common questions in the lectures:
https://github.com/ghettocounselor/Machine_Learning/blob/master/Machine-Learning-A-Z-Q-A.pdf