We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression on non-linear transformations of the features.
x1 = runif(500) -0.5
x2 = runif(500) -0.5
y = 1*(x1^2-x2^2 > 0)
plot(x1, x2, xlab = "X1", ylab = "X2",col = (5-y), pch = (3-y))
glm.fit = glm(y~x1+x2, family="binomial")
summary(glm.fit)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.254 -1.160 -1.100 1.179 1.250
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.005725 0.090056 -0.064 0.949
## x1 -0.020337 0.309920 -0.066 0.948
## x2 0.354065 0.306982 1.153 0.249
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 693.14 on 499 degrees of freedom
## Residual deviance: 691.81 on 497 degrees of freedom
## AIC: 697.81
##
## Number of Fisher Scoring iterations: 3
data = data.frame(x1 = x1, x2 = x2, y = y)
probs = predict(glm.fit, data, type='response')
preds = rep(0,500)
preds[probs > 0.48] = 1
plot(data[preds == 1,]$x1, data[preds == 1,]$x2, col = (6-1), pch = (5-1), xlab="X1", ylab="X2")
points(data[preds == 0,]$x1, data[preds == 0,]$x2, col = 6, pch = 5)
The plot shows a clear linear decision boundary.
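As a quick check (not part of the original output), we could compare these linear-model predictions with the true labels; given the quadratic true boundary, the linear fit should do little better than chance.
# Hedged check: accuracy of the linear logistic fit against the true labels
table(predicted = preds, truth = y)
mean(preds == y)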
glmnl.fit = glm(y~ poly(x1, 2) + poly(x2,2) + I(x1*x2), family="binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glmnl.fit)
##
## Call:
## glm(formula = y ~ poly(x1, 2) + poly(x2, 2) + I(x1 * x2), family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.445e-03 -2.000e-08 -2.000e-08 2.000e-08 1.183e-03
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.074 3609.621 0.002 0.998
## poly(x1, 2)1 -3816.432 104232.660 -0.037 0.971
## poly(x1, 2)2 33685.176 871702.786 0.039 0.969
## poly(x2, 2)1 857.480 37877.695 0.023 0.982
## poly(x2, 2)2 -35267.157 915778.982 -0.039 0.969
## I(x1 * x2) 156.776 41676.458 0.004 0.997
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9314e+02 on 499 degrees of freedom
## Residual deviance: 5.3187e-06 on 494 degrees of freedom
## AIC: 12
##
## Number of Fisher Scoring iterations: 25
The predictors are even less statistically significant than in the linear model. The convergence warnings and the near-zero residual deviance indicate that this quadratic model separates the two classes almost perfectly, which inflates the standard errors and makes the individual coefficients appear insignificant.
probs = predict(glmnl.fit, data, type='response')
preds = rep(0,500)
preds[probs > 0.48] = 1
plot(data[preds == 1,]$x1, data[preds == 1,]$x2, col = (6-1), pch = (5-1), xlab="X1", ylab="X2")
points(data[preds == 0,]$x1, data[preds == 0,]$x2, col = 6, pch = 5)
data$y = as.factor(data$y)
set.seed(123)
tune.out = tune(svm,y~x1+x2, data = data, kernel='linear',
ranges=list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
tune.out$best.parameters
## cost
## 5 5
svm.fit = svm(y~x1+x2, data, kernel='linear', cost=0.1)
preds = predict(svm.fit, data)
plot(data[preds == 0,]$x1, data[preds == 0,]$x2, col=(4-1), pch=(3-1), xlab = "X1", ylab="X2")
points(data[preds == 1,]$x1, data[preds == 1,]$x2, col=4, pch=3)
This support vector classifier classifies all points to a single class.
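A frequency table of these predictions (not in the original output) would confirm this:
table(preds)  # expect essentially all observations assigned to one class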
set.seed(123)
tune.out1 = tune(svm,y~x1+x2, data = data, kernel='radial',
ranges=list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100),
gamma=c(0.5,1,2,3,4)))
tune.out1$best.parameters
## cost gamma
## 7 100 0.5
svmnl.fit = svm(y~x1+x2, data, kernel="radial",
cost=100, gamma=0.5)
preds = predict(svmnl.fit, data)
plot(data[preds == 0,]$x1, data[preds == 0,]$x2, col=(4-1), pch=(3-1), xlab = "X1", ylab="X2")
points(data[preds == 1,]$x1, data[preds == 1,]$x2, col=4, pch=3)
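To quantify the fit beyond the plot, we could also compute the training error of the radial SVM (not shown in the original output); it should be very low, since the radial kernel can recover the quadratic boundary.
mean(preds != data$y)  # training error of the radial SVM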
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
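The code below assumes that the ISLR package (which provides the Auto and OJ data sets) and the e1071 package (which provides svm() and tune()) have been loaded:
library(ISLR)   # Auto and OJ data sets
library(e1071)  # svm() and tune()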
bin.var = ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto$mpglevel = as.factor(bin.var)
set.seed(123)
tune.out = tune(svm,mpglevel~., data = Auto, kernel='linear',
ranges=list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100)),
scale=T)
tune.out$best.parameters
## cost
## 4 1
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.01025641
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.09935897 0.04521036
## 2 1e-02 0.07634615 0.03928191
## 3 1e-01 0.04333333 0.03191738
## 4 1e+00 0.01025641 0.01792836
## 5 5e+00 0.01538462 0.01792836
## 6 1e+01 0.01788462 0.01727588
## 7 1e+02 0.03320513 0.02720447
The lowest cross-validation error occurs when cost = 1.
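The minimum cross-validation error can also be read directly from the tuning object (a quick accessor check, not shown in the original output):
tune.out$best.performance  # should match the best-performance value above (~0.0103)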
set.seed(123)
tune.out.rad = tune(svm,mpglevel~., data = Auto, kernel='radial',
ranges=list(gamma = c(0.01,0.1,1,5,10,100),
cost = c(0.01, 0.1,1,5,10,100),
scale = T))
tune.out.rad$best.parameters
## gamma cost scale
## 31 0.01 100 TRUE
summary(tune.out.rad)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost scale
## 0.01 100 TRUE
##
## - best performance: 0.01025641
##
## - Detailed performance results:
## gamma cost scale error dispersion
## 1 1e-02 1e-02 TRUE 0.58173077 0.04740051
## 2 1e-01 1e-02 TRUE 0.21391026 0.09431095
## 3 1e+00 1e-02 TRUE 0.58173077 0.04740051
## 4 5e+00 1e-02 TRUE 0.58173077 0.04740051
## 5 1e+01 1e-02 TRUE 0.58173077 0.04740051
## 6 1e+02 1e-02 TRUE 0.58173077 0.04740051
## 7 1e-02 1e-01 TRUE 0.08916667 0.04345384
## 8 1e-01 1e-01 TRUE 0.07634615 0.03928191
## 9 1e+00 1e-01 TRUE 0.58173077 0.04740051
## 10 5e+00 1e-01 TRUE 0.58173077 0.04740051
## 11 1e+01 1e-01 TRUE 0.58173077 0.04740051
## 12 1e+02 1e-01 TRUE 0.58173077 0.04740051
## 13 1e-02 1e+00 TRUE 0.07378205 0.04185248
## 14 1e-01 1e+00 TRUE 0.05852564 0.03960325
## 15 1e+00 1e+00 TRUE 0.05865385 0.04942437
## 16 5e+00 1e+00 TRUE 0.51544872 0.06790600
## 17 1e+01 1e+00 TRUE 0.54602564 0.06355090
## 18 1e+02 1e+00 TRUE 0.58173077 0.04740051
## 19 1e-02 5e+00 TRUE 0.04589744 0.03136327
## 20 1e-01 5e+00 TRUE 0.03057692 0.02611396
## 21 1e+00 5e+00 TRUE 0.05608974 0.04595880
## 22 5e+00 5e+00 TRUE 0.51544872 0.06790600
## 23 1e+01 5e+00 TRUE 0.54102564 0.06959451
## 24 1e+02 5e+00 TRUE 0.58173077 0.04740051
## 25 1e-02 1e+01 TRUE 0.02032051 0.02305327
## 26 1e-01 1e+01 TRUE 0.03314103 0.02942215
## 27 1e+00 1e+01 TRUE 0.05608974 0.04595880
## 28 5e+00 1e+01 TRUE 0.51544872 0.06790600
## 29 1e+01 1e+01 TRUE 0.54102564 0.06959451
## 30 1e+02 1e+01 TRUE 0.58173077 0.04740051
## 31 1e-02 1e+02 TRUE 0.01025641 0.01792836
## 32 1e-01 1e+02 TRUE 0.03326923 0.02434857
## 33 1e+00 1e+02 TRUE 0.05608974 0.04595880
## 34 5e+00 1e+02 TRUE 0.51544872 0.06790600
## 35 1e+01 1e+02 TRUE 0.54102564 0.06959451
## 36 1e+02 1e+02 TRUE 0.58173077 0.04740051
The lowest cross-validation error is obtained when gamma = 0.01 and cost = 100.
set.seed(123)
tune.out.poly = tune(svm,mpglevel~., data = Auto, kernel='polynomial',
ranges=list(degree = seq(0, 4, by = 1),
cost = c(0.01, 0.1,1,5,10,100)))
tune.out.poly$best.parameters
## degree cost
## 27 1 100
summary(tune.out.poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## degree cost
## 1 100
##
## - best performance: 0.02038462
##
## - Detailed performance results:
## degree cost error dispersion
## 1 0 1e-02 0.58173077 0.04740051
## 2 1 1e-02 0.58173077 0.04740051
## 3 2 1e-02 0.58173077 0.04740051
## 4 3 1e-02 0.58173077 0.04740051
## 5 4 1e-02 0.58173077 0.04740051
## 6 0 1e-01 0.58173077 0.04740051
## 7 1 1e-01 0.18339744 0.12223061
## 8 2 1e-01 0.58173077 0.04740051
## 9 3 1e-01 0.58173077 0.04740051
## 10 4 1e-01 0.58173077 0.04740051
## 11 0 1e+00 0.58173077 0.04740051
## 12 1 1e+00 0.08147436 0.03707182
## 13 2 1e+00 0.58173077 0.04740051
## 14 3 1e+00 0.58173077 0.04740051
## 15 4 1e+00 0.58173077 0.04740051
## 16 0 5e+00 0.58173077 0.04740051
## 17 1 5e+00 0.07378205 0.04185248
## 18 2 5e+00 0.58173077 0.04740051
## 19 3 5e+00 0.58173077 0.04740051
## 20 4 5e+00 0.58173077 0.04740051
## 21 0 1e+01 0.58173077 0.04740051
## 22 1 1e+01 0.06608974 0.04785032
## 23 2 1e+01 0.57147436 0.04575370
## 24 3 1e+01 0.58173077 0.04740051
## 25 4 1e+01 0.58173077 0.04740051
## 26 0 1e+02 0.58173077 0.04740051
## 27 1 1e+02 0.02038462 0.01617396
## 28 2 1e+02 0.30346154 0.10917787
## 29 3 1e+02 0.35211538 0.13782036
## 30 4 1e+02 0.58173077 0.04740051
The polynomial SVM achieves its lowest cross-validation error when cost = 100 and degree = 1; note that with degree = 1 the polynomial kernel is essentially linear.
Hint: In the lab, we used the plot() function for svm objects only in cases with p = 2. When p > 2, you can use the plot() function to create plots displaying pairs of variables at a time; essentially, instead of typing plot(svmfit, dat), you supply a formula naming the pair of variables to display (see ?plot.svm). We first refit the three models with their tuned parameters, then define a helper that plots each predictor against mpg:
svm.linear <- svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm.poly <- svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 100, degree = 1)
svm.radial <- svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 100, gamma = 0.01)
# Helper: plot the fitted SVM against mpg paired with each remaining predictor
plotpairs = function(fit) {
  for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
    plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
  }
}
Linear SVM plots
plotpairs(svm.linear)
Polynomial SVM plots
plotpairs(svm.poly)
Radial SVM plots
plotpairs(svm.radial)
This problem involves the OJ data set, which is part of the ISLR package.
set.seed(123)
ind = sample(nrow(OJ), 800)
train = OJ[ind,]
test = OJ[-ind,]
svm.fit = svm(Purchase~., data = train, kernel='linear', cost=0.01)
summary(svm.fit )
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 442
##
## ( 220 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred = predict(svm.fit, train)
table(train$Purchase, train.pred)
## train.pred
## CH MM
## CH 426 61
## MM 71 242
# training error rate
(71 + 61) / (426 + 242 + 71 + 61)
## [1] 0.165
test.pred = predict(svm.fit, test)
table(test$Purchase, test.pred)
## test.pred
## CH MM
## CH 145 21
## MM 27 77
(27 + 21) / (145 + 77 + 27 + 21)
## [1] 0.1777778
The training error rate is 16.5% and the test error rate is about 17.8%.
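Rather than computing error rates by hand from each confusion matrix, a small helper function (hypothetical, not part of the original code) could be used for the remaining fits:
# Hypothetical helper: misclassification rate from true labels and predictions
err.rate = function(truth, pred) mean(truth != pred)
err.rate(train$Purchase, train.pred)  # ~0.165
err.rate(test$Purchase, test.pred)    # ~0.178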
set.seed(123)
tune.out = tune(svm, Purchase~., data=train, kernel='linear',
ranges=list(cost = c(0.01, 0.1,1,5,10,100)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.16875
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.17375 0.04910660
## 2 1e-01 0.17500 0.04823265
## 3 1e+00 0.16875 0.03963812
## 4 5e+00 0.17250 0.04241004
## 5 1e+01 0.17000 0.04005205
## 6 1e+02 0.17000 0.04216370
tune.out$best.parameters
## cost
## 3 1
The optimal cost is 1.
svm.linear <- svm(Purchase ~ ., kernel = "linear", data = train, cost = tune.out$best.parameter$cost)
train.pred <- predict(svm.linear, train)
table(train$Purchase, train.pred)
## train.pred
## CH MM
## CH 428 59
## MM 69 244
(69 + 59) / (428 + 244 + 69 + 59)
## [1] 0.16
test.pred <- predict(svm.linear, test)
table(test$Purchase, test.pred)
## test.pred
## CH MM
## CH 148 18
## MM 24 80
(24 + 18) / (148 + 80 + 24 + 18)
## [1] 0.1555556
With the tuned cost, the training error rate falls to 16% and the test error rate to about 15.6%; in this model the test error is actually slightly lower than the training error.
svm.radial <- svm(Purchase ~ ., kernel = "radial", data = train)
summary(svm.radial)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 367
##
## ( 181 186 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred <- predict(svm.radial, train)
table(train$Purchase, train.pred)
## train.pred
## CH MM
## CH 446 41
## MM 70 243
(70 + 41) / (446 + 243 + 70 + 41)
## [1] 0.13875
test.pred = predict(svm.radial, test)
table(test$Purchase, test.pred)
## test.pred
## CH MM
## CH 149 17
## MM 34 70
(34 + 17) / (149 + 70 + 34 + 17)
## [1] 0.1888889
The radial kernel with the default gamma uses 367 support vectors. The classifier has a training error of about 13.9% and a test error of about 18.9%, so it does worse on new data than the linear SVM with tuned cost. We will now use cross-validation to find the optimal cost.
set.seed(123)
tune.out = tune(svm, Purchase~., data=train, kernel = 'radial',
ranges=list(cost=c(0.01, 0.1,1,5,10,100)))
svm.radial <- svm(Purchase ~ ., kernel = "radial", data = train, cost = tune.out$best.parameter$cost)
summary(svm.radial)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "radial", cost = tune.out$best.parameter$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 367
##
## ( 181 186 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred <- predict(svm.radial, train)
table(train$Purchase, train.pred)
## train.pred
## CH MM
## CH 446 41
## MM 70 243
(70 + 41) / (446 + 41 + 70 + 243)
## [1] 0.13875
test.pred <- predict(svm.radial, test)
table(test$Purchase, test.pred)
## test.pred
## CH MM
## CH 149 17
## MM 34 70
(34 + 17) / (149 + 70 + 34 + 17)
## [1] 0.1888889
The radial SVM with a tuned cost parameter achieves a test error of about 18.9%. Because the selected cost equals the default value of 1, tuning does not change the training or test error rates.
svm.poly <- svm(Purchase ~ ., kernel = "polynomial", data = train, degree = 2)
summary(svm.poly)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "polynomial",
## degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 445
##
## ( 219 226 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred = predict(svm.poly, train)
table(train$Purchase, train.pred)
## train.pred
## CH MM
## CH 454 33
## MM 105 208
(105 + 33) / (454 + 208 + 105 + 33)
## [1] 0.1725
test.pred = predict(svm.poly, test)
table(test$Purchase, test.pred)
## test.pred
## CH MM
## CH 153 13
## MM 47 57
(47 + 13) / (153 + 57 + 47 + 13)
## [1] 0.2222222
The test error rate of about 22.2% is substantially higher than for our other models. Let's see whether tuning the cost improves this.
set.seed(123)
tune.out = tune(svm, Purchase~., data=train, kernel='polynomial',
degree = 2,
ranges = list(cost=c(0.01, 0.1,1,5,10,100)))
tune.out$best.parameters
## cost
## 6 100
svm.poly = svm(Purchase~., kernel='polynomial', degree=2, data=train, cost=tune.out$best.parameters$cost)
summary(svm.poly)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "polynomial",
## degree = 2, cost = tune.out$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 100
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 309
##
## ( 154 155 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred = predict(svm.poly, train)
table(train$Purchase, train.pred)
## train.pred
## CH MM
## CH 445 42
## MM 61 252
(61 + 42) / (445 + 252 + 61 +42)
## [1] 0.12875
test.pred = predict(svm.poly, test)
table(test$Purchase, test.pred)
## test.pred
## CH MM
## CH 147 19
## MM 37 67
(37 + 19) / (147 + 67 + 37 + 19)
## [1] 0.2074074
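To summarize the comparison, the error rates computed above could be collected into a small data frame (hand-collected from the results in this document, not re-run here):
# Error rates gathered from the fits above
results = data.frame(
  model = c("linear (tuned)", "radial (tuned)", "polynomial (tuned)"),
  train.error = c(0.160, 0.139, 0.129),
  test.error = c(0.156, 0.189, 0.207)
)
results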
Overall, the tuned linear SVM seems to perform best, with the lowest test error rate of about 15.6%.