We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
x1=runif(500)-0.5
x2=runif(500)-0.5
y=1*(x12-x22 > 0)
set.seed(421)
x1 = runif(500) - 0.5
x2 = runif(500) - 0.5
y = 1 * (x1^2 - x2^2 > 0)
plot(x1[y==0], x2[y==0], col="green", xlab="X1", ylab="X2", pch="+")
points(x1[y==1], x2[y==1], col="blue", pch=4)
lm.fit = glm(y~ x1+x2, family=binomial)
summary(lm.fit)
##
## Call:
## glm(formula = y ~ x1 + x2, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.278 -1.227 1.089 1.135 1.175
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.11999 0.08971 1.338 0.181
## x1 -0.16881 0.30854 -0.547 0.584
## x2 -0.08198 0.31476 -0.260 0.795
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 691.35 on 499 degrees of freedom
## Residual deviance: 690.99 on 497 degrees of freedom
## AIC: 696.99
##
## Number of Fisher Scoring iterations: 3
data = data.frame(x1=x1, x2=x2, y=y)
lm.prob = predict(lm.fit, data, type = "response")
lm.pred = ifelse(lm.prob > 0.52, 1, 0)
data.pos = data[lm.pred == 1, ]
data.neg = data[lm.pred == 0, ]
plot(data.pos$x1, data.pos$x2, col="green", xlab="X1", ylab="X2", pch="+")
points(data.neg$x1, data.neg$x2, col='red', pch = 4)
lm.fit = glm(y ~ poly(x1, 2) + poly(x2,2) + I(x1*x2), data=data, family=binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.prob = predict(lm.fit, data, type = "response")
lm.pred = ifelse(lm.prob > 0.5, 1, 0)
data.pos = data[lm.pred == 1, ]
data.neg = data[lm.pred == 0, ]
plot(data.pos$x1, data.pos$x2, col = "blue", xlab = "X1", ylab = "X2", pch = "+")
points(data.neg$x1, data.neg$x2, col = "red", pch = 4)
library(e1071)
svm.fit = svm(as.factor(y) ~ x1 + x2, data, kernel = 'linear', cost = 0.1)
svm.pred = predict(svm.fit, data)
data.pos = data[svm.pred == 1, ]
data.neg = data[svm.pred == 0, ]
plot(data.pos$x1, data.pos$x2, col='green', xlab="X1", ylab="X2", pch="+")
points(data.neg$x1, data.neg$x2, col='red', pch=4)
svm.fit = svm(as.factor(y) ~ x1 + x2, data, gamma = 1)
svm.pred = predict(svm.fit, data)
data.pos = data[svm.pred == 1, ]
data.neg = data[svm.pred == 0, ]
plot(data.pos$x1, data.pos$x2, col="green", xlab = "X2", ylab="X2", pch="+")
points(data.neg$x1, data.neg$x2, col='red', pch = 4)
Logistic with no interactions and SVMs with linear kernels were not able to find the decision boundary. Adding interactions to the logistic regression did appear to help a little, however all of the above shows us that SVMs with non-linear kernels are good at finding the non-linear boundary.
In this problem, you will use support vector approaches in order to predicct whether a given car gets a high or low gas mileage based on the Auto data set.
library(ISLR)
gas.med = median(Auto$mpg)
new.var = ifelse(Auto$mpg > gas.med, 1, 0)
Auto$mpglevel = as.factor(new.var)
library(e1071)
set.seed(3255)
tune.out = tune(svm, mpglevel ~ ., data = Auto, kernel = "linear", ranges = list(cost = c(0.1, 1, 5, 10, 100)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.01269231
##
## - Detailed performance results:
## cost error dispersion
## 1 0.1 0.05102564 0.06923024
## 2 1.0 0.01269231 0.02154160
## 3 5.0 0.01519231 0.01760469
## 4 10.0 0.02025641 0.02303772
## 5 100.0 0.03294872 0.02898463
The cross-validation error is minimized when cost = 1.
set.seed(21)
tune.out = tune(svm, mpglevel ~ ., data=Auto, kernel="polynomial", ranges=list(cost = c(0.1, 1, 5, 10), degree=c(2,3,4)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.5435897
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5587821 0.04538579
## 2 1.0 2 0.5587821 0.04538579
## 3 5.0 2 0.5587821 0.04538579
## 4 10.0 2 0.5435897 0.05611162
## 5 0.1 3 0.5587821 0.04538579
## 6 1.0 3 0.5587821 0.04538579
## 7 5.0 3 0.5587821 0.04538579
## 8 10.0 3 0.5587821 0.04538579
## 9 0.1 4 0.5587821 0.04538579
## 10 1.0 4 0.5587821 0.04538579
## 11 5.0 4 0.5587821 0.04538579
## 12 10.0 4 0.5587821 0.04538579
The cross-validation error is minimized when cost = 10 and degree is = 2.
svm.linear = svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm.poly = svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 10,
degree = 2)
svm.radial = svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10, gamma = 0.01)
plotpairs = function(fit) {
for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
}
}
plotpairs(svm.linear)
plotpairs(svm.poly)
plotpairs(svm.radial)
This package involves the OJ data set which is part of the ISLR package.
library(ISLR)
set.seed(9004)
train = sample(dim(OJ)[1],800)
OJ.train = OJ[train, ]
OJ.test = OJ[-train, ]
library(e1071)
svm.linear = svm(Purchase ~ ., kernel='linear', data = OJ.train, cost=0.01)
summary(svm.linear)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 442
##
## ( 222 220 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
442 support vectors out of the 800 training points. 222 belong to the CH level and 215 belong to the MM level.
train.pred = predict(svm.linear, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 432 51
## MM 80 237
(80+51)/(80+51+432+237)
## [1] 0.16375
test.pred = predict(svm.linear, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 146 24
## MM 22 78
(22+24)/(22+24+146+78)
## [1] 0.1703704
The training error rate is 16.4% and the test error rate is 17%.
set.seed(1554)
tune.out = tune(svm, Purchase ~ ., data = OJ.train, kernel = "linear", ranges=list(cost=10^seq(-2, 1, by =0.25)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 3.162278
##
## - best performance: 0.1625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.16750 0.03395258
## 2 0.01778279 0.16875 0.02960973
## 3 0.03162278 0.16625 0.02638523
## 4 0.05623413 0.16875 0.03076005
## 5 0.10000000 0.16875 0.02901748
## 6 0.17782794 0.16750 0.02838231
## 7 0.31622777 0.17000 0.02898755
## 8 0.56234133 0.16875 0.02841288
## 9 1.00000000 0.16500 0.03106892
## 10 1.77827941 0.16500 0.03106892
## 11 3.16227766 0.16250 0.03118048
## 12 5.62341325 0.16375 0.02664713
## 13 10.00000000 0.16750 0.02581989
svm.linear = svm(Purchase ~ ., kernel = "linear", data = OJ.train, cost = tune.out$best.parameters$cost)
train.pred = predict(svm.linear, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 428 55
## MM 74 243
(74+55)/(428+55+74+243)
## [1] 0.16125
test.pred = predict(svm.linear, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 146 24
## MM 20 80
(20+24)/(146+24+20+80)
## [1] 0.162963
The training error is 16.25% and the test error rate is 16.23%.
set.seed(410)
svm.radial = svm(Purchase ~ ., data=OJ.train, kernel="radial")
summary(svm.radial)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 371
##
## ( 188 183 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred = predict(svm.radial, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 441 42
## MM 74 243
(74+42)/(74+42+441+243)
## [1] 0.145
test.pred = predict(svm.radial, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 148 22
## MM 27 73
(27+22)/(27+22+148+73)
## [1] 0.1814815
set.seed(755)
tune.out = tune(svm, Purchase ~ ., data = OJ.train, kernel = "radial", ranges = list(cost = 10^seq(-2,
1, by = 0.25)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.3162278
##
## - best performance: 0.1675
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.39625 0.06615691
## 2 0.01778279 0.39625 0.06615691
## 3 0.03162278 0.35375 0.09754807
## 4 0.05623413 0.20000 0.04249183
## 5 0.10000000 0.17750 0.04073969
## 6 0.17782794 0.17125 0.03120831
## 7 0.31622777 0.16750 0.04216370
## 8 0.56234133 0.16750 0.03782269
## 9 1.00000000 0.17250 0.03670453
## 10 1.77827941 0.17750 0.03374743
## 11 3.16227766 0.18000 0.04005205
## 12 5.62341325 0.18000 0.03446012
## 13 10.00000000 0.18625 0.04427267
svm.radial = svm(Purchase ~ ., data = OJ.train, kernel = "radial", cost = tune.out$best.parameters$cost)
train.pred = predict(svm.radial, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 440 43
## MM 81 236
(81+43)/(81+43+440+236)
## [1] 0.155
test.pred = predict(svm.radial, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 145 25
## MM 28 72
(28+25)/(145+72+28+25)
## [1] 0.1962963
The training error is 15.5% and the test error rate is about 20%. The training error rate is 14.5% and the test error rate is 18.1%.
set.seed(8112)
svm.poly = svm(Purchase ~ ., data = OJ.train, kernel = "poly", degree = 2)
summary(svm.poly)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "poly", degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 456
##
## ( 232 224 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train.pred = predict(svm.poly, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 450 33
## MM 111 206
(111+33)/(111+33+450+206)
## [1] 0.18
test.pred = predict(svm.poly, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 149 21
## MM 34 66
(34+21)/(149+34+21+66)
## [1] 0.2037037
The training error is 18% and the test error is about 20%.
set.seed(322)
tune.out = tune(svm, Purchase ~ ., data = OJ.train, kernel = "poly", degree = 2,
ranges = list(cost = 10^seq(-2, 1, by = 0.25)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.18
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.39250 0.05749396
## 2 0.01778279 0.37500 0.05863020
## 3 0.03162278 0.36375 0.05756940
## 4 0.05623413 0.33875 0.06626179
## 5 0.10000000 0.30375 0.05172376
## 6 0.17782794 0.24000 0.04440971
## 7 0.31622777 0.21000 0.04362084
## 8 0.56234133 0.20250 0.03987829
## 9 1.00000000 0.20375 0.03634805
## 10 1.77827941 0.19500 0.04866267
## 11 3.16227766 0.18750 0.04409586
## 12 5.62341325 0.18875 0.04185375
## 13 10.00000000 0.18000 0.03593976
svm.poly = svm(Purchase ~ ., data = OJ.train, kernel = "poly", degree = 2, cost = tune.out$best.parameters$cost)
train.pred = predict(svm.poly, OJ.train)
table(OJ.train$Purchase, train.pred)
## train.pred
## CH MM
## CH 447 36
## MM 85 232
(85+36)/(447+36+85+232)
## [1] 0.15125
test.pred = predict(svm.poly, OJ.test)
table(OJ.test$Purchase, test.pred)
## test.pred
## CH MM
## CH 148 22
## MM 28 72
(28+22)/(28+22+148+72)
## [1] 0.1851852
The training error is 15.13% and the test error is about 19%.
The radial basis kernel appears to be the best at producing the minimum misclassification error on the training and testing data.