We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
set.seed(100)
# Simulate 500 observations with a non-linear class boundary defined by x1^2 - x2^2 = 0
x1 = runif(500) - 0.5
x2 = runif(500) - 0.5
y = 1 * (x1^2 - x2^2 > 0)
plot(x1[y == 0], x2[y == 0], col = "red", xlab = "X1", ylab = "X2")
points(x1[y == 1], x2[y == 1], col = "blue", pch = 4)
data <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
# Logistic regression using only linear terms in x1 and x2
lm_fit <- glm(y ~ ., data = data, family = 'binomial')
lm_probs <- predict(lm_fit, newdata = data, type = 'response')
lm_preds <- ifelse(lm_probs > 0.5, 1, 0)
plot(data$x1, data$x2, col = lm_preds + 2)
# Logistic regression using quadratic transformations of x1 and x2 plus an interaction
lm_fit_nl = glm(y ~ poly(x1, 2) + poly(x2, 2) + I(x1 * x2), data = data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
These warnings arise because the quadratic model separates the two classes almost perfectly, so the fitted probabilities are driven to 0 and 1.
summary(lm_fit_nl)
##
## Call:
## glm(formula = y ~ poly(x1, 2) + poly(x2, 2) + I(x1 * x2), family = binomial,
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.564e-03 -2.000e-08 -2.000e-08 2.000e-08 1.502e-03
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -222.21 13258.36 -0.017 0.987
## poly(x1, 2)1 3902.33 364800.33 0.011 0.991
## poly(x1, 2)2 33807.47 937118.99 0.036 0.971
## poly(x2, 2)1 -656.84 387630.34 -0.002 0.999
## poly(x2, 2)2 -35213.02 1032138.51 -0.034 0.973
## I(x1 * x2) 64.72 130327.27 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9276e+02 on 499 degrees of freedom
## Residual deviance: 5.1269e-06 on 494 degrees of freedom
## AIC: 12
##
## Number of Fisher Scoring iterations: 25
lm_probs = predict(lm_fit_nl, data, type = "response")
lm_preds = ifelse(lm_probs > 0.5, 1, 0)
data_pos = data[lm_preds == 1, ]
data_neg = data[lm_preds == 0, ]
plot(data_pos$x1, data_pos$x2, col = "blue", xlab = "X1", ylab = "X2", pch = "+")
points(data_neg$x1, data_neg$x2, col = "red", pch = 4)
library(e1071)
svm_fit <- svm(y ~ ., data = data, kernel = 'linear', cost = 0.01)
plot(svm_fit, data)
# SVM with a (default) radial kernel; y is already a factor in data
svm_fit = svm(y ~ x1 + x2, data = data, kernel = "radial", gamma = 1)
svm_preds = predict(svm_fit, data)
data_pos = data[svm_preds == 1, ]
data_neg = data[svm_preds == 0, ]
plot(data_pos$x1, data_pos$x2, col = "blue", xlab = "X1", ylab = "X2", pch = "+")
points(data_neg$x1, data_neg$x2, col = "red", pch = 4)
The radial-kernel SVM recovers the non-linear decision boundary without any manual feature engineering, which makes SVMs with non-linear kernels a natural choice for this kind of problem.
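As a rough numerical check (a sketch added here, not part of the original analysis; the object names svm_lin and svm_rad are new), the training misclassification rates of a linear-kernel and a radial-kernel SVM on this simulated data can be compared directly:
svm_lin = svm(y ~ x1 + x2, data = data, kernel = "linear", cost = 0.01)
svm_rad = svm(y ~ x1 + x2, data = data, kernel = "radial", gamma = 1)
mean(predict(svm_lin, data) != data$y)  # linear kernel: training error rate
mean(predict(svm_rad, data) != data$y)  # radial kernel: training error rate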
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
library(ISLR)
# Binary response: 1 if the car's mpg is above the median, 0 otherwise
gas_med = median(Auto$mpg)
new_gas_var = ifelse(Auto$mpg > gas_med, 1, 0)
Auto$mpglevel = as.factor(new_gas_var)
set.seed(100)
horl_gas = tune(svm, mpglevel ~ ., data = Auto, kernel = "linear", ranges = list(cost = c(0.01,
0.1, 1, 5, 10, 100)))
summary(horl_gas)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.01512821
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.07391026 0.04398186
## 2 1e-01 0.05102564 0.03408666
## 3 1e+00 0.01512821 0.02421271
## 4 5e+00 0.01775641 0.01700310
## 5 1e+01 0.02538462 0.02372507
## 6 1e+02 0.03564103 0.02125655
The lowest cross-validation error is 0.01512821, obtained with cost = 1.
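The tuned object also stores the best parameters and the refit model directly (shown here as a sketch, not part of the original output; best_linear is a new name):
horl_gas$best.parameters   # cost value with the lowest CV error
horl_gas$best.performance  # the corresponding CV error
best_linear = horl_gas$best.model  # SVM already refit at the best cost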
set.seed(100)
horl_gas2 = tune(svm, mpglevel ~ ., data = Auto, kernel = "polynomial", ranges = list(cost = c(0.1,
1, 5, 10), degree = c(2, 3, 4)))
summary(horl_gas2)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 0.1 3
##
## - best performance: 0.5435897
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5485897 0.06532230
## 2 1.0 2 0.5485897 0.06532230
## 3 5.0 2 0.5485897 0.06532230
## 4 10.0 2 0.5539744 0.07981094
## 5 0.1 3 0.5435897 0.07674778
## 6 1.0 3 0.5435897 0.07674778
## 7 5.0 3 0.5435897 0.07674778
## 8 10.0 3 0.5435897 0.07674778
## 9 0.1 4 0.5485897 0.06532230
## 10 1.0 4 0.5485897 0.06532230
## 11 5.0 4 0.5485897 0.06532230
## 12 10.0 4 0.5485897 0.06532230
The lowest cross-validation error is 0.5435897, obtained with cost = 0.1 and degree = 3; the polynomial kernel performs far worse than the linear kernel on this data.
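A radial kernel can be tuned over cost and gamma in the same way (a sketch, not run in the original; the grid values and the name horl_gas3 are illustrative):
set.seed(100)
horl_gas3 = tune(svm, mpglevel ~ ., data = Auto, kernel = "radial", ranges = list(cost = c(0.1,
    1, 5, 10), gamma = c(0.01, 0.1, 1)))
summary(horl_gas3)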
svm_linear = svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm_poly = svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 10,
degree = 2)
svm_radial = svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10, gamma = 0.01)
# Plot the fitted SVM in two dimensions: mpg against each remaining predictor
plotpairs = function(fit) {
  for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
    plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
  }
}
plotpairs(svm_linear)
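The same plots can be produced for the polynomial and radial fits:
plotpairs(svm_poly)
plotpairs(svm_radial)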
We now turn to the OJ data set, predicting Purchase from the remaining variables. We first split the data into a training set of 800 observations and a test set containing the rest.
set.seed(100)
train = sample(dim(OJ)[1], 800)
oj_train = OJ[train, ]
oj_test = OJ[-train, ]
svm_linear = svm(Purchase ~ ., kernel = "linear", data = oj_train, cost = 0.01)
summary(svm_linear)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 432
##
## ( 216 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train_preds = predict(svm_linear, oj_train)
table(oj_train$Purchase, train_preds)
## train_preds
## CH MM
## CH 433 55
## MM 78 234
test_preds = predict(svm_linear, oj_test)
table(oj_test$Purchase, test_preds)
## test_preds
## CH MM
## CH 147 18
## MM 26 79
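From these confusion matrices, the training error rate is (55 + 78)/800 ≈ 0.166 and the test error rate is (18 + 26)/270 ≈ 0.163. The same quantities can be computed directly (a short addition, not in the original output):
mean(train_preds != oj_train$Purchase)  # training error rate
mean(test_preds != oj_test$Purchase)    # test error rate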
set.seed(100)
tune_out_op_cost = tune(svm, Purchase ~ ., data = oj_train, kernel = "linear", ranges = list(cost = 10^seq(-2,
1, by = 0.25)))
summary(tune_out_op_cost)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.05623413
##
## - best performance: 0.16625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.17500 0.04639804
## 2 0.01778279 0.17375 0.03972562
## 3 0.03162278 0.16875 0.04299952
## 4 0.05623413 0.16625 0.04411554
## 5 0.10000000 0.16875 0.03875224
## 6 0.17782794 0.16875 0.04007372
## 7 0.31622777 0.17375 0.03928617
## 8 0.56234133 0.17500 0.03908680
## 9 1.00000000 0.17375 0.04101575
## 10 1.77827941 0.17125 0.03230175
## 11 3.16227766 0.17375 0.03251602
## 12 5.62341325 0.17000 0.04005205
## 13 10.00000000 0.17000 0.03782269
svm_linear = svm(Purchase ~ ., kernel = "linear", data = oj_train, cost = tune_out_op_cost$best.parameters$cost)
train_preds = predict(svm_linear, oj_train)
table(oj_train$Purchase, train_preds)
## train_preds
## CH MM
## CH 427 61
## MM 67 245
test_preds = predict(svm_linear, oj_test)
table(oj_test$Purchase, test_preds)
## test_preds
## CH MM
## CH 141 24
## MM 25 80
set.seed(100)
svm_radial = svm(Purchase ~ ., data = oj_train, kernel = "radial")
summary(svm_radial)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 368
##
## ( 187 181 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train_preds = predict(svm_radial, oj_train)
table(oj_train$Purchase, train_preds)
## train_preds
## CH MM
## CH 448 40
## MM 69 243
test_preds = predict(svm_radial, oj_test)
table(oj_test$Purchase, test_preds)
## test_preds
## CH MM
## CH 147 18
## MM 32 73
set.seed(755)
tune_out_rad = tune(svm, Purchase ~ ., data = oj_train, kernel = "radial", ranges = list(cost = 10^seq(-2,
1, by = 0.25)))
summary(tune_out_rad)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1.778279
##
## - best performance: 0.15875
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01000000 0.39000 0.06529846
## 2 0.01778279 0.39000 0.06529846
## 3 0.03162278 0.34000 0.08203150
## 4 0.05623413 0.19375 0.05535554
## 5 0.10000000 0.18250 0.06015027
## 6 0.17782794 0.17125 0.06010696
## 7 0.31622777 0.16750 0.05109903
## 8 0.56234133 0.16625 0.04715886
## 9 1.00000000 0.16000 0.04241004
## 10 1.77827941 0.15875 0.04084609
## 11 3.16227766 0.17000 0.04377975
## 12 5.62341325 0.17000 0.04090979
## 13 10.00000000 0.17375 0.04143687
Overall, the linear and radial models perform very similarly, but the radial kernel achieves a slightly lower cross-validation error (0.15875 versus 0.16625 for the tuned linear kernel).
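To complete the comparison (a sketch following the same pattern as the linear case, not run in the original; svm_radial_tuned is a new name), the radial SVM can be refit at its best cost and evaluated on the test set:
svm_radial_tuned = svm(Purchase ~ ., kernel = "radial", data = oj_train, cost = tune_out_rad$best.parameters$cost)
test_preds = predict(svm_radial_tuned, oj_test)
table(oj_test$Purchase, test_preds)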