library(ISLR2)
library(ggplot2)
library(e1071)
We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
(a) Generate a data set with n = 500 and p = 2, such that the observations belong to two classes with a quadratic decision boundary between them. For instance, you can do this as follows:
x1 <- runif (500) - 0.5
x2 <- runif (500) - 0.5
y <- 1 * (x1^2 - x2^2 > 0)
(b) Plot the observations, colored according to their class labels.
plot(x1, x2, col = y + 1, pch = 20, xlab = "X1", ylab = "X2")
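Since ggplot2 is loaded above but not otherwise used, the same plot can also be drawn with it (a minimal sketch, not part of the original answer):
# ggplot2 version of the class-label plot; points are colored by the true class y
ggplot(data.frame(x1, x2, y = factor(y)), aes(x1, x2, colour = y)) +
  geom_point() +
  labs(x = "X1", y = "X2", colour = "Class")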
(c) Fit a logistic regression model to the data, using X1 and X2 as
predictors.
log.mod = glm(y ~ x1 + x2, family = "binomial")
summary(log.mod)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.05078 0.08980 -0.565 0.572
## x1 0.26418 0.32027 0.825 0.409
## x2 0.40897 0.31439 1.301 0.193
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.86 on 499 degrees of freedom
## Residual deviance: 690.42 on 497 degrees of freedom
## AIC: 696.42
##
## Number of Fisher Scoring iterations: 3
(d) Apply this model to the training data in order to obtain a predicted class label for each training observation, and plot the observations colored according to the predicted labels. The decision boundary is linear.
df <- data.frame(y = factor(y), x1 = x1, x2 = x2)
y_prob <- predict(log.mod, df, type = "response")
df$y_pred <- ifelse(y_prob > 0.5, 1, 0)
plot(x1, x2, col = df$y_pred + 1, pch = 20, xlab = "X1", ylab = "X2")
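As a quick check (not asked for in the exercise), the training error rate of this linear-in-the-features model can be computed from the predictions above; since a linear boundary cannot track the quadratic class boundary, it should be far worse than the non-linear fits that follow.
# Training error rate of the linear logistic regression model
mean(df$y_pred != y)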
(e) Now fit a logistic regression model to the data using non-linear functions of X1 and X2 as predictors (e.g. X1^2, X1 × X2, log(X2), and so forth).
mod_nonlinear <- glm(y ~ poly(x1, 2) + log(x2), data = df, family = "binomial")
## Warning in log(x2): NaNs produced
summary(mod_nonlinear)
##
## Call:
## glm(formula = y ~ poly(x1, 2) + log(x2), family = "binomial",
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.3051 0.7492 -7.081 1.43e-12 ***
## poly(x1, 2)1 4.5152 4.7104 0.959 0.338
## poly(x1, 2)2 60.9615 8.2324 7.405 1.31e-13 ***
## log(x2) -3.3592 0.4646 -7.230 4.85e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 364.41 on 262 degrees of freedom
## Residual deviance: 147.05 on 259 degrees of freedom
## (237 observations deleted due to missingness)
## AIC: 155.05
##
## Number of Fisher Scoring iterations: 6
(f) Apply this model to the training data to obtain a predicted class label for each training observation, and plot the observations colored according to the predicted labels; the decision boundary is now clearly non-linear.
nl_y_prob <- predict(mod_nonlinear, df, type = "response")
## Warning in log(x2): NaNs produced
df$nl_y_pred <- ifelse(nl_y_prob > 0.5, 1, 0)
plot(x1, x2, col = df$nl_y_pred + 1, pch = 20, xlab = "X1", ylab = "X2")
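Because x2 lies in (-0.5, 0.5), log(x2) is undefined for negative values, which is what produces the NaN warnings above and the 237 dropped observations. A sketch of one possible alternative specification that avoids this and matches the quadratic form of the true boundary (not the model fitted above):
# Quadratic terms in both predictors; defined for every observation.
# glm() may warn that fitted probabilities of 0 or 1 occurred, since these
# terms can separate the two classes perfectly.
mod_quad <- glm(y ~ I(x1^2) + I(x2^2), data = df, family = "binomial")
quad_prob <- predict(mod_quad, df, type = "response")
plot(x1, x2, col = (quad_prob > 0.5) + 1, pch = 20, xlab = "X1", ylab = "X2")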
(g) Fit a support vector classifier to the data with X1 and X2 as
predictors. Obtain a class prediction for each training observation.
Plot the observations, colored according to the predicted class
labels.
# y is stored as a factor in df, so svm() performs classification rather than regression
svm_linear <- svm(y ~ x1 + x2, data = df, kernel = "linear")
y_pred_svm <- predict(svm_linear, df)
plot(x1, x2, col = as.integer(y_pred_svm), pch = 20, xlab = "X1", ylab = "X2")
(h) Fit a SVM using a non-linear kernel to the data. Obtain a class
prediction for each training observation. Plot the observations, colored
according to the predicted class labels.
svm_nonlinear <- svm(y ~ x1 + x2, data = df, kernel = "radial")
y_pred_svm_nl <- predict(svm_nonlinear, df)
plot(x1, x2, col = as.integer(y_pred_svm_nl), pch = 20, xlab = "X1", ylab = "X2")
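e1071 also provides a plot method for fitted svm objects that shades the predicted decision regions; a quick sketch for the two fits above (this assumes the factor-response versions of the models fitted in (g) and (h)):
# Decision-region plots; x1 on the horizontal axis, x2 on the vertical axis
plot(svm_linear, df, x2 ~ x1)
plot(svm_nonlinear, df, x2 ~ x1)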
(i) Comment on your results.
Logistic regression on the raw features and the support vector classifier with a linear kernel both produce linear decision boundaries, so neither captures the quadratic boundary between the classes. Logistic regression with non-linear transformations of the features and the SVM with a radial kernel both recover a boundary close to the true quadratic one, and the radial SVM does so without the transformations having to be chosen by hand.
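One way to make this comparison concrete is to tabulate the training error rate of each approach (a sketch using the objects created above):
# Training error rates for the four fits; NAs in nl_y_pred come from log(x2)
c(logistic_linear    = mean(df$y_pred != y),
  logistic_nonlinear = mean(df$nl_y_pred != y, na.rm = TRUE),
  svm_linear         = mean(predict(svm_linear, df) != df$y),
  svm_radial         = mean(predict(svm_nonlinear, df) != df$y))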
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
(a) Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.
Auto$mpg_binary <- ifelse(Auto$mpg > median(Auto$mpg),1,0)
Auto$mpg_binary <- as.factor(Auto$mpg_binary)
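As a quick sanity check (not in the original output), the median split should produce two classes of roughly equal size:
# Class counts after splitting mpg at its median
table(Auto$mpg_binary)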
set.seed(123)
svm_cv <- tune(svm, mpg_binary ~ . - mpg, data = Auto, kernel = "linear",
               ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(svm_cv)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.08910256
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.08910256 0.03791275
## 2 1e-01 0.09403846 0.04842472
## 3 1e+00 0.09147436 0.05817937
## 4 5e+00 0.10423077 0.06425850
## 5 1e+01 0.10673077 0.06562804
## 6 1e+02 0.12724359 0.06371052
The lowest cross-validation error rate (0.089) occurs at cost = 0.01.
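The model refit at that cost can be pulled straight from the tuning object (a short sketch; tune() refits the best model on the full data set by default):
# Model refit at the best cost found by cross-validation
best_linear_svm <- svm_cv$best.model
summary(best_linear_svm)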
set.seed(123)
svm_radial_cv <- tune(svm, mpg_binary ~ . - mpg, data = Auto, kernel = "radial",
                      ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100),
                                    gamma = c(0.1, 1, 10)))
summary(svm_radial_cv)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 5 0.1
##
## - best performance: 0.07365385
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-02 0.1 0.26000000 0.09153493
## 2 1e-01 0.1 0.08910256 0.03979296
## 3 1e+00 0.1 0.08653846 0.03578151
## 4 5e+00 0.1 0.07365385 0.05975763
## 5 1e+01 0.1 0.07365385 0.05852240
## 6 1e+02 0.1 0.10442308 0.05095556
## 7 1e-02 1.0 0.58173077 0.04740051
## 8 1e-01 1.0 0.58173077 0.04740051
## 9 1e+00 1.0 0.08660256 0.04466420
## 10 5e+00 1.0 0.08916667 0.05328055
## 11 1e+01 1.0 0.08916667 0.05328055
## 12 1e+02 1.0 0.08916667 0.05328055
## 13 1e-02 10.0 0.58173077 0.04740051
## 14 1e-01 10.0 0.58173077 0.04740051
## 15 1e+00 10.0 0.53583333 0.07213142
## 16 5e+00 10.0 0.53333333 0.07201901
## 17 1e+01 10.0 0.53333333 0.07201901
## 18 1e+02 10.0 0.53333333 0.07201901
The radial kernel's best cross-validation error (0.074) occurs at cost = 5 and gamma = 0.1, improving on the linear kernel.
set.seed(123)
svm_poly_cv <- tune(svm, mpg_binary ~ . - mpg, data = Auto, kernel = "polynomial",
                    ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10),
                                  gamma = c(0.01, 0.1, 1), degree = 2))
summary(svm_poly_cv)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma degree
## 5 0.1 2
##
## - best performance: 0.1655769
##
## - Detailed performance results:
## cost gamma degree error dispersion
## 1 1e-03 0.01 2 0.5817308 0.04740051
## 2 1e-02 0.01 2 0.5817308 0.04740051
## 3 1e-01 0.01 2 0.5817308 0.04740051
## 4 1e+00 0.01 2 0.5714744 0.04575370
## 5 5e+00 0.01 2 0.3623077 0.12042520
## 6 1e+01 0.01 2 0.3112179 0.10756812
## 7 1e-03 0.10 2 0.5817308 0.04740051
## 8 1e-02 0.10 2 0.5714744 0.04575370
## 9 1e-01 0.10 2 0.3112179 0.10756812
## 10 1e+00 0.10 2 0.2776923 0.10407970
## 11 5e+00 0.10 2 0.1655769 0.06295559
## 12 1e+01 0.10 2 0.1706410 0.07111103
## 13 1e-03 1.00 2 0.3112179 0.10756812
## 14 1e-02 1.00 2 0.2776923 0.10407970
## 15 1e-01 1.00 2 0.1706410 0.07111103
## 16 1e+00 1.00 2 0.1935897 0.07764147
## 17 5e+00 1.00 2 0.2239103 0.09715624
## 18 1e+01 1.00 2 0.2240385 0.08919616
best_poly <- svm_poly_cv$best.parameters
best_radial <- svm_radial_cv$best.parameters
best_linear <- svm_cv$best.parameters
# Combine results into a data frame
best_results <- data.frame(
  Kernel = c("Polynomial", "Radial", "Linear"),
  Cost = c(best_poly$cost, best_radial$cost, best_linear$cost),
  Gamma = c(best_poly$gamma, best_radial$gamma, NA),
  Degree = c(best_poly$degree, NA, NA),
  Performance = c(svm_poly_cv$best.performance, svm_radial_cv$best.performance,
                  svm_cv$best.performance)
)
barplot(best_results$Performance,
        names.arg = best_results$Kernel,
        main = "Performance",
        xlab = "Kernel",
        ylab = "Error",
        col = "skyblue")
The radial kernel achieves the lowest cross-validation error (about 0.074), followed by the linear kernel (0.089) and the polynomial kernel (0.166).
This problem involves the OJ data set which is part of the ISLR2 package.
(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
data("OJ"); set.seed(123)
index <- sample(nrow(OJ), 800)
ojtrain <- OJ[index, ]
ojtest <- OJ[-index, ]
svm_linear <- svm(Purchase ~ ., data = ojtrain, kernel = "linear", cost = 0.01)
summary(svm_linear)
##
## Call:
## svm(formula = Purchase ~ ., data = ojtrain, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 442
##
## ( 220 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
There are 442 support vectors, with 220 belonging to class CH and 222 belonging to class MM.
ojtrain_pred <- predict(svm_linear,ojtrain)
table(ojtrain$Purchase, ojtrain_pred)
## ojtrain_pred
## CH MM
## CH 426 61
## MM 71 242
(61+71)/800
## [1] 0.165
16.5% training error rate.
ojtest_pred <- predict(svm_linear,ojtest)
table(ojtest$Purchase, ojtest_pred)
## ojtest_pred
## CH MM
## CH 145 21
## MM 27 77
(21+27)/270
## [1] 0.1777778
17.78% test error rate (note that the test set contains 270 observations, not 800).
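To avoid repeating these hand calculations (and slips like dividing by the training-set size), a small helper function (hypothetical, not in the original) computes misclassification rates directly from the predictions:
# Misclassification rate of a fitted SVM on a given data set
err_rate <- function(model, data) {
  mean(predict(model, data) != data$Purchase)
}
err_rate(svm_linear, ojtrain)  # about 0.165
err_rate(svm_linear, ojtest)   # about 0.178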
tuned_linear <- tune(svm, Purchase ~ ., data = ojtrain, kernel = "linear", ranges = list(cost = 10^(-2:1)))
summary(tuned_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.16875
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17625 0.03143004
## 2 0.10 0.17250 0.03425801
## 3 1.00 0.16875 0.03596391
## 4 10.00 0.17250 0.02751262
Cross-validation suggests the optimal cost is 1.
svm_linear_tuned <- svm(Purchase ~ ., data = ojtrain, kernel = "linear", cost = 1)
summary(svm_linear_tuned)
##
## Call:
## svm(formula = Purchase ~ ., data = ojtrain, kernel = "linear", cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 340
##
## ( 170 170 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
ojtrain_pred_tuned <- predict(svm_linear_tuned,ojtrain)
table(ojtrain$Purchase, ojtrain_pred_tuned)
## ojtrain_pred_tuned
## CH MM
## CH 428 59
## MM 69 244
(69+59)/800
## [1] 0.16
16% training error rate.
ojtest_pred_tuned <- predict(svm_linear_tuned,ojtest)
table(ojtest$Purchase, ojtest_pred_tuned)
## ojtest_pred_tuned
## CH MM
## CH 148 18
## MM 24 80
(18+24)/270
## [1] 0.1555556
15.56% test error rate.
#untuned radial
svm_radial <- svm(Purchase ~ ., data = ojtrain, kernel = "radial", cost = 0.01)
train_pred_rad <- predict(svm_radial,ojtrain)
table(ojtrain$Purchase, train_pred_rad)
## train_pred_rad
## CH MM
## CH 487 0
## MM 313 0
test_pred_rad <- predict(svm_radial,ojtest)
table(ojtest$Purchase, test_pred_rad)
## test_pred_rad
## CH MM
## CH 166 0
## MM 104 0
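With cost = 0.01 the radial-kernel fit classifies every observation as CH (the MM columns in both tables are zero), so its error rates are simply the proportions of MM purchases; a quick check (not in the original output):
# Error rates of the all-CH classifier implied by the tables above
313 / 800  # training error, 0.39125
104 / 270  # test error, about 0.385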
tuned_radial <- tune(svm, Purchase ~ ., data = ojtrain, kernel = "radial", ranges = list(cost = 10^(-2:1)))
summary(tuned_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.1625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39125 0.04411554
## 2 0.10 0.18125 0.03691676
## 3 1.00 0.16250 0.03486083
## 4 10.00 0.16500 0.03622844
#best cost is 1
#tuned
svm_radial_tuned <- svm(Purchase ~ ., data = ojtrain, kernel = "radial", cost = 1)
train_pred_rad_tuned <- predict(svm_radial_tuned,ojtrain)
table(ojtrain$Purchase, train_pred_rad_tuned)
## train_pred_rad_tuned
## CH MM
## CH 446 41
## MM 70 243
(41+70)/800
## [1] 0.13875
test_pred_rad_tuned <- predict(svm_radial_tuned,ojtest)
table(ojtest$Purchase, test_pred_rad_tuned)
## test_pred_rad_tuned
## CH MM
## CH 149 17
## MM 34 70
(17+34)/270
## [1] 0.1888889
18.89% test error rate.
#untuned polynomial
svm_poly <- svm(Purchase ~ ., data = ojtrain, kernel = "polynomial", cost = 0.01, degree = 2)
train_pred_poly <- predict(svm_poly,ojtrain)
table(ojtrain$Purchase, train_pred_poly)
## train_pred_poly
## CH MM
## CH 485 2
## MM 296 17
(2+296)/800
## [1] 0.3725
test_pred_poly <- predict(svm_poly,ojtest)
table(ojtest$Purchase, test_pred_poly)
## test_pred_poly
## CH MM
## CH 165 1
## MM 100 4
(1+100)/270
## [1] 0.3740741
37.41% test error rate.
tuned_poly <- tune(svm, Purchase ~ ., data = ojtrain, kernel = "polynomial", ranges = list(cost = 10^(-2:1)),degree=2)
summary(tuned_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.17125
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.38750 0.04289846
## 2 0.10 0.30875 0.01772671
## 3 1.00 0.18375 0.03335936
## 4 10.00 0.17125 0.03729108
#best cost is 10
svm_poly_tuned <- svm(Purchase ~ ., data = ojtrain, kernel = "polynomial", cost = 10, degree = 2)
train_pred_poly_tuned <- predict(svm_poly_tuned,ojtrain)
table(ojtrain$Purchase, train_pred_poly_tuned)
## train_pred_poly_tuned
## CH MM
## CH 451 36
## MM 79 234
(36+79)/800
## [1] 0.14375
test_pred_poly_tuned <- predict(svm_poly_tuned,ojtest)
table(ojtest$Purchase, test_pred_poly_tuned)
## test_pred_poly_tuned
## CH MM
## CH 150 16
## MM 39 65
(16+39)/270
## [1] 0.2037037
20.37% test error rate.
The tuned linear SVM has the lowest test error rate (about 15.6%), followed by the tuned radial kernel (18.9%) and the tuned polynomial kernel (20.4%).
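For reference, the test error rates implied by the confusion matrices above can be collected in one place (a sketch; the figures assume the same seed and train/test split used throughout):
# Test misclassification rates for the untuned linear fit and the three tuned models
data.frame(
  model = c("linear (cost = 0.01)", "linear (cost = 1)",
            "radial (cost = 1)", "polynomial (cost = 10, degree = 2)"),
  test_error = c((21 + 27) / 270, (18 + 24) / 270,
                 (17 + 34) / 270, (16 + 39) / 270)
)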