knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(e1071)
library(ISLR2)
library(ggplot2)
library(caret)
library(dplyr)
library(gridExtra)
set.seed(123)
Generate a simulated two-class data set with 100 observations and two features in which there is a visible but non-linear separation between the two classes. Show that in this setting, a support vector machine with a polynomial kernel (with degree greater than 1) or a radial kernel will outperform a support vector classifier on the training data. Which technique performs best on the test data? Make plots and report training and test error rates in order to back up your assertions.
# Simulate circular data
n <- 100
x <- matrix(rnorm(2 * n), ncol = 2)
y <- ifelse(x[,1]^2 + x[,2]^2 > 1.5, 1, 0)
data <- data.frame(x1 = x[,1], x2 = x[,2], y = as.factor(y))
# Plot the data
ggplot(data, aes(x1, x2, color = y)) + geom_point() +
labs(title = "Simulated Data with Non-Linear Boundary")
svm.linear <- svm(y ~ ., data = data, kernel = "linear", cost = 1)
plot(svm.linear, data)
svm.poly <- svm(y ~ ., data = data, kernel = "polynomial", degree = 3, cost = 1)
plot(svm.poly, data)
svm.radial <- svm(y ~ ., data = data, kernel = "radial", gamma = 1, cost = 1)
plot(svm.radial, data)
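The default plot.svm displays above can be hard to read. As an optional supplement (a sketch reusing the models already fitted above; the helper data frame grid and its pred column are introduced here purely for illustration), predicting over a fine grid of (x1, x2) values gives a cleaner picture of each decision region; swap in svm.linear or svm.poly to compare kernels.
# Optional: cleaner decision-region plot by predicting over a fine grid
grid <- expand.grid(
  x1 = seq(min(data$x1), max(data$x1), length.out = 200),
  x2 = seq(min(data$x2), max(data$x2), length.out = 200)
)
grid$pred <- predict(svm.radial, grid)  # replace with svm.linear or svm.poly
ggplot() +
  geom_tile(data = grid, aes(x1, x2, fill = pred), alpha = 0.3) +
  geom_point(data = data, aes(x1, x2, color = y)) +
  labs(title = "Radial SVM Decision Region")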
trainIndex <- sample(1:n, 0.7 * n)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
models <- list(
linear = svm(y ~ ., data = train, kernel = "linear", cost = 1),
poly = svm(y ~ ., data = train, kernel = "polynomial", degree = 3, cost = 1),
radial = svm(y ~ ., data = train, kernel = "radial", gamma = 1, cost = 1)
)
errorRates <- sapply(models, function(model) {
train.pred <- predict(model, train)
test.pred <- predict(model, test)
c(
train = mean(train.pred != train$y),
test = mean(test.pred != test$y)
)
})
knitr::kable(t(errorRates), caption = "Training and Test Error Rates")
|       |     train|      test|
|:------|---------:|---------:|
|linear | 0.3428571| 0.3333333|
|poly   | 0.3285714| 0.1666667|
|radial | 0.0142857| 0.0666667|
The radial kernel SVM outperformed the other two by a wide margin on both the training data (1.4% error) and the test data (6.7% error). This is expected: the true class boundary is circular, which the radial kernel captures naturally. The polynomial kernel improves on the support vector classifier on the test set (16.7% vs. 33.3%), but the linear classifier cannot represent a circular boundary at all and misclassifies roughly a third of the observations.
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.
data("Auto")
Auto <- na.omit(Auto)
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto$mpg01 <- as.factor(Auto$mpg01)
Auto <- Auto[, -which(names(Auto) == "mpg")]
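As a quick optional sanity check, a median split should produce roughly balanced classes (up to ties at the median):
# Confirm the median split gives roughly balanced classes
table(Auto$mpg01)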
Fit a support vector classifier to the data with various values of cost, in order to predict whether a car gets high or low gas mileage. Report the cross-validation errors associated with different values of this parameter. Comment on your results. Note you will need to fit the classifier without the gas mileage variable to produce sensible results.
set.seed(1)
cv.linear <- tune(svm, mpg01 ~ ., data = Auto, kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(cv.linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.08673077
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.08923077 0.04698309
## 2 1e-01 0.08673077 0.04040897
## 3 1e+00 0.09961538 0.04923181
## 4 1e+01 0.11237179 0.05701890
## 5 1e+02 0.11750000 0.06208951
In this analysis, we fit a support vector classifier with a linear kernel to the Auto dataset in order to predict whether a car achieves high or low gas mileage. The binary response variable mpg01 was created by assigning a value of 1 to cars with above-median miles per gallon. Using 10-fold cross-validation, we evaluated the model across various values of the cost parameter. The lowest cross-validation error rate (8.67%) was achieved when cost = 0.1. Performance worsened for both smaller and larger values of cost, suggesting that this value strikes a balance between underfitting and overfitting. Based on these results, cost = 0.1 was selected as the optimal tuning parameter for the linear SVM.
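The U-shaped relationship between cost and cross-validation error described above can be visualized directly; e1071 provides a plot method for tune objects:
# Plot cross-validation error against cost for the tuned linear SVM
plot(cv.linear)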
Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of gamma and degree and cost. Comment on your results.
# Radial
set.seed(1)
cv.radial <- tune(svm, mpg01 ~ ., data = Auto, kernel = "radial",
ranges = list(cost = c(0.1, 1, 10), gamma = c(0.5, 1, 2)))
summary(cv.radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 1
##
## - best performance: 0.07897436
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.5 0.08410256 0.04164179
## 2 1.0 0.5 0.08673077 0.04708817
## 3 10.0 0.5 0.09173077 0.04008042
## 4 0.1 1.0 0.55115385 0.04366593
## 5 1.0 1.0 0.07903846 0.04891067
## 6 10.0 1.0 0.07897436 0.04869339
## 7 0.1 2.0 0.55115385 0.04366593
## 8 1.0 2.0 0.13769231 0.06926822
## 9 10.0 2.0 0.13512821 0.06692968
# Polynomial
set.seed(1)
cv.poly <- tune(svm, mpg01 ~ ., data = Auto, kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 10), degree = c(2, 3, 4)))
summary(cv.poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.520641
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5511538 0.04366593
## 2 1.0 2 0.5511538 0.04366593
## 3 10.0 2 0.5206410 0.08505283
## 4 0.1 3 0.5511538 0.04366593
## 5 1.0 3 0.5511538 0.04366593
## 6 10.0 3 0.5511538 0.04366593
## 7 0.1 4 0.5511538 0.04366593
## 8 1.0 4 0.5511538 0.04366593
## 9 10.0 4 0.5511538 0.04366593
Cross-validation revealed that the radial kernel achieved the lowest error rate of 7.9%, slightly outperforming the linear kernel from Part B. In contrast, the polynomial kernel performed poorly, with a cross-validation error above 52% for every setting tried. Because that error is nearly constant across cost and degree, and matches the degenerate radial fits at cost = 0.1, it points to an effectively constant classifier (a model mismatch or underfit) rather than overfitting; overfitting would instead show low training error alongside high test error. These results suggest that a radial kernel is more effective at capturing the underlying structure of the data than either the linear or polynomial alternatives, likely due to its flexibility in modeling non-linear relationships without introducing excessive variance. An exploratory sketch follows this paragraph.
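One optional follow-up (beyond what the question asks): with coef0 at its default of 0 and a small gamma, polynomial kernel values can collapse toward zero, so widening the search over gamma and coef0 may give the polynomial kernel a fair chance. The object name cv.poly2 is introduced here for illustration.
# Exploratory: retune the polynomial kernel over gamma and coef0 as well
set.seed(1)
cv.poly2 <- tune(svm, mpg01 ~ ., data = Auto, kernel = "polynomial",
                 ranges = list(cost = c(0.1, 1, 10), degree = c(2, 3),
                               gamma = c(0.01, 0.1, 1), coef0 = c(0, 1)))
cv.poly2$best.performance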
Make some plots to back up your assertions in (b) and (c).
Hint: In the lab, we used the plot() function for svm objects only in cases with p = 2. When p > 2, you can use the plot() function to create plots displaying pairs of variables at a time. Essentially, instead of typing
plot(svmfit, dat)
where svmfit contains your fitted model and dat is a data frame containing your data, you can type
plot(svmfit, dat, x1 ~ x4)
in order to plot just the first and fourth variables. However, you must replace x1 and x4 with the correct variable names. To find out more, type ?plot.svm.
best.linear <- cv.linear$best.model
best.radial <- cv.radial$best.model
best.poly <- cv.poly$best.model
par(mfrow = c(1, 1))
plot(best.linear, Auto, horsepower ~ weight, main = "Linear Kernel")
plot(best.radial, Auto, horsepower ~ weight, main = "Radial Kernel")
plot(best.poly, Auto, horsepower ~ weight, main = "Poly Kernel")
This problem involves the OJ data set which is part of the ISLR2 package.
Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
data(OJ)
set.seed(1)
trainIndex <- sample(1:nrow(OJ), 800)
OJ.train <- OJ[trainIndex, ]
OJ.test <- OJ[-trainIndex, ]
Fit a support vector classifier to the training data using cost = 0.01, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics, and describe the results obtained.
svm.linearOJ <- svm(Purchase ~ ., data = OJ.train, kernel = "linear", cost = 0.01)
summary(svm.linearOJ)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 435
##
## ( 219 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
What are the training and test error rates?
train.pred <- predict(svm.linearOJ, OJ.train)
test.pred <- predict(svm.linearOJ, OJ.test)
train.error <- mean(train.pred != OJ.train$Purchase)
test.error <- mean(test.pred != OJ.test$Purchase)
c(train = train.error, test = test.error)
## train test
## 0.1750000 0.1777778
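For a finer-grained view than the overall error rate, an optional confusion matrix shows how the test misclassifications split between the CH and MM classes:
# Break down test misclassifications by class
table(predicted = test.pred, actual = OJ.test$Purchase)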
Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10.
cv.oj.linear <- tune(svm, Purchase ~ ., data = OJ.train, kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(cv.oj.linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.17125
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17375 0.03884174
## 2 0.10 0.17875 0.03064696
## 3 1.00 0.17500 0.03061862
## 4 10.00 0.17125 0.03488573
Compute the training and test error rates using this new value for cost.
best.model.linear <- cv.oj.linear$best.model
train.err.best <- mean(predict(best.model.linear, OJ.train) != OJ.train$Purchase)
test.err.best <- mean(predict(best.model.linear, OJ.test) != OJ.test$Purchase)
c(train = train.err.best, test = test.err.best)
## train test
## 0.1637500 0.1481481
Repeat parts (b) through (e) using a support vector machine with a radial kernel. Use the default value for gamma.
cv.oj.radial <- tune(svm, Purchase ~ ., data = OJ.train, kernel = "radial",
ranges = list(cost = c(0.1, 1, 10)))
summary(cv.oj.radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.17625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.1 0.18250 0.05470883
## 2 1.0 0.17625 0.03793727
## 3 10.0 0.18125 0.04340139
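To complete the repeat of part (e) for the radial kernel, the tuned model's training and test errors can be computed as before (the comparison table in the final part reports the same quantities; the object name best.radialOJ is introduced here for illustration):
# Training and test error for the tuned radial SVM
best.radialOJ <- cv.oj.radial$best.model
c(train = mean(predict(best.radialOJ, OJ.train) != OJ.train$Purchase),
  test = mean(predict(best.radialOJ, OJ.test) != OJ.test$Purchase))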
Repeat parts (b) through (e) using a support vector machine with a polynomial kernel. Set degree = 2.
cv.oj.poly <- tune(svm, Purchase ~ ., data = OJ.train, kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 10)), degree = 2)
summary(cv.oj.poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.18625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.1 0.32375 0.06730166
## 2 1.0 0.20000 0.05137012
## 3 10.0 0.18625 0.05185785
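Likewise for the polynomial kernel, completing the repeat of part (e) (best.polyOJ is again an illustrative name):
# Training and test error for the tuned polynomial SVM
best.polyOJ <- cv.oj.poly$best.model
c(train = mean(predict(best.polyOJ, OJ.train) != OJ.train$Purchase),
  test = mean(predict(best.polyOJ, OJ.test) != OJ.test$Purchase))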
Overall, which approach seems to give the best results on this data?
errs <- data.frame(
Model = c("Linear", "Radial", "Polynomial"),
TrainError = c(
mean(predict(cv.oj.linear$best.model, OJ.train) != OJ.train$Purchase),
mean(predict(cv.oj.radial$best.model, OJ.train) != OJ.train$Purchase),
mean(predict(cv.oj.poly$best.model, OJ.train) != OJ.train$Purchase)
),
TestError = c(
mean(predict(cv.oj.linear$best.model, OJ.test) != OJ.test$Purchase),
mean(predict(cv.oj.radial$best.model, OJ.test) != OJ.test$Purchase),
mean(predict(cv.oj.poly$best.model, OJ.test) != OJ.test$Purchase)
)
)
knitr::kable(errs, caption = "Train and Test Error Comparison (OJ Data)")
|Model      | TrainError| TestError|
|:----------|----------:|---------:|
|Linear     |    0.16375| 0.1481481|
|Radial     |    0.15125| 0.1851852|
|Polynomial |    0.15000| 0.1888889|
To evaluate model performance on the OJ dataset, we compared support vector machines using linear, radial, and polynomial kernels. While all models performed reasonably well, the linear kernel achieved the lowest test error rate of 14.8%, making it the most effective at generalizing to unseen data. The radial and polynomial kernels had slightly lower training errors, but their test error rates were higher (18.5% and 18.9%, respectively), indicating potential overfitting. These results suggest that the additional complexity introduced by non-linear kernels did not improve out-of-sample performance, and in this case, a simpler linear decision boundary generalized best.
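As an optional visual check of the winning linear fit, plot.svm can project its decision regions onto two predictors at a time; LoyalCH and PriceDiff are chosen here purely for illustration.
# Decision regions of the tuned linear SVM on two illustrative predictors
plot(cv.oj.linear$best.model, OJ.train, LoyalCH ~ PriceDiff)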