library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
set.seed(23)
x1 = runif(500) - 0.5
x2 = runif(500) - 0.5
y = 1 * (x1^2 - x2^2 > 0)
data.frame(x1, x2) %>%
ggplot(aes(x1, x2)) +
geom_point(aes(colour = as.factor(y))) +
theme_minimal() +
theme(legend.position = "none")
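Since y was generated as 1 whenever x1^2 - x2^2 > 0, the true decision boundary is simply the pair of lines x2 = x1 and x2 = -x1. A small sketch reusing the vectors above overlays those lines to make the quadratic structure explicit:
data.frame(x1, x2, y) %>%
  ggplot(aes(x1, x2)) +
  geom_point(aes(colour = as.factor(y))) +
  # true boundary: x1^2 = x2^2, i.e. the lines x2 = x1 and x2 = -x1
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_abline(slope = -1, intercept = 0, linetype = "dashed") +
  theme_minimal() +
  theme(legend.position = "none")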
logit_model = glm(y ~ x1 + x2, family = "binomial")
logit_model
##
## Call: glm(formula = y ~ x1 + x2, family = "binomial")
##
## Coefficients:
## (Intercept) x1 x2
## -0.05418 -0.52477 0.02243
##
## Degrees of Freedom: 499 Total (i.e. Null); 497 Residual
## Null Deviance: 692.8
## Residual Deviance: 689.9 AIC: 695.9
df = data.frame(x1, x2, y = as.factor(y))
df$preds = ifelse(predict(logit_model, type = "response") > 0.5, 1, 0)
df %>%
ggplot(aes(x1, x2)) +
geom_point(aes(colour = as.factor(preds))) +
theme_minimal() +
theme(legend.position = "none")
logit_model2 = glm(y ~ x1 + x2 + I(x1^2) + I(x2^2), family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logit_model2
##
## Call: glm(formula = y ~ x1 + x2 + I(x1^2) + I(x2^2), family = "binomial")
##
## Coefficients:
## (Intercept) x1 x2 I(x1^2) I(x2^2)
## 8.072 120.777 -121.585 35721.515 -35675.872
##
## Degrees of Freedom: 499 Total (i.e. Null); 495 Residual
## Null Deviance: 692.8
## Residual Deviance: 1.303e-05 AIC: 10
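These warnings are expected: because the true boundary is exactly quadratic in x1 and x2, the model with squared terms separates the two classes perfectly, so the coefficient estimates diverge and the fitted probabilities are pushed to 0 or 1 (hence the essentially zero residual deviance). A quick check of the fitted probabilities, reusing logit_model2:
# Under perfect separation the fitted probabilities pile up at 0 and 1
summary(predict(logit_model2, type = "response"))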
df$preds2 = ifelse(predict(logit_model2, type = "response") > 0.5, 1, 0)
df %>%
ggplot(aes(x1, x2)) +
geom_point(aes(colour = as.factor(preds2))) +
theme_minimal() +
theme(legend.position = "none")
svm_model = svm(y ~ x1 + x2, data = df, kernel = "linear", scale = F)
summary(svm_model)
##
## Call:
## svm(formula = y ~ x1 + x2, data = df, kernel = "linear", scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 481
##
## ( 240 241 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
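With 481 of the 500 observations used as support vectors, the linear kernel is clearly a poor match for this boundary. Its training error can be checked directly, reusing svm_model and df from above:
# Training error of the untuned linear-kernel SVM
mean(predict(svm_model, df) != df$y)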
set.seed(23)
tune_svm = tune(svm, y ~ x1 + x2, data = df, kernel = "linear",
ranges = list(cost = c(10, 100, 500, 1000)))
summary(tune_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 500
##
## - best performance: 0.02
##
## - Detailed performance results:
## cost error dispersion
## 1 10 0.024 0.02458545
## 2 100 0.022 0.02201010
## 3 500 0.020 0.02108185
## 4 1000 0.028 0.02699794
best_model = tune_svm$best.model
df$preds3 = predict(best_model, df[ ,1:3])
df %>%
ggplot(aes(x1, x2)) +
geom_point(aes(colour = as.factor(preds3))) +
theme_minimal() +
theme(legend.position = "none")
svm_model2 = svm(y ~ x1 + x2, data = df, kernel = "radial", gamma = 1)
summary(svm_model2)
##
## Call:
## svm(formula = y ~ x1 + x2, data = df, kernel = "radial", gamma = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 161
##
## ( 79 82 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
set.seed(23)
tune_svm2 = tune(svm, y ~ x1 + x2, data = df, kernel = "radial",
ranges = list(cost = c(10, 100, 500, 1000), gamma = c(0.1, 0.5, 1, 2, 3)))
summary(tune_svm2)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1000 0.1
##
## - best performance: 0.018
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 10 0.1 0.034 0.02674987
## 2 100 0.1 0.026 0.02503331
## 3 500 0.1 0.020 0.02309401
## 4 1000 0.1 0.018 0.02201010
## 5 10 0.5 0.024 0.02458545
## 6 100 0.5 0.022 0.02201010
## 7 500 0.5 0.020 0.02108185
## 8 1000 0.5 0.028 0.02699794
## 9 10 1.0 0.028 0.03155243
## 10 100 1.0 0.028 0.02347576
## 11 500 1.0 0.028 0.02149935
## 12 1000 1.0 0.032 0.02529822
## 13 10 2.0 0.030 0.03299832
## 14 100 2.0 0.026 0.02319004
## 15 500 2.0 0.034 0.03405877
## 16 1000 2.0 0.030 0.03018462
## 17 10 3.0 0.034 0.03272783
## 18 100 3.0 0.038 0.03326660
## 19 500 3.0 0.030 0.03162278
## 20 1000 3.0 0.030 0.03299832
best_model2 = tune_svm2$best.model
df$preds4 = predict(best_model2, df[ ,1:3])
df %>%
ggplot(aes(x1, x2)) +
geom_point(aes(colour = as.factor(preds4))) +
theme_minimal() +
theme(legend.position = "none")
The main takeaway is that the SVM captured the non-linear relationship with minimal effort. This is far more efficient than manually working out the right feature transformations for a logistic regression model.
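To put numbers on that comparison, the in-sample error rates of the quadratic logistic regression and the tuned SVMs can be computed from the prediction columns built above (a quick check, reusing df and y):
# In-sample error rates: quadratic logit vs the cross-validated SVMs
c(logit_quadratic = mean(df$preds2 != y),
  svm_linear_cv   = mean(df$preds3 != df$y),
  svm_radial_cv   = mean(df$preds4 != df$y))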
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
auto = Auto
auto$mpg_class = factor(as.numeric(auto$mpg > median(auto$mpg)))
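Since mpg_class is defined by a median split, the two classes should be close to balanced; a quick check:
# Class balance after the median split on mpg
table(auto$mpg_class)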
svm_model = svm(mpg_class ~ . -mpg, data = auto, kernel = "linear", scale = F)
set.seed(23)
tune_svm = tune(svm, mpg_class ~ . -mpg, data = auto, kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10, 100, 1000)))
summary(tune_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 100
##
## - best performance: 0.08692308
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.56903846 0.05422978
## 2 1e-02 0.56903846 0.05422978
## 3 1e-01 0.15596154 0.07332520
## 4 1e+00 0.09205128 0.04410227
## 5 1e+01 0.09205128 0.04729922
## 6 1e+02 0.08692308 0.05148748
## 7 1e+03 0.10455128 0.05021545
tune_svm$performances %>%
  ggplot(aes(cost, error)) +
  geom_point()
set.seed(23)
tune_svm2 = tune(svm, mpg_class ~ . -mpg, data = auto, kernel = "radial",
ranges = list(cost = c(0.1, 1, 10, 100, 500, 1000), gamma = c(0.1, 0.5, 1, 2, 3)))
summary(tune_svm2)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.1
##
## - best performance: 0.07647436
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.1 0.08948718 0.03889147
## 2 1e+00 0.1 0.08692308 0.04569673
## 3 1e+01 0.1 0.07647436 0.03785376
## 4 1e+02 0.1 0.09705128 0.02668837
## 5 5e+02 0.1 0.09974359 0.04923644
## 6 1e+03 0.1 0.09974359 0.04923644
## 7 1e-01 0.5 0.08698718 0.04068822
## 8 1e+00 0.5 0.08435897 0.04520148
## 9 1e+01 0.5 0.08423077 0.03606392
## 10 1e+02 0.5 0.08435897 0.04032997
## 11 5e+02 0.5 0.08435897 0.04032997
## 12 1e+03 0.5 0.08435897 0.04032997
## 13 1e-01 1.0 0.56903846 0.05422978
## 14 1e+00 1.0 0.07916667 0.03506781
## 15 1e+01 1.0 0.08423077 0.03429497
## 16 1e+02 1.0 0.08423077 0.03429497
## 17 5e+02 1.0 0.08423077 0.03429497
## 18 1e+03 1.0 0.08423077 0.03429497
## 19 1e-01 2.0 0.56903846 0.05422978
## 20 1e+00 2.0 0.11487179 0.06033198
## 21 1e+01 2.0 0.11480769 0.05532699
## 22 1e+02 2.0 0.11480769 0.05532699
## 23 5e+02 2.0 0.11480769 0.05532699
## 24 1e+03 2.0 0.11480769 0.05532699
## 25 1e-01 3.0 0.56903846 0.05422978
## 26 1e+00 3.0 0.40365385 0.15419418
## 27 1e+01 3.0 0.38596154 0.15519190
## 28 1e+02 3.0 0.38596154 0.15519190
## 29 5e+02 3.0 0.38596154 0.15519190
## 30 1e+03 3.0 0.38596154 0.15519190
set.seed(23)
tune_svm3 = tune(svm, mpg_class ~ . -mpg, data = auto, kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 10, 100, 500, 1000), gamma = c(0.1, 0.5, 1, 2, 3)))
summary(tune_svm3)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.1
##
## - best performance: 0.07647436
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.1 0.08948718 0.03889147
## 2 1e+00 0.1 0.08692308 0.04569673
## 3 1e+01 0.1 0.07647436 0.03785376
## 4 1e+02 0.1 0.09705128 0.02668837
## 5 5e+02 0.1 0.09974359 0.04923644
## 6 1e+03 0.1 0.09974359 0.04923644
## 7 1e-01 0.5 0.08698718 0.04068822
## 8 1e+00 0.5 0.08435897 0.04520148
## 9 1e+01 0.5 0.08423077 0.03606392
## 10 1e+02 0.5 0.08435897 0.04032997
## 11 5e+02 0.5 0.08435897 0.04032997
## 12 1e+03 0.5 0.08435897 0.04032997
## 13 1e-01 1.0 0.56903846 0.05422978
## 14 1e+00 1.0 0.07916667 0.03506781
## 15 1e+01 1.0 0.08423077 0.03429497
## 16 1e+02 1.0 0.08423077 0.03429497
## 17 5e+02 1.0 0.08423077 0.03429497
## 18 1e+03 1.0 0.08423077 0.03429497
## 19 1e-01 2.0 0.56903846 0.05422978
## 20 1e+00 2.0 0.11487179 0.06033198
## 21 1e+01 2.0 0.11480769 0.05532699
## 22 1e+02 2.0 0.11480769 0.05532699
## 23 5e+02 2.0 0.11480769 0.05532699
## 24 1e+03 2.0 0.11480769 0.05532699
## 25 1e-01 3.0 0.56903846 0.05422978
## 26 1e+00 3.0 0.40365385 0.15419418
## 27 1e+01 3.0 0.38596154 0.15519190
## 28 1e+02 3.0 0.38596154 0.15519190
## 29 5e+02 3.0 0.38596154 0.15519190
## 30 1e+03 3.0 0.38596154 0.15519190
The radial and polynomial kernels outperformed the linear kernel.
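The best cross-validated error from each tuning run makes the comparison explicit (reusing the tune objects above):
# Best 10-fold CV error rate for each kernel
c(linear     = tune_svm$best.performance,
  radial     = tune_svm2$best.performance,
  polynomial = tune_svm3$best.performance)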
Hint: In the lab, we used the plot() function for svm objects only in cases with p = 2. When p > 2, you can use the plot() function to create plots displaying pairs of variables at a time. Essentially, instead of typing plot(svmfit, dat), where svmfit contains your fitted model and dat is a data frame containing your data, you can type plot(svmfit, dat, x1 ~ x4) in order to plot just the first and fourth variables. However, you must replace x1 and x4 with the correct variable names. To find out more, type ?plot.svm.
auto = auto %>%
dplyr::select(-mpg)
svm_fit = svm(mpg_class ~ ., data = auto, kernel = "radial", scale = T, cost = 10, gamma = 0.1)
plot(svm_fit, auto, weight ~ horsepower)
# Mode: return the most frequent value in a vector (ties go to the value seen first)
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
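Mode() is used below to hold the factor variables at a sensible fixed level in the slice argument. For example:
Mode(c("a", "b", "b", "c"))  # returns "b"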
plot(svm_fit,
auto,
weight ~ horsepower,
slice = list(cylinders = median(auto$cylinders),
displacement = median(auto$displacement),
acceleration = median(auto$acceleration),
year = median(auto$year),
origin = Mode(auto$origin),
name = Mode(auto$name)))
This problem involves the OJ data set which is part of the ISLR2 package.
oj = OJ
set.seed(23)
train_index = sample(1:nrow(oj), 800)
train = oj[train_index, ]
test = oj[-train_index, ]
svm_fit = svm(Purchase ~ ., data = train, kernel = "linear", scale = T, cost = 0.01)
summary(svm_fit)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "linear", cost = 0.01,
## scale = T)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 441
##
## ( 220 221 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
There are two classes, CH and MM. With 441 support vectors out of 800 training observations, the fit relies on a very large share of the data.
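The share of the training data used as support vectors makes this concrete, over half in this case (reusing svm_fit and train from above):
# Proportion of training observations that are support vectors
length(svm_fit$index) / nrow(train)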
data.frame(
training_error = mean(predict(svm_fit, train) != train$Purchase),
testing_error = mean(predict(svm_fit, test) != test$Purchase))
## training_error testing_error
## 1 0.165 0.162963
set.seed(23)
tune_svm = tune(svm, Purchase ~ ., data = train, kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(tune_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.16625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17125 0.05434266
## 2 0.10 0.17250 0.04779877
## 3 1.00 0.17000 0.04901814
## 4 10.00 0.16625 0.06039511
svm_fit = svm(Purchase ~ ., data = train, kernel = "linear", scale = T, cost = 10)
linear_df = data.frame(
training_error = mean(predict(svm_fit, train) != train$Purchase),
testing_error = mean(predict(svm_fit, test) != test$Purchase))
linear_df
## training_error testing_error
## 1 0.16125 0.1703704
The training error is slightly lower than with cost = 0.01, although the test error is marginally higher.
svm_fit2 = svm(Purchase ~ ., data = train, kernel = 'radial', decision.values = T, cost = 0.01)
summary(svm_fit2)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "radial", decision.values = T,
## cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 616
##
## ( 306 310 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
data.frame(
training_error = mean(predict(svm_fit2, train) != train$Purchase),
testing_error = mean(predict(svm_fit2, test) != test$Purchase))
## training_error testing_error
## 1 0.3825 0.4111111
set.seed(23)
tune_svm = tune(svm, Purchase ~ ., data = train, kernel = "radial", decision.values = T,
ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(tune_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.165
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.38250 0.05277047
## 2 0.10 0.19500 0.06072479
## 3 1.00 0.16500 0.04401704
## 4 10.00 0.18375 0.05036326
svm_fit = svm(Purchase ~ ., data = train, kernel = "radial", cost = 1, decision.values = T)
radial_df = data.frame(
training_error = mean(predict(svm_fit, train) != train$Purchase),
testing_error = mean(predict(svm_fit, test) != test$Purchase))
radial_df
## training_error testing_error
## 1 0.14375 0.162963
svm_fit = svm(Purchase ~ ., data = train, kernel = "polynomial", cost = 0.01, degree = 2)
summary(svm_fit)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "polynomial",
## cost = 0.01, degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 618
##
## ( 306 312 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
data.frame(
training_error = mean(predict(svm_fit, train) != train$Purchase),
testing_error = mean(predict(svm_fit, test) != test$Purchase))
## training_error testing_error
## 1 0.3825 0.4111111
set.seed(23)
tune_svm = tune(svm, Purchase ~ ., data = train, kernel = "polynomial", degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(tune_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.18125
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.38250 0.05277047
## 2 0.10 0.31625 0.07289881
## 3 1.00 0.19375 0.05179085
## 4 10.00 0.18125 0.05179085
svm_fit = svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = 10)
poly_df = data.frame(
training_error = mean(predict(svm_fit, train) != train$Purchase),
testing_error = mean(predict(svm_fit, test) != test$Purchase))
poly_df
## training_error testing_error
## 1 0.14375 0.1666667
res = cbind("Kernel" = c("Poly", "Radial", "Linear"), rbind(poly_df, radial_df, linear_df))
res
## Kernel training_error testing_error
## 1 Poly 0.14375 0.1666667
## 2 Radial 0.14375 0.1629630
## 3 Linear 0.16125 0.1703704
The radial kernel performed best on the test set, but all three kernels gave very similar results.
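As a final check on the preferred radial fit, it is worth looking at the test-set confusion matrix. The sketch below refits the radial kernel at the cross-validated cost of 1 (using a new name, radial_fit, since svm_fit now holds the polynomial model):
# Refit the tuned radial-kernel SVM and tabulate test-set predictions
radial_fit = svm(Purchase ~ ., data = train, kernel = "radial", cost = 1)
table(predicted = predict(radial_fit, test), actual = test$Purchase)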