library(ggplot2)
library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(e1071)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
attach(Auto)
## The following object is masked from package:lubridate:
##
## origin
##
## The following object is masked from package:ggplot2:
##
## mpg
attach(OJ)
set.seed(38)
x1 <-runif(500)- 0.5
x2 <-runif(500)- 0.5
y <- 1 * (x1^2- x2^2 > 0)
df <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
ggplot(df, aes(x = x1, y = x2, color = y)) +
geom_point() +
labs(x = "X1", y = "X2") +
theme_minimal()
df_glm <- glm(y ~ x1 + x2, data = df, family = "binomial")
summary(df_glm)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial", data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.02863 0.08964 0.319 0.749
## x1 -0.36562 0.31347 -1.166 0.243
## x2 0.03616 0.32143 0.112 0.910
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 693.02 on 499 degrees of freedom
## Residual deviance: 691.64 on 497 degrees of freedom
## AIC: 697.64
##
## Number of Fisher Scoring iterations: 3
df$pred_prob <- predict(df_glm, type = "response")
df$pred_class <- as.factor(ifelse(df$pred_prob > 0.5, 1, 0))
ggplot(df, aes(x = x1, y = x2, color = pred_class)) +
geom_point() +
labs(x = "X1", y = "X2", color = "Predicted Class") +
theme_minimal()
df_glm2 <- glm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2), data = df, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(df_glm2)
##
## Call:
## glm(formula = y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2), family = "binomial",
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.935e+14 5.494e+06 -71617558 <2e-16 ***
## x1 -1.893e+14 1.050e+07 -18023155 <2e-16 ***
## x2 -2.086e+13 1.079e+07 -1933585 <2e-16 ***
## I(x1^2) 3.555e+16 4.001e+07 888517154 <2e-16 ***
## I(x2^2) -3.165e+16 4.149e+07 -762811342 <2e-16 ***
## I(x1 * x2) 6.597e+14 3.765e+07 17523197 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 693.02 on 499 degrees of freedom
## Residual deviance: 1441.75 on 494 degrees of freedom
## AIC: 1453.7
##
## Number of Fisher Scoring iterations: 23
df$pred_prob2 <- predict(df_glm2, type = "response")
df$pred_2 <- as.factor(ifelse(df$pred_prob2 > 0.5, 1, 0))
ggplot(df, aes(x = x1, y = x2, color = pred_2)) +
geom_point() +
labs(x = "X1", y = "X2", color = "Predicted Class") +
theme_minimal()
svc_model <- svm(y ~ x1 + x2, data = df, kernel = "linear", cost = 1)
summary(svc_model)
##
## Call:
## svm(formula = y ~ x1 + x2, data = df, kernel = "linear", cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 486
##
## ( 243 243 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
df$svc_pred <- predict(svc_model, newdata = df)
ggplot(df, aes(x = x1, y = x2, color = svc_pred)) +
geom_point() +
labs(x = "X1", y = "X2", color = "Predicted Class") +
theme_minimal()
df_svm <- svm(y ~ x1 + x2, data = df, kernel = 'radial', gamma = 1)
summary(df_svm)
##
## Call:
## svm(formula = y ~ x1 + x2, data = df, kernel = "radial", gamma = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 157
##
## ( 78 79 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
df$svm_pred <- predict(df_svm, newdata = df)
ggplot(df, aes(x = x1, y = x2, color = svm_pred)) +
geom_point() +
labs(x = "X1", y = "X2", color = "Predicted Class") +
theme_minimal()
The basic logistic model and the SVC with a linear kernel didn’t do well because their boundaries were straight, but the actual pattern in the data was curved. When we added curved terms to the logistic model or used an SVM with a radial kernel, the models did much better. The radial SVM gave the best results because it could follow the curved shape of the data.
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
median_mpg <- median(Auto$mpg)
Auto$high_mpg <- as.factor(ifelse(Auto$mpg > median_mpg, 1, 0))
set.seed(38)
Auto2 <- Auto[, !(names(Auto) %in% c("mpg"))]
car_svm <- tune(svm, high_mpg ~ ., data = Auto2,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(car_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.08916667
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.08916667 0.04669520
## 2 1e-01 0.09429487 0.04485190
## 3 1e+00 0.08923077 0.03648006
## 4 5e+00 0.10448718 0.05011219
## 5 1e+01 0.09942308 0.04867149
## 6 1e+02 0.10948718 0.04100538
After performing 10-fold cross-validation, the best SVM model used a cost of 1 and achieved the lowest error rate of 0.0891.
set.seed(38)
car_radial <- tune(svm, high_mpg ~ ., data = Auto2,
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10),
gamma = c(0.1, 1, 10)))
summary(car_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.1
##
## - best performance: 0.06365385
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.1 0.08910256 0.04632762
## 2 1.0 0.1 0.08666667 0.04364606
## 3 10.0 0.1 0.06365385 0.03433469
## 4 0.1 1.0 0.50089744 0.16333516
## 5 1.0 1.0 0.07653846 0.04491370
## 6 10.0 1.0 0.08166667 0.05076232
## 7 0.1 10.0 0.54589744 0.04287924
## 8 1.0 10.0 0.51006410 0.06199973
## 9 10.0 10.0 0.49987179 0.05890991
set.seed(38)
car_poly <- tune(svm, high_mpg ~ ., data = Auto2,
kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 10),
degree = c(2, 3, 4)))
summary(car_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.530641
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5408974 0.05097648
## 2 1.0 2 0.5408974 0.05097648
## 3 10.0 2 0.5306410 0.07289163
## 4 0.1 3 0.5408974 0.05097648
## 5 1.0 3 0.5408974 0.05097648
## 6 10.0 3 0.5408974 0.05097648
## 7 0.1 4 0.5408974 0.05097648
## 8 1.0 4 0.5408974 0.05097648
## 9 10.0 4 0.5408974 0.05097648
The radial SVM performed nearly as well as the linear SVM, with a lowest error of 0.0636. In contrast, the polynomial SVM failed to improve predictions, showing high error rates across all settings. This suggests that a linear or radial kernel works best for this classification task.
Auto2$high_mpg <- as.factor(Auto2$high_mpg)
car_svm <- svm(high_mpg ~ ., data = Auto2, kernel = "radial", cost = 10, gamma = 1)
plot(car_svm, Auto2, horsepower ~ weight)
str(OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
set.seed(38)
train_index <- sample(1:nrow(OJ), 800)
oj_train <- OJ[train_index, ]
oj_test <- OJ[-train_index, ]
oj_svm <- svm(Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
summary(oj_svm)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 442
##
## ( 220 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The SVM used a low cost of 0.01, so it made a wide margin and allowed some mistakes. It used 442 support vectors, meaning it needed a lot of help to separate the two classes, which could mean it’s not fitting the data very tightly.
train_pred <- predict(oj_svm, oj_train)
test_pred <- predict(oj_svm, oj_test)
train_error <- mean(train_pred != oj_train$Purchase)
test_error <- mean(test_pred != oj_test$Purchase)
train_error
## [1] 0.16625
test_error
## [1] 0.1703704
set.seed(38)
oj_tuned <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(oj_tuned)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.1675
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.16750 0.04338138
## 2 0.10 0.17250 0.03670453
## 3 1.00 0.17125 0.03775377
## 4 5.00 0.17125 0.03910900
## 5 10.00 0.17250 0.03717451
best_oj_svm <- oj_tuned$best.model
train_pred_best <- predict(best_oj_svm, oj_train)
test_pred_best <- predict(best_oj_svm, oj_test)
train_error_best <- mean(train_pred_best != oj_train$Purchase)
test_error_best <- mean(test_pred_best != oj_test$Purchase)
train_error_best
## [1] 0.16625
test_error_best
## [1] 0.1703704
oj_rad <- svm(Purchase ~ ., data = oj_train, kernel = "radial", cost = 0.01)
summary(oj_rad)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 611
##
## ( 308 303 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train_pred2 <- predict(oj_rad, oj_train)
test_pred2 <- predict(oj_rad, oj_test)
train_error2 <- mean(train_pred2 != oj_train$Purchase)
test_error2 <- mean(test_pred2 != oj_test$Purchase)
train_error2
## [1] 0.37875
test_error2
## [1] 0.4222222
set.seed(38)
oj_tuned_rad <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "radial",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(oj_tuned_rad)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.18
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.37875 0.05497790
## 2 0.10 0.18250 0.03641962
## 3 1.00 0.18000 0.02958040
## 4 5.00 0.18375 0.02766993
## 5 10.00 0.18750 0.02763854
best_oj_rad <- oj_tuned_rad$best.model
train_pred_best2 <- predict(best_oj_rad, oj_train)
test_pred_best2 <- predict(best_oj_rad, oj_test)
train_error_best2 <- mean(train_pred_best2 != oj_train$Purchase)
test_error_best2 <- mean(test_pred_best2 != oj_test$Purchase)
train_error_best2
## [1] 0.14625
test_error_best2
## [1] 0.1703704
oj_poly <- svm(Purchase ~ ., data = oj_train,
kernel = "polynomial", degree = 2, cost = 0.01)
summary(oj_poly)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "polynomial",
## degree = 2, cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 611
##
## ( 308 303 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
mean(predict(oj_poly, oj_train) != oj_train$Purchase)
## [1] 0.37875
mean(predict(oj_poly, oj_test) != oj_test$Purchase)
## [1] 0.4222222
oj_tuned_poly <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "polynomial", degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(oj_tuned_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.1875
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.37875 0.05036326
## 2 0.10 0.31625 0.07121222
## 3 1.00 0.20625 0.03294039
## 4 5.00 0.18875 0.04348132
## 5 10.00 0.18750 0.04409586
best_oj_poly <- oj_tuned_poly$best.model
mean(predict(best_oj_poly, oj_train) != oj_train$Purchase)
## [1] 0.1575
mean(predict(best_oj_poly, oj_test) != oj_test$Purchase)
## [1] 0.1851852
The radial SVM worked the best with the lowest test error. The polynomial one didn’t do as well. Overall, the radial SVM gave the most accurate results on this data.