library(e1071)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Simulate 100 two-dimensional standard-normal points. A point is labelled "A"
# when X1^2 + X2^2 exceeds 1.5 and "B" otherwise, so the true class boundary
# is a circle rather than a line.
set.seed(2025)
sim_data <- tibble(X1 = rnorm(100), X2 = rnorm(100)) %>%
  mutate(Class = factor(ifelse(X1^2 + X2^2 > 1.5, "A", "B")))

# Scatter plot of the simulated points, coloured by class.
ggplot(data = sim_data, mapping = aes(x = X1, y = X2, color = Class)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "Simulated Nonlinear Boundary")
Interpretation: The plot clearly shows a nonlinear boundary: points classified as “A” lie outside a roughly circular boundary, while “B” points lie inside it. A linear classifier would struggle to separate these classes cleanly.
# Fitting SVM Models
# Three kernels are fit to the same simulated data, all with cost = 1.
set.seed(2025)
svm_linear <- svm(Class ~ ., data = sim_data, kernel = "linear", cost = 1)
svm_poly <- svm(
  Class ~ .,
  data = sim_data, kernel = "polynomial", degree = 3, cost = 1
)
svm_radial <- svm(
  Class ~ .,
  data = sim_data, kernel = "radial", gamma = 0.5, cost = 1
)

# Training Errors
# One column of in-sample predictions per model.
train_preds <- tibble(
  Linear = predict(svm_linear, sim_data),
  Poly = predict(svm_poly, sim_data),
  Radial = predict(svm_radial, sim_data)
)

# Misclassification rate of each prediction column against the true labels.
# The column order of train_preds matches the order of the Model labels.
train_error_rates <- tibble(
  Model = c("Linear", "Polynomial", "Radial"),
  Error = unname(vapply(
    train_preds,
    function(pred) mean(pred != sim_data$Class),
    numeric(1)
  ))
)
train_error_rates
## # A tibble: 3 × 2
## Model Error
## <chr> <dbl>
## 1 Linear 0.44
## 2 Polynomial 0.44
## 3 Radial 0.05
Interpretation:
- Linear SVM and Polynomial SVM both had 44% training error.
- Radial SVM had a very low training error (~5%), meaning it captures the nonlinear boundary much better.
# Test Set Performance
# Draw a fresh sample of 100 points with a different seed, labelled by the
# same rule used for the training data.
set.seed(3579)
test_data <- tibble(X1 = rnorm(100), X2 = rnorm(100)) %>%
  mutate(Class = factor(ifelse(X1^2 + X2^2 > 1.5, "A", "B")))

# Out-of-sample predictions from the three previously fitted models.
test_preds <- tibble(
  Linear = predict(svm_linear, test_data),
  Poly = predict(svm_poly, test_data),
  Radial = predict(svm_radial, test_data)
)

# Misclassification rate of each prediction column against the true labels.
# The column order of test_preds matches the order of the Model labels.
test_error_rates <- tibble(
  Model = c("Linear", "Polynomial", "Radial"),
  Error = unname(vapply(
    test_preds,
    function(pred) mean(pred != test_data$Class),
    numeric(1)
  ))
)
test_error_rates
## # A tibble: 3 × 2
## Model Error
## <chr> <dbl>
## 1 Linear 0.5
## 2 Polynomial 0.5
## 3 Radial 0.09
Interpretation:
- Linear and Polynomial SVMs both had around 50% test error, performing poorly.
- Radial SVM had only 9% test error, outperforming both other models on unseen data. Thus, Radial SVM is the best choice for this nonlinear separation.
# Load the Auto data; "?" marks a missing value and incomplete rows are dropped.
# NOTE: the path is absolute and machine-specific.
Auto <- read.table(
  "/Users/saransh/Downloads/Statistical_Learning_Resources/Auto.data",
  header = TRUE,
  na.strings = "?"
)
Auto <- na.omit(Auto)

# Keep predictors only: mpg is replaced by the binary response below and
# name is dropped.
Auto_clean <- Auto %>%
  select(-mpg, -name)

# Create high_mpg variable
# 1 when a car's mpg is above the median mpg, 0 otherwise, stored as a factor.
Auto_clean$high_mpg <- as.factor(ifelse(Auto$mpg > median(Auto$mpg), 1, 0))

# Cross-validated search over cost for a linear-kernel classifier.
set.seed(2025)
tune_svc <- tune(
  svm, high_mpg ~ .,
  data = Auto_clean,
  kernel = "linear",
  ranges = list(cost = c(0.01, 0.1, 1, 10))
)
summary(tune_svc)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.08166667
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.09192308 0.04204051
## 2 0.10 0.08935897 0.04543923
## 3 1.00 0.08166667 0.03942201
## 4 10.00 0.08423077 0.03991005
Comment:
- Best cost for linear SVM is around 1, achieving ~8% cross-validation error.
- Increasing cost beyond 1 does not help: the cross-validation error rises slightly, from ~8.2% at cost = 1 to ~8.4% at cost = 10.
# Cross-validated search over a cost x gamma grid for a radial-kernel SVM.
set.seed(2025)
tune_radial <- tune(
  svm, high_mpg ~ .,
  data = Auto_clean,
  kernel = "radial",
  ranges = list(cost = c(0.1, 1, 10), gamma = c(0.01, 0.1, 1))
)

# Cross-validated search over cost AND degree for a polynomial-kernel SVM.
# tune() only searches parameters listed in `ranges`; the previous call passed
# `degree = 2:3` straight through to svm(), which hands a length-2 vector to a
# scalar parameter and leaves degree untuned (the recorded summary shows a
# `cost` column only).
# NOTE: the recorded summary(tune_poly) output predates this change and must
# be regenerated.
set.seed(2025)
tune_poly <- tune(
  svm, high_mpg ~ .,
  data = Auto_clean,
  kernel = "polynomial",
  ranges = list(cost = c(0.1, 1, 5), degree = 2:3)
)
summary(tune_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 1
##
## - best performance: 0.06634615
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.01 0.11230769 0.03469417
## 2 1.0 0.01 0.08935897 0.04380206
## 3 10.0 0.01 0.09192308 0.04374365
## 4 0.1 0.10 0.09192308 0.04204051
## 5 1.0 0.10 0.08929487 0.04035244
## 6 10.0 0.10 0.08416667 0.03399525
## 7 0.1 1.00 0.08173077 0.04922072
## 8 1.0 1.00 0.06634615 0.02448843
## 9 10.0 1.00 0.08166667 0.02644087
# Cross-validation results of the polynomial-kernel tuning run.
summary(tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.1885256
##
## - Detailed performance results:
## cost error dispersion
## 1 0.1 0.2755128 0.05637204
## 2 1.0 0.2578205 0.06836058
## 3 5.0 0.1885256 0.06072020
# Decision-region plots of the best radial, polynomial and linear models, each
# drawn over one pair of predictors.
# NOTE(review): the models are fit on more than two predictors and no `slice`
# argument is given. Per the e1071 plot.svm documentation, unspecified
# dimensions are held at 0, so these regions may not reflect realistic
# values of the other predictors -- confirm this is intended.
plot(tune_radial$best.model, Auto_clean, horsepower ~ weight)
plot(tune_poly$best.model, Auto_clean, horsepower ~ displacement)
plot(tune_svc$best.model, Auto_clean, acceleration ~ weight)
Interpretation:
- Radial boundary was more flexible and captured the structure better.
- Linear SVM had clean separation but slightly less flexible.
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Auto
# Load the OJ data and treat the response as a factor.
# NOTE: the path is absolute and machine-specific.
OJ <- read.csv("/Users/saransh/Downloads/Statistical_Learning_Resources/OJ.csv")
OJ$Purchase <- as.factor(OJ$Purchase)

# Random split: 800 rows for training, the remaining rows for testing.
# seq_len() replaces 1:nrow(), which would yield c(1, 0) for an empty data
# frame; the sampled indices are the same for any non-empty data.
set.seed(2025)
train_index <- sample(seq_len(nrow(OJ)), 800)
train_OJ <- OJ[train_index, ]
test_OJ <- OJ[-train_index, ]

# Baseline support vector classifier: linear kernel, cost = 0.01, all
# remaining columns as predictors.
svc_oj <- svm(Purchase ~ ., data = train_OJ, kernel = "linear", cost = 0.01)
summary(svc_oj)
##
## Call:
## svm(formula = Purchase ~ ., data = train_OJ, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 433
##
## ( 217 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
Interpretation:
- 433 support vectors used, indicating soft margin due to low cost (0.01).
- The support vectors are split almost evenly between the two classes (217 vs. 216).
# Training misclassification rate of the cost = 0.01 linear classifier.
mean(train_OJ$Purchase != predict(svc_oj, train_OJ))
## [1] 0.165
# Test misclassification rate of the same classifier.
mean(test_OJ$Purchase != predict(svc_oj, test_OJ))
## [1] 0.1851852
Interpretation:
- Training error: ~16.5%, Test error: ~18.5%.
# Cross-validated search over cost for a linear-kernel classifier on the
# training split, with StoreID excluded from the predictors.
set.seed(2025)
tune_svc_oj <- tune(
  svm, Purchase ~ . - StoreID,
  data = train_OJ,
  kernel = "linear",
  ranges = list(cost = c(0.01, 0.1, 1, 5, 10))
)
summary(tune_svc_oj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.17
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17500 0.04526159
## 2 0.10 0.17000 0.04571956
## 3 1.00 0.17375 0.03928617
## 4 5.00 0.17625 0.04059026
## 5 10.00 0.17625 0.04910660
# Model stored by tune() for the best cost value found above.
best_svc_oj <- tune_svc_oj[["best.model"]]

# Training Error
# Share of training rows whose predicted label differs from the true one.
train_pred_best <- predict(best_svc_oj, train_OJ)
train_error_best <- mean(train_OJ$Purchase != train_pred_best)
train_error_best
## [1] 0.1625
# Test Error
# Same misclassification rate, computed on the held-out rows.
test_pred_best <- predict(best_svc_oj, test_OJ)
test_error_best <- mean(test_OJ$Purchase != test_pred_best)
test_error_best
## [1] 0.1740741
Interpretation:
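- Best cost (0.1) gives ~16.25% training error and ~17.4% test error, slightly better than the initial model (16.5% / 18.5%). Note that the tuned model also excludes StoreID, so the improvement is not attributable to the cost value alone.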
# Cross-validated search over a cost x gamma grid for a radial-kernel SVM on
# the training split, with StoreID excluded from the predictors.
set.seed(2025)
tune_rad_oj <- tune(
  svm, Purchase ~ . - StoreID,
  data = train_OJ,
  kernel = "radial",
  ranges = list(cost = c(0.1, 1, 10), gamma = c(0.01, 0.1, 1))
)
summary(tune_rad_oj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 0.01
##
## - best performance: 0.17375
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.01 0.22500 0.05892557
## 2 1.0 0.01 0.17375 0.05118390
## 3 10.0 0.01 0.17375 0.04910660
## 4 0.1 0.10 0.19750 0.04031129
## 5 1.0 0.10 0.18125 0.04419417
## 6 10.0 0.10 0.20000 0.05368374
## 7 0.1 1.00 0.33000 0.07293452
## 8 1.0 1.00 0.21750 0.05898446
## 9 10.0 1.00 0.22625 0.07417369
# Cross-validated search over cost for a degree-2 polynomial-kernel SVM on the
# training split, with StoreID excluded from the predictors.
set.seed(2025)
tune_poly_oj <- tune(
  svm, Purchase ~ . - StoreID,
  data = train_OJ,
  kernel = "polynomial",
  degree = 2,
  ranges = list(cost = c(0.1, 1, 5))
)
summary(tune_poly_oj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.17875
##
## - Detailed performance results:
## cost error dispersion
## 1 0.1 0.33250 0.03496029
## 2 1.0 0.19000 0.04031129
## 3 5.0 0.17875 0.04411554
# Best cross-validation error found by each tuning run, for side-by-side
# comparison of the three kernels (lower is better).
# Linear kernel:
tune_svc_oj$best.performance
## [1] 0.17
# Radial kernel:
tune_rad_oj$best.performance
## [1] 0.17375
# Polynomial kernel (degree 2):
tune_poly_oj$best.performance
## [1] 0.17875
Interpretation:
- Linear SVM achieved the lowest cross-validation error (~17.0%).
- Radial (~17.4%) and Polynomial (~17.9%) were slightly worse.
- These differences are small relative to the fold-to-fold dispersion (~0.04–0.05), so the more flexible kernels show no clear advantage over the linear kernel on the OJ data.