library(ggplot2)
library(e1071)
library(caret)
## Loading required package: lattice
library(ISLR2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

4

set.seed(96)

# Step 1: Simulate data (non-linear boundary)
# The simulation settings are named once so that the sample size, the size of
# the training split and the class boundary can each be changed in one place.
n <- 100
n_train <- 70
radius <- 1.5

# Two predictors drawn uniformly on [-2, 2].
x1 <- runif(n, -2, 2)
x2 <- runif(n, -2, 2)

# Create a non-linear decision boundary (circle): points strictly outside the
# circle of the given radius are "Class1", all others are "Class2".
y <- ifelse(x1^2 + x2^2 > radius^2, "Class1", "Class2")
data <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))

# Split into train/test: n_train random rows for training, the rest for testing.
# seq_len(n) is used instead of 1:n so the index vector stays valid for any n.
train_idx <- sample(seq_len(n), size = n_train)
train <- data[train_idx, ]
test <- data[-train_idx, ]
# Three fits on the same training split and the same cost (1), so the kernel
# is the only thing that differs between them.

# Support vector classifier (linear kernel)
svm_linear <- svm(
  y ~ .,
  data = train,
  cost = 1,
  kernel = "linear"
)

# Polynomial kernel of degree 3
svm_poly <- svm(
  y ~ .,
  data = train,
  cost = 1,
  kernel = "polynomial",
  degree = 3
)

# Radial basis function kernel with gamma fixed at 1
svm_rbf <- svm(
  y ~ .,
  data = train,
  cost = 1,
  kernel = "radial",
  gamma = 1
)
# Predictions: one column per model, computed once for each split and reused
# below so that predict() is not run a second time for every error rate.
train_preds <- data.frame(
  linear = predict(svm_linear, train),
  poly = predict(svm_poly, train),
  rbf = predict(svm_rbf, train)
)

test_preds <- data.frame(
  linear = predict(svm_linear, test),
  poly = predict(svm_poly, test),
  rbf = predict(svm_rbf, test)
)

# Compute training error for each model
# (error = share of predicted labels that differ from the observed label)
train_error_linear <- mean(train_preds[["linear"]] != train$y)
train_error_poly   <- mean(train_preds[["poly"]] != train$y)
train_error_rbf    <- mean(train_preds[["rbf"]] != train$y)

# Compute test error for each model
test_error_linear <- mean(test_preds[["linear"]] != test$y)
test_error_poly   <- mean(test_preds[["poly"]] != test$y)
test_error_rbf    <- mean(test_preds[["rbf"]] != test$y)


# Side-by-side table of both error rates, printed to 3 significant digits
# without row names.
error_df <- data.frame(
  Model = c("Linear", "Polynomial", "RBF"),
  Training_Error = c(train_error_linear, train_error_poly, train_error_rbf),
  Test_Error = c(test_error_linear, test_error_poly, test_error_rbf)
)
print(format(error_df, digits = 3), row.names = FALSE)
##       Model Training_Error Test_Error
##      Linear         0.4714      0.233
##  Polynomial         0.3143      0.500
##         RBF         0.0286      0.000

Note: About Radial Kernel Test-Error

  1. It's slightly confusing to see a test error of 0.000; it implies that the radial-kernel SVM made no mistakes on the test data.

  2. This could be because the test set is small (30 points) and may be more clearly separated than we would like for a demanding test of the model.

  3. k-fold CV would have been a better method to accurately estimate its performance on unseen data.

Interpretation of the results:

  1. The radial kernel outperforms both the linear kernel and the degree-3 polynomial kernel in both training error and test error.

  2. Ironically, the linear SVC has a better test error than the polynomial kernel. Then again, k-fold CV would help us understand each model more accurately.

On a grid of new points, let's plot the decision regions of all three models (with the training points overlaid):

  1. Linear classifier
# Evaluation grid: 200 values per axis over [-2, 2], crossed into a full
# lattice, with the linear model's predicted class attached to every point.
x1_seq <- x2_seq <- seq(-2, 2, length.out = 200)
grid <- expand.grid(x1 = x1_seq, x2 = x2_seq)
grid[["pred"]] <- predict(svm_linear, newdata = grid)

# Decision regions as shaded tiles, with the training points drawn on top.
ggplot(grid, aes(x = x1, y = x2)) +
  geom_tile(aes(fill = pred), alpha = 0.4) +
  geom_point(data = train, aes(color = y), size = 2) +
  scale_fill_manual(values = c("Class1" = "#99ccff", "Class2" = "#ffcccc")) +
  scale_color_manual(values = c("Class1" = "blue", "Class2" = "red")) +
  labs(title = "SVM Linear - Decision Boundary", x = "x1", y = "x2") +
  theme_minimal()

Observations:

  1. The model predicts every point as Class1

  2. Worst performer

  2. Radial kernel:
# Relabel the grid with the RBF model's predicted class.
grid[["pred"]] <- predict(svm_rbf, newdata = grid)

# Decision regions as shaded tiles, with the training points drawn on top.
ggplot(grid, aes(x = x1, y = x2)) +
  geom_tile(aes(fill = pred), alpha = 0.4) +
  geom_point(data = train, aes(color = y), size = 2) +
  scale_fill_manual(values = c("Class1" = "#99ccff", "Class2" = "#ffcccc")) +
  scale_color_manual(values = c("Class1" = "blue", "Class2" = "red")) +
  labs(title = "SVM with RBF Kernel - Decision Boundary", x = "x1", y = "x2") +
  theme_minimal()

Observations:

  1. Except for 2-3 points (out of the 70 training points plotted), the svm_rbf model is classifying pretty well.

  2. So far this has the best classification error rate

  3. Polynomial Kernel Plot
# Relabel the grid with the polynomial model's predicted class.
grid[["pred"]] <- predict(svm_poly, newdata = grid)

# Decision regions as shaded tiles, with the training points drawn on top.
ggplot(grid, aes(x = x1, y = x2)) +
  geom_tile(aes(fill = pred), alpha = 0.4) +
  geom_point(data = train, aes(color = y), size = 2) +
  scale_fill_manual(values = c("Class1" = "#99ccff", "Class2" = "#ffcccc")) +
  scale_color_manual(values = c("Class1" = "blue", "Class2" = "red")) +
  labs(title = "SVM with Polynomial(degree =3) Kernel - Decision Boundary",
       x = "x1", y = "x2") +
  theme_minimal()

Observations:

  1. Misclassifies a lot of Class1 as Class2

  2. This is better than linear but not as good as the radial kernel

Conclusion:

  1. As expected Radial kernel outperforms all the other kernels SVM. Since the data is initially classified based on circle decision boundary –> \({X_1}^2 + {X_2}^2 = 2.25\)

  2. Even from errors reported above, Radial is the clear winner!

7

(a)

set.seed(96)
data(Auto)

# (a) Binary target: 1 when a car's mpg is above the median mpg, else 0,
# stored directly as a factor so svm() treats the task as classification.
# mpg is then dropped because the target is derived from it.
# NOTE(review): only mpg is removed here. Any identifier-like column left in
# Auto (e.g. a car `name` column, if present) will be used as a predictor by
# `highMPG ~ .` in the models below - confirm that this is intended.
Auto <- Auto %>%
  mutate(highMPG = as.factor(ifelse(mpg > median(mpg), 1, 0))) %>%
  select(-mpg)

(b)

# Cross-validate a linear-kernel SVM over five cost values spanning
# 0.01 to 100, using every remaining column of Auto as a predictor.
tune_out_linear <- tune(
  svm,
  highMPG ~ .,
  data = Auto,
  ranges = list(cost = c(0.01, 0.1, 1, 10, 100)),
  kernel = "linear",
  scale = TRUE
)

# Best cost plus the CV error and dispersion for every cost tried
summary(tune_out_linear)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.01
## 
## - best performance: 0.09198718 
## 
## - Detailed performance results:
##    cost      error dispersion
## 1 1e-02 0.09198718 0.03884572
## 2 1e-01 0.10224359 0.03456688
## 3 1e+00 0.09955128 0.03728256
## 4 1e+01 0.10980769 0.04862607
## 5 1e+02 0.10974359 0.04699767
  • The best cost is 0.01, the smallest value in the grid (note that this is not the default: e1071's svm() uses cost = 1 unless told otherwise)

  • As cost increases, the error slightly increases — meaning higher penalty for misclassifications doesn’t improve generalization

  • It can be explained since a lower cost in linear SVM allows for a wider margin, tolerating some misclassified points for better generalization.

(c)

Radial

# Cross-validate a radial-kernel SVM over a 3 x 3 grid of cost and gamma.
tune_out_radial <- tune(
  svm,
  highMPG ~ .,
  data = Auto,
  ranges = list(
    cost = c(0.1, 1, 10),
    gamma = c(0.5, 1, 2)
  ),
  kernel = "radial"
)

# Best (cost, gamma) pair and the full CV error table
summary(tune_out_radial)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.5
## 
## - best performance: 0.07916667 
## 
## - Detailed performance results:
##   cost gamma      error dispersion
## 1  0.1   0.5 0.08948718 0.03046532
## 2  1.0   0.5 0.07916667 0.03085406
## 3 10.0   0.5 0.07916667 0.03527292
## 4  0.1   1.0 0.57384615 0.03126062
## 5  1.0   1.0 0.08173077 0.03986661
## 6 10.0   1.0 0.07916667 0.03085406
## 7  0.1   2.0 0.57384615 0.03126062
## 8  1.0   2.0 0.12256410 0.04509489
## 9 10.0   2.0 0.12000000 0.04382978

Observations & Interpretations

  • Cost = 1.0 and Gamma = 0.5 are reported as the best setting, with a CV error of 0.0792; the same error is obtained for Cost = 10 with Gamma = 0.5 and for Cost = 10 with Gamma = 1, so the three settings are tied

  • Lower values of gamma (0.5) tend to give more stable and better performance with low error rates.

  • The dispersion (standard deviation) of the cross-validation errors is fairly low for most settings (typically between 0.03 and 0.05), which means the model’s performance is quite stable across different folds.

  • The best radial kernel results (CV error ≈ 7.9%) are better than the best linear kernel results from part (b), which had a CV error of ~9.2%

Polynomial Kernel

# Cross-validate a polynomial-kernel SVM over a 3 x 3 grid of cost and degree.
tune_out_poly <- tune(
  svm,
  highMPG ~ .,
  data = Auto,
  ranges = list(
    cost = c(0.1, 1, 10),
    degree = c(2, 3, 4)
  ),
  kernel = "polynomial"
)

# Best (cost, degree) pair and the full CV error table
summary(tune_out_poly)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree
##    10      2
## 
## - best performance: 0.5383974 
## 
## - Detailed performance results:
##   cost degree     error dispersion
## 1  0.1      2 0.5611538 0.03806443
## 2  1.0      2 0.5611538 0.03806443
## 3 10.0      2 0.5383974 0.05626799
## 4  0.1      3 0.5611538 0.03806443
## 5  1.0      3 0.5611538 0.03806443
## 6 10.0      3 0.5611538 0.03806443
## 7  0.1      4 0.5611538 0.03806443
## 8  1.0      4 0.5611538 0.03806443
## 9 10.0      4 0.5611538 0.03806443

Observations & Interpretations

  • No significant improvement in performance between degree 2, 3, and 4. The CV error remains constant around 0.5612.

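  • Raising the degree does not help, so degree 2 is as good as any here. Note, however, that an error of about 0.56 on a median-split response is roughly what guessing would give, so this points to the polynomial kernel barely learning at these settings rather than to the data being simple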

  • Across all values of cost (0.1, 1.0, 10.0), the CV error remains nearly identical, with only slight variations.

  • Cost = 10 seems to give a slightly lower error (0.5384), but the difference is minimal.

  • The dispersion is very stable across all parameter combinations, especially at degree 2, 3, and 4 with values around 0.0381.

Conclusion : The radial kernel outperforms the polynomial kernel and linear kernel by a noticeable margin on this data, showing that non-linear separability is best captured by the radial kernel for this problem.

# Extract the best model stored by each tuning run
best_linear <- tune_out_linear$best.model
best_radial <- tune_out_radial$best.model
best_poly   <- tune_out_poly$best.model

# Training error: share of rows whose predicted class differs from the
# observed class.
#   model - a fitted model that predict() can be applied to
#   data  - data frame to predict on (defaults to Auto)
#   truth - observed classes for `data` (defaults to data$highMPG)
# The defaults reproduce the original one-argument behaviour, so existing
# calls such as train_err(best_linear) keep working; passing `data`/`truth`
# lets the same helper score any other data set without relying on a global.
train_err <- function(model, data = Auto, truth = data$highMPG) {
  mean(predict(model, data) != truth)
}

# Create a data frame with the training errors. vapply() guarantees one
# numeric value per model.
training_errors_df <- data.frame(
  Model = c("Linear", "Radial", "Polynomial"),
  Training_Error = vapply(
    list(best_linear, best_radial, best_poly),
    train_err,
    numeric(1)
  )
)

# Print the data frame with formatted output
print(format(training_errors_df, digits = 3), row.names = FALSE)
##       Model Training_Error
##      Linear         0.0867
##      Radial         0.0179
##  Polynomial         0.4311
# Each call draws one fitted model against the two predictors named in the
# formula.

# Best linear-kernel model: horsepower against weight
plot(best_linear, data = Auto, formula = horsepower ~ weight)

# Best radial-kernel model: displacement against acceleration
plot(best_radial, data = Auto, formula = displacement ~ acceleration)

# Best polynomial-kernel model: cylinders against horsepower
plot(best_poly, data = Auto, formula = cylinders ~ horsepower)

Conclusion from Training Errors: The radial kernel SVM is clearly the best performer in terms of training error, which aligns with the conclusion drawn from the best cross-validation error rates above as well.

From Linear SVM Plot (weight and horsepower): Even though the exact decision boundary is not clearly defined, a lot of support vectors lie in the region where weight is roughly 2500 to 3700 and horsepower is roughly 60 to 130.

From Radial SVM Plot (acceleration and displacement): There is no pattern visible from the plot. But we can see a lot of X marks, which implies that there are more support vectors than in the linear model

From Best Polynomial SVM Plot (horsepower and cylinders): We can see a lot of X marks as well, but no clear boundary, as was the case in all the plots above. There is a lot of overlap among the support vectors: the light-red marks are concentrated at horsepower from 50 to 100, while the black X marks at 8 cylinders cover a much bigger range. It's hard to tell from the plots alone whether the polynomial SVM is better than the radial one, thus we have to rely on test errors

8

(a)

set.seed(96)

data(OJ)
# Draw 800 random rows of OJ for training; every remaining row is held out
# for testing. seq_len(nrow(OJ)) replaces 1:nrow(OJ) so the index vector is
# well-defined whatever the number of rows.
train_index <- sample(seq_len(nrow(OJ)), size = 800)
train_data <- OJ[train_index, ]
test_data  <- OJ[-train_index, ]

(b)

# Support vector classifier (linear kernel) for Purchase, using every other
# column of train_data as a predictor, with cost = 0.01 and scale = TRUE.
svm_linear <- svm(Purchase ~ ., data = train_data, kernel = "linear", cost = 0.01, scale = TRUE)
# Print the fit's summary: the call, kernel, cost and support-vector counts.
summary(svm_linear)
## 
## Call:
## svm(formula = Purchase ~ ., data = train_data, kernel = "linear", 
##     cost = 0.01, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  433
## 
##  ( 217 216 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

Conclusion from summary

  • A total of 433 support vectors (out of 800 training observations) support the linear decision boundary: 217 for CH and 216 for MM

  • Pretty even split among the two classes

(c)

# Predicted Purchase class from the linear SVM on each split
train_pred_linear <- predict(svm_linear, newdata = train_data)
test_pred_linear <- predict(svm_linear, newdata = test_data)

# Error rate = fraction of predictions that disagree with the observed Purchase
train_error_linear <- mean(train_data$Purchase != train_pred_linear)
test_error_linear <- mean(test_data$Purchase != test_pred_linear)

cat("Training Error (Linear):", train_error_linear, "\n")
## Training Error (Linear): 0.1625
cat("Test Error (Linear):", test_error_linear, "\n")
## Test Error (Linear): 0.1851852

Interpretations

  • Nearly 16% of the training dataset is misclassified. While 18% of the test data is misclassified

  • The training error and test error are pretty close indicating low variance and model is not overfitting.

  • A more flexible model can be used.

(d)

# Cross-validate the linear-kernel SVM over five cost values, using the
# training split only.
tune_linear <- tune(
  svm,
  Purchase ~ .,
  data = train_data,
  ranges = list(cost = c(0.01, 0.1, 1, 5, 10)),
  kernel = "linear"
)

summary(tune_linear)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     5
## 
## - best performance: 0.16375 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.17250 0.03525699
## 2  0.10 0.16500 0.03899786
## 3  1.00 0.16875 0.04093101
## 4  5.00 0.16375 0.03606033
## 5 10.00 0.16375 0.03508422

Observations

  • Higher cost 5,10 have lowest error and smaller dispersion but cost = 10.00 might be considered as the optimal value because it has the lowest dispersion (0.035084) compared to cost = 5.00 (0.036060).

(e)

# Refit the linear SVM on the training split with cost fixed at 10.
best_linear_model <- svm(
  Purchase ~ .,
  data = train_data,
  cost = 10,
  kernel = "linear",
  scale = TRUE
)

# Predicted Purchase class on each split
train_pred_best_linear <- predict(best_linear_model, newdata = train_data)
test_pred_best_linear <- predict(best_linear_model, newdata = test_data)

# Error rate = fraction of predictions that disagree with the observed Purchase
train_error_best_linear <- mean(train_data$Purchase != train_pred_best_linear)
test_error_best_linear <- mean(test_data$Purchase != test_pred_best_linear)

cat("Best Training Error (Linear):", train_error_best_linear, "\n")
## Best Training Error (Linear): 0.15625
cat("Best Test Error (Linear):", test_error_best_linear, "\n")
## Best Test Error (Linear): 0.1962963

Conclusions:

  • Linear SVM at cost = 10 slightly improved training accuracy.

  • However, the generalization to the test set is not better, which shows that linear decision boundaries might not be sufficient for this problem.

(f)

# (f), repeating step (b) with a radial kernel: cost = 0.01 and no gamma
# supplied, so svm() falls back to its own default gamma.
svm_radial <- svm(Purchase ~ ., data = train_data, kernel = "radial", cost = 0.01)
# Print the fit's summary: the call, kernel, cost and support-vector counts.
summary(svm_radial)
## 
## Call:
## svm(formula = Purchase ~ ., data = train_data, kernel = "radial", 
##     cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.01 
## 
## Number of Support Vectors:  627
## 
##  ( 315 312 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
# (c) Error rates of the cost = 0.01 radial fit on each split
# NOTE(review): these two rates are stored but are not printed anywhere in the
# visible part of this script.
train_error_radial <- mean(train_data$Purchase != predict(svm_radial, newdata = train_data))
test_error_radial <- mean(test_data$Purchase != predict(svm_radial, newdata = test_data))

# (d) Cross-validate the radial-kernel SVM over five cost values; the seed is
# reset first so the fold assignment is reproducible.
set.seed(123)
tune_radial <- tune(
  svm,
  Purchase ~ .,
  data = train_data,
  ranges = list(cost = c(0.01, 0.1, 1, 5, 10)),
  kernel = "radial"
)

summary(tune_radial)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.17 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.39000 0.06174545
## 2  0.10 0.18625 0.04730589
## 3  1.00 0.17000 0.05407043
## 4  5.00 0.17500 0.04208127
## 5 10.00 0.17875 0.04825065
  • 627 support vectors, evenly split between the classes (315 and 312).

  • The best cost is 1.00 since it has lowest error and decent dispersion as well.

# (e) Refit the radial SVM with cost fixed at 1, then score both splits.
best_radial_model <- svm(
  Purchase ~ .,
  data = train_data,
  cost = 1,
  kernel = "radial"
)
train_error_best_radial <- mean(train_data$Purchase != predict(best_radial_model, newdata = train_data))
test_error_best_radial <- mean(test_data$Purchase != predict(best_radial_model, newdata = test_data))

cat("Best Training Error (Radial):", train_error_best_radial, "\n")
## Best Training Error (Radial): 0.14625
cat("Best Test Error (Radial):", test_error_best_radial, "\n")
## Best Test Error (Radial): 0.1925926
  • Radial SVM performs slightly better than the linear SVM on both training and test sets.

  • The improvement isn’t huge, but the radial kernel captures non-linear boundaries a bit better, which is expected if the data is not linearly separable.

# (g), repeating step (b) with a polynomial kernel: degree = 2, cost = 0.01.
svm_poly <- svm(Purchase ~ ., data = train_data,
                kernel = "polynomial", degree = 2, cost = 0.01)
# Print the fit's summary: the call, kernel, cost, degree, coef.0 and
# support-vector counts.
summary(svm_poly)
## 
## Call:
## svm(formula = Purchase ~ ., data = train_data, kernel = "polynomial", 
##     degree = 2, cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.01 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  627
## 
##  ( 315 312 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
# (c) Error rates of the cost = 0.01, degree-2 polynomial fit on each split
# NOTE(review): these two rates are stored but are not printed anywhere in the
# visible part of this script.
train_error_poly <- mean(train_data$Purchase != predict(svm_poly, newdata = train_data))
test_error_poly <- mean(test_data$Purchase != predict(svm_poly, newdata = test_data))

# (d) Cross-validate the degree-2 polynomial SVM over five cost values.
tune_poly <- tune(
  svm,
  Purchase ~ .,
  data = train_data,
  ranges = list(cost = c(0.01, 0.1, 1, 5, 10)),
  kernel = "polynomial",
  degree = 2
)

summary(tune_poly)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##    10
## 
## - best performance: 0.17875 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.38250 0.04866267
## 2  0.10 0.32500 0.06508541
## 3  1.00 0.20000 0.06692658
## 4  5.00 0.18375 0.04860913
## 5 10.00 0.17875 0.05138701
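  • Very high number of support vectors (627 out of 800) at cost = 0.01, the same as for the radial SVM

  • Best cost = 10 with lowest cross-validation error (0.179).

  • As cost increases, error decreases. The radial SVM's error also dropped sharply as cost rose from 0.01, but it reached its minimum at cost = 1 and rose slightly after that.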

# (e) Refit the degree-2 polynomial SVM with cost fixed at 10, then score
# both splits.
best_poly_model <- svm(
  Purchase ~ .,
  data = train_data,
  cost = 10,
  kernel = "polynomial",
  degree = 2
)
train_error_best_poly <- mean(train_data$Purchase != predict(best_poly_model, newdata = train_data))
test_error_best_poly <- mean(test_data$Purchase != predict(best_poly_model, newdata = test_data))

cat("Best Training Error (Poly):", train_error_best_poly, "\n")
## Best Training Error (Poly): 0.145
cat("Best Test Error (Poly):", test_error_best_poly, "\n")
## Best Test Error (Poly): 0.2037037
  • Lowest training loss till now (among the best models) but higher test loss
# One row per tuned model, with its training and test error side by side.
# The column vectors are built inside local() so that only error_summary is
# left in the workspace.
error_summary <- local({
  model_names <- c("Linear (Tuned)", "Radial (Tuned)", "Poly (Tuned)")
  train_rates <- c(train_error_best_linear, train_error_best_radial, train_error_best_poly)
  test_rates <- c(test_error_best_linear, test_error_best_radial, test_error_best_poly)
  data.frame(
    Model = model_names,
    Training_Error = train_rates,
    Test_Error = test_rates
  )
})

print(error_summary)
##            Model Training_Error Test_Error
## 1 Linear (Tuned)        0.15625  0.1962963
## 2 Radial (Tuned)        0.14625  0.1925926
## 3   Poly (Tuned)        0.14500  0.2037037

Conclusions:

  • Polynomial SVM had the lowest training error, very slightly better than radial.

  • All models are fairly close, so none are drastically over- or underfitting.

  • Radial kernel performs best overall, with the lowest test error (19.26%).

  • Linear is a close second (19.63%).

  • Polynomial SVM had the worst test error (20.37%), even though its training error was low — suggesting some overfitting to training data.

  • Therefore, Best model for this dataset: Radial SVM (cost = 1).