Problem 5: Logistic Regression & SVM with Nonlinear Boundaries

options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("e1071")
## 
## The downloaded binary packages are in
##  /var/folders/n6/kts7k_nx3v3208p01m5x0p_00000gn/T//RtmpPhvihc/downloaded_packages
library(e1071)


set.seed(123)  # For reproducibility

# (a) Generate data with quadratic boundary
n <- 500
x1 <- runif(n) - 0.5
x2 <- runif(n) - 0.5
y <- as.factor(1 * (x1^2 - x2^2 > 0))  # Class labels 0/1

# (b) Plot observations colored by class
plot(x1, x2, col = ifelse(y == 1, "blue", "red"), pch = 19,
     xlab = "X1", ylab = "X2", main = "Data colored by true class")

# (c) Fit logistic regression with linear terms
data_df <- data.frame(x1 = x1, x2 = x2, y = y)
logistic_lin <- glm(y ~ x1 + x2, data = data_df, family = "binomial")

# (d) Predict classes with linear model & plot
prob_lin <- predict(logistic_lin, type = "response")
pred_lin <- ifelse(prob_lin > 0.5, 1, 0)

plot(x1, x2, col = ifelse(pred_lin == 1, "blue", "red"), pch = 19,
     xlab = "X1", ylab = "X2", main = "Predicted class (linear logistic regression)")

# (e) Fit logistic regression with nonlinear terms
# Using quadratic and interaction terms
logistic_quad <- glm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), 
                     data = data_df, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# (f) Predict and plot nonlinear logistic regression results
prob_quad <- predict(logistic_quad, type = "response")
pred_quad <- ifelse(prob_quad > 0.5, 1, 0)

plot(x1, x2, col = ifelse(pred_quad == 1, "blue", "red"), pch = 19,
     xlab = "X1", ylab = "X2", main = "Predicted class (nonlinear logistic regression)")

# (g) Fit a support vector classifier (linear kernel)
library(e1071)
svm_linear <- svm(y ~ x1 + x2, data = data_df, kernel = "linear", cost = 10, scale = FALSE)

# Predict and plot SVM linear
pred_svm_linear <- predict(svm_linear, data_df)
plot(x1, x2, col = ifelse(pred_svm_linear == 1, "blue", "red"), pch = 19,
     xlab = "X1", ylab = "X2", main = "Predicted class (linear SVM)")

# (h) Fit SVM with nonlinear kernel (radial basis kernel)
svm_rbf <- svm(y ~ x1 + x2, data = data_df, kernel = "radial", gamma = 1, cost = 10, scale = FALSE)

# Predict and plot SVM RBF
pred_svm_rbf <- predict(svm_rbf, data_df)
plot(x1, x2, col = ifelse(pred_svm_rbf == 1, "blue", "red"), pch = 19,
     xlab = "X1", ylab = "X2", main = "Predicted class (RBF kernel SVM)")

# (i) Comments:
# - The linear logistic regression and linear SVM produce linear decision boundaries.
# - The nonlinear logistic regression model (with quadratic terms) produces a nonlinear boundary capturing the data better.
# - The RBF kernel SVM also produces a nonlinear boundary and should match or outperform nonlinear logistic regression.

Problem 7: SVM for Gas Mileage Classification on Auto Data

library(ISLR2)
library(e1071)
set.seed(123)

data(Auto)

# (a) Create binary mpg variable: 1 if mpg > median, else 0
median_mpg <- median(Auto$mpg)
Auto$mpg01 <- as.factor(ifelse(Auto$mpg > median_mpg, 1, 0))

# Remove mpg variable for prediction
Auto2 <- Auto[, !(names(Auto) == "mpg")]

# (b) Fit SVM classifier with linear kernel and different cost values
cost_values <- c(0.01, 0.1, 1, 10, 100)

cv_errors <- numeric(length(cost_values))

for (i in seq_along(cost_values)) {
  svm_fit <- svm(mpg01 ~ ., data = Auto2, kernel = "linear", cost = cost_values[i], scale = TRUE)
  preds <- predict(svm_fit, Auto2)
  cv_errors[i] <- mean(preds != Auto2$mpg01)
}

# Report CV errors for different cost
data.frame(Cost = cost_values, Error = cv_errors)
##    Cost       Error
## 1 1e-02 0.086734694
## 2 1e-01 0.079081633
## 3 1e+00 0.028061224
## 4 1e+01 0.015306122
## 5 1e+02 0.007653061
# (c) Repeat with radial and polynomial kernels
gamma_values <- c(0.01, 0.1, 1)
degree_values <- 2:3

results_rbf <- expand.grid(Cost = cost_values, Gamma = gamma_values, Error = NA)
results_poly <- expand.grid(Cost = cost_values, Degree = degree_values, Error = NA)

# Radial kernel tuning
for (i in 1:nrow(results_rbf)) {
  svm_fit <- svm(mpg01 ~ ., data = Auto2, kernel = "radial",
                 cost = results_rbf$Cost[i], gamma = results_rbf$Gamma[i], scale = TRUE)
  preds <- predict(svm_fit, Auto2)
  results_rbf$Error[i] <- mean(preds != Auto2$mpg01)
}

# Polynomial kernel tuning
for (i in 1:nrow(results_poly)) {
  svm_fit <- svm(mpg01 ~ ., data = Auto2, kernel = "polynomial",
                 cost = results_poly$Cost[i], degree = results_poly$Degree[i], scale = TRUE)
  preds <- predict(svm_fit, Auto2)
  results_poly$Error[i] <- mean(preds != Auto2$mpg01)
}

# Show results
results_rbf
##     Cost Gamma       Error
## 1  1e-02  0.01 0.178571429
## 2  1e-01  0.01 0.112244898
## 3  1e+00  0.01 0.084183673
## 4  1e+01  0.01 0.066326531
## 5  1e+02  0.01 0.015306122
## 6  1e-02  0.10 0.094387755
## 7  1e-01  0.10 0.086734694
## 8  1e+00  0.10 0.076530612
## 9  1e+01  0.10 0.010204082
## 10 1e+02  0.10 0.002551020
## 11 1e-02  1.00 0.017857143
## 12 1e-01  1.00 0.017857143
## 13 1e+00  1.00 0.007653061
## 14 1e+01  1.00 0.000000000
## 15 1e+02  1.00 0.000000000
results_poly
##     Cost Degree     Error
## 1  1e-02      2 0.4311224
## 2  1e-01      2 0.4311224
## 3  1e+00      2 0.4311224
## 4  1e+01      2 0.4311224
## 5  1e+02      2 0.2933673
## 6  1e-02      3 0.4311224
## 7  1e-01      3 0.4311224
## 8  1e+00      3 0.4311224
## 9  1e+01      3 0.4311224
## 10 1e+02      3 0.4030612
# (d) Plots: Error vs Cost for linear, radial, and polynomial kernels
library(ggplot2)

# Linear kernel data frame
df_linear <- data.frame(Cost = cost_values, Error = cv_errors, Kernel = "Linear")

# Radial kernel aggregate: best gamma for each cost
df_rbf <- aggregate(Error ~ Cost, data = results_rbf, FUN = min)
df_rbf$Kernel <- "Radial"

# Polynomial kernel aggregate: best degree for each cost
df_poly <- aggregate(Error ~ Cost, data = results_poly, FUN = min)
df_poly$Kernel <- "Polynomial"

df_all <- rbind(df_linear, df_rbf, df_poly)

ggplot(df_all, aes(x = Cost, y = Error, color = Kernel)) +
  geom_line() + geom_point() +
  scale_x_log10() +
  labs(title = "SVM Classification Error vs Cost",
       x = "Cost (log scale)",
       y = "Classification Error") +
  theme_minimal()


Problem 8: SVM on OJ Dataset

library(ISLR2)
library(e1071)
set.seed(123)

data(OJ)

# (a) Create training (800) and test sets
train_idx <- sample(1:nrow(OJ), 800)
train <- OJ[train_idx, ]
test <- OJ[-train_idx, ]

# (b) Fit SVM classifier with cost = 0.01, linear kernel by default
svm_fit <- svm(Purchase ~ ., data = train, kernel = "linear", cost = 0.01)
summary(svm_fit)
## 
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "linear", cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  442
## 
##  ( 220 222 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
# (c) Training and test error rates
train_pred <- predict(svm_fit, train)
test_pred <- predict(svm_fit, test)

train_err <- mean(train_pred != train$Purchase)
test_err <- mean(test_pred != test$Purchase)
cat("Training error:", train_err, "\nTest error:", test_err, "\n")
## Training error: 0.165 
## Test error: 0.1777778
# (d) Tune cost parameter from 0.01 to 10 using tune()
tune_out <- tune(svm, Purchase ~ ., data = train, kernel = "linear",
                 ranges = list(cost = seq(0.01, 10, length.out = 10)))
summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  5.56
## 
## - best performance: 0.16375 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.17625 0.03143004
## 2   1.12 0.16875 0.03596391
## 3   2.23 0.16625 0.03537988
## 4   3.34 0.16500 0.02934469
## 5   4.45 0.16750 0.02898755
## 6   5.56 0.16375 0.02972676
## 7   6.67 0.16625 0.02949223
## 8   7.78 0.17125 0.02829041
## 9   8.89 0.17125 0.02829041
## 10 10.00 0.17250 0.02751262
# (e) Use best cost from tuning to fit and evaluate
best_cost <- tune_out$best.parameters$cost
svm_best <- svm(Purchase ~ ., data = train, kernel = "linear", cost = best_cost)

train_pred_best <- predict(svm_best, train)
test_pred_best <- predict(svm_best, test)

cat("Training error (best cost):", mean(train_pred_best != train$Purchase), "\n")
## Training error (best cost): 0.1625
cat("Test error (best cost):", mean(test_pred_best != test$Purchase), "\n")
## Test error (best cost): 0.1666667
# (f) Repeat (b)-(e) for radial kernel (default gamma)
svm_rbf_fit <- svm(Purchase ~ ., data = train, kernel = "radial", cost = 0.01)
summary(svm_rbf_fit)
## 
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "radial", cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.01 
## 
## Number of Support Vectors:  629
## 
##  ( 313 316 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
train_pred_rbf <- predict(svm_rbf_fit, train)
test_pred_rbf <- predict(svm_rbf_fit, test)
cat("RBF Training error:", mean(train_pred_rbf != train$Purchase), "\n")
## RBF Training error: 0.39125
cat("RBF Test error:", mean(test_pred_rbf != test$Purchase), "\n")
## RBF Test error: 0.3851852
tune_rbf <- tune(svm, Purchase ~ ., data = train, kernel = "radial",
                 ranges = list(cost = seq(0.01, 10, length.out = 10)))
summary(tune_rbf)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  1.12
## 
## - best performance: 0.1625 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.39125 0.04411554
## 2   1.12 0.16250 0.03486083
## 3   2.23 0.16500 0.03216710
## 4   3.34 0.16750 0.03343734
## 5   4.45 0.16750 0.03184162
## 6   5.56 0.17000 0.03545341
## 7   6.67 0.17125 0.03955042
## 8   7.78 0.17000 0.03961621
## 9   8.89 0.16625 0.03586723
## 10 10.00 0.16500 0.03622844
best_cost_rbf <- tune_rbf$best.parameters$cost
svm_rbf_best <- svm(Purchase ~ ., data = train, kernel = "radial", cost = best_cost_rbf)

train_pred_rbf_best <- predict(svm_rbf_best, train)
test_pred_rbf_best <- predict(svm_rbf_best, test)
cat("RBF Training error (best cost):", mean(train_pred_rbf_best != train$Purchase), "\n")
## RBF Training error (best cost): 0.1375
cat("RBF Test error (best cost):", mean(test_pred_rbf_best != test$Purchase), "\n")
## RBF Test error (best cost): 0.1851852
# (g) Repeat (b)-(e) for polynomial kernel with degree = 2
svm_poly_fit <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = 0.01)
summary(svm_poly_fit)
## 
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "polynomial", 
##     degree = 2, cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.01 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  631
## 
##  ( 313 318 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
train_pred_poly <- predict(svm_poly_fit, train)
test_pred_poly <- predict(svm_poly_fit, test)
cat("Polynomial Training error:", mean(train_pred_poly != train$Purchase), "\n")
## Polynomial Training error: 0.3725
cat("Polynomial Test error:", mean(test_pred_poly != test$Purchase), "\n")
## Polynomial Test error: 0.3740741
tune_poly <- tune(svm, Purchase ~ ., data = train, kernel = "polynomial", degree = 2,
                  ranges = list(cost = seq(0.01, 10, length.out = 10)))
summary(tune_poly)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  6.67
## 
## - best performance: 0.16375 
## 
## - Detailed performance results:
##     cost   error dispersion
## 1   0.01 0.38750 0.04289846
## 2   1.12 0.18125 0.03294039
## 3   2.23 0.17250 0.03717451
## 4   3.34 0.17000 0.03827895
## 5   4.45 0.17000 0.04005205
## 6   5.56 0.16875 0.04093101
## 7   6.67 0.16375 0.03884174
## 8   7.78 0.16750 0.03736085
## 9   8.89 0.17000 0.03827895
## 10 10.00 0.17125 0.03729108
best_cost_poly <- tune_poly$best.parameters$cost
svm_poly_best <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = best_cost_poly)

train_pred_poly_best <- predict(svm_poly_best, train)
test_pred_poly_best <- predict(svm_poly_best, test)
cat("Polynomial Training error (best cost):", mean(train_pred_poly_best != train$Purchase), "\n")
## Polynomial Training error (best cost): 0.1425
cat("Polynomial Test error (best cost):", mean(test_pred_poly_best != test$Purchase), "\n")
## Polynomial Test error (best cost): 0.1962963
# (h) Overall best approach:
# You can compare test errors from svm_best, svm_rbf_best, svm_poly_best to pick best.