Problem 5: Logistic Regression & SVM with Nonlinear Boundaries
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("e1071")
library(e1071)
set.seed(123) # For reproducibility
# (a) Generate data with quadratic boundary
n <- 500
x1 <- runif(n) - 0.5
x2 <- runif(n) - 0.5
y <- as.factor(1 * (x1^2 - x2^2 > 0)) # Class label: 1 if x1^2 > x2^2 (i.e., |x1| > |x2|), else 0
# (b) Plot observations colored by class
plot(x1, x2, col = ifelse(y == 1, "blue", "red"), pch = 19,
xlab = "X1", ylab = "X2", main = "Data colored by true class")

# (c) Fit logistic regression with linear terms
data_df <- data.frame(x1 = x1, x2 = x2, y = y)
logistic_lin <- glm(y ~ x1 + x2, data = data_df, family = "binomial")
# (d) Predict classes with linear model & plot
prob_lin <- predict(logistic_lin, type = "response")
pred_lin <- ifelse(prob_lin > 0.5, 1, 0)
plot(x1, x2, col = ifelse(pred_lin == 1, "blue", "red"), pch = 19,
xlab = "X1", ylab = "X2", main = "Predicted class (linear logistic regression)")

# (e) Fit logistic regression with nonlinear terms
# Using quadratic and interaction terms
logistic_quad <- glm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2),
data = data_df, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
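# Note: these warnings indicate (near-)perfect separation. The true boundary
# x1^2 - x2^2 = 0 lies exactly in the span of the added quadratic terms, so the
# maximum likelihood coefficients diverge; the fitted class predictions below
# are still usable for plotting.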
# (f) Predict and plot nonlinear logistic regression results
prob_quad <- predict(logistic_quad, type = "response")
pred_quad <- ifelse(prob_quad > 0.5, 1, 0)
plot(x1, x2, col = ifelse(pred_quad == 1, "blue", "red"), pch = 19,
xlab = "X1", ylab = "X2", main = "Predicted class (nonlinear logistic regression)")

# (g) Fit a support vector classifier (linear kernel)
library(e1071)
svm_linear <- svm(y ~ x1 + x2, data = data_df, kernel = "linear", cost = 10, scale = FALSE)
# Predict and plot SVM linear
pred_svm_linear <- predict(svm_linear, data_df)
plot(x1, x2, col = ifelse(pred_svm_linear == 1, "blue", "red"), pch = 19,
xlab = "X1", ylab = "X2", main = "Predicted class (linear SVM)")

# (h) Fit SVM with nonlinear kernel (radial basis kernel)
svm_rbf <- svm(y ~ x1 + x2, data = data_df, kernel = "radial", gamma = 1, cost = 10, scale = FALSE)
# Predict and plot SVM RBF
pred_svm_rbf <- predict(svm_rbf, data_df)
plot(x1, x2, col = ifelse(pred_svm_rbf == 1, "blue", "red"), pch = 19,
xlab = "X1", ylab = "X2", main = "Predicted class (RBF kernel SVM)")

# (i) Comments:
# - The linear logistic regression and the linear SVM both produce linear decision
#   boundaries, which cannot capture the true quadratic boundary |x1| = |x2|.
# - Logistic regression with quadratic and interaction terms recovers an essentially
#   perfect nonlinear boundary, since the true boundary lies in the span of the added features.
# - The RBF-kernel SVM also learns a nonlinear boundary close to the truth, performing
#   comparably to quadratic logistic regression without an explicit feature transformation,
#   as the error comparison below confirms.
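To make the comparison concrete, here is a minimal sketch (assuming the predictions pred_lin, pred_quad, pred_svm_linear, and pred_svm_rbf from above are still in the workspace) that tabulates each model's training misclassification rate:
# Training misclassification rate for each of the four fits above
train_err_p5 <- c(
  linear_logistic    = mean(pred_lin != y),
  quadratic_logistic = mean(pred_quad != y),
  linear_svm         = mean(pred_svm_linear != y),
  rbf_svm            = mean(pred_svm_rbf != y)
)
round(train_err_p5, 3)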
Problem 7: SVM for Gas Mileage Classification on Auto Data
library(ISLR2)
library(e1071)
set.seed(123)
data(Auto)
# (a) Create binary mpg variable: 1 if mpg > median, else 0
median_mpg <- median(Auto$mpg)
Auto$mpg01 <- as.factor(ifelse(Auto$mpg > median_mpg, 1, 0))
# Remove mpg variable for prediction
Auto2 <- Auto[, !(names(Auto) == "mpg")]
# (b) Fit SVM classifier with linear kernel and different cost values
cost_values <- c(0.01, 0.1, 1, 10, 100)
cv_errors <- numeric(length(cost_values))
for (i in seq_along(cost_values)) {
svm_fit <- svm(mpg01 ~ ., data = Auto2, kernel = "linear", cost = cost_values[i], scale = TRUE)
preds <- predict(svm_fit, Auto2)
cv_errors[i] <- mean(preds != Auto2$mpg01)
}
# Report training error for each cost (these are resubstitution errors on the full data, not CV errors)
data.frame(Cost = cost_values, Error = cv_errors)
## Cost Error
## 1 1e-02 0.086734694
## 2 1e-01 0.079081633
## 3 1e+00 0.028061224
## 4 1e+01 0.015306122
## 5 1e+02 0.007653061
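Because the errors above are computed on the same observations used for fitting, they understate test error, particularly at large cost. A minimal sketch of how genuine cross-validation errors could be obtained over the same cost grid with e1071::tune() (output not shown):
# 10-fold CV over the same cost grid (sketch; output not shown)
set.seed(123)
tune_linear_auto <- tune(svm, mpg01 ~ ., data = Auto2, kernel = "linear",
                         ranges = list(cost = cost_values))
summary(tune_linear_auto) # CV error and dispersion for each cost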
# (c) Repeat with radial and polynomial kernels
gamma_values <- c(0.01, 0.1, 1)
degree_values <- 2:3
results_rbf <- expand.grid(Cost = cost_values, Gamma = gamma_values, Error = NA)
results_poly <- expand.grid(Cost = cost_values, Degree = degree_values, Error = NA)
# Radial kernel: training error over a grid of cost and gamma
for (i in 1:nrow(results_rbf)) {
svm_fit <- svm(mpg01 ~ ., data = Auto2, kernel = "radial",
cost = results_rbf$Cost[i], gamma = results_rbf$Gamma[i], scale = TRUE)
preds <- predict(svm_fit, Auto2)
results_rbf$Error[i] <- mean(preds != Auto2$mpg01)
}
# Polynomial kernel: training error over a grid of cost and degree
for (i in 1:nrow(results_poly)) {
svm_fit <- svm(mpg01 ~ ., data = Auto2, kernel = "polynomial",
cost = results_poly$Cost[i], degree = results_poly$Degree[i], scale = TRUE)
preds <- predict(svm_fit, Auto2)
results_poly$Error[i] <- mean(preds != Auto2$mpg01)
}
# Show results
results_rbf
## Cost Gamma Error
## 1 1e-02 0.01 0.178571429
## 2 1e-01 0.01 0.112244898
## 3 1e+00 0.01 0.084183673
## 4 1e+01 0.01 0.066326531
## 5 1e+02 0.01 0.015306122
## 6 1e-02 0.10 0.094387755
## 7 1e-01 0.10 0.086734694
## 8 1e+00 0.10 0.076530612
## 9 1e+01 0.10 0.010204082
## 10 1e+02 0.10 0.002551020
## 11 1e-02 1.00 0.017857143
## 12 1e-01 1.00 0.017857143
## 13 1e+00 1.00 0.007653061
## 14 1e+01 1.00 0.000000000
## 15 1e+02 1.00 0.000000000
results_poly
## Cost Degree Error
## 1 1e-02 2 0.4311224
## 2 1e-01 2 0.4311224
## 3 1e+00 2 0.4311224
## 4 1e+01 2 0.4311224
## 5 1e+02 2 0.2933673
## 6 1e-02 3 0.4311224
## 7 1e-01 3 0.4311224
## 8 1e+00 3 0.4311224
## 9 1e+01 3 0.4311224
## 10 1e+02 3 0.4030612
# (d) Plots: training error vs cost for linear, radial, and polynomial kernels
library(ggplot2)
# Linear kernel data frame
df_linear <- data.frame(Cost = cost_values, Error = cv_errors, Kernel = "Linear")
# Radial kernel aggregate: best gamma for each cost
df_rbf <- aggregate(Error ~ Cost, data = results_rbf, FUN = min)
df_rbf$Kernel <- "Radial"
# Polynomial kernel aggregate: best degree for each cost
df_poly <- aggregate(Error ~ Cost, data = results_poly, FUN = min)
df_poly$Kernel <- "Polynomial"
df_all <- rbind(df_linear, df_rbf, df_poly)
ggplot(df_all, aes(x = Cost, y = Error, color = Kernel)) +
geom_line() + geom_point() +
scale_x_log10() +
labs(title = "SVM Classification Error vs Cost",
x = "Cost (log scale)",
y = "Classification Error") +
theme_minimal()
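# Comment on the plot: measured by training error, the radial kernel (best gamma per
# cost) reaches zero error at large cost, the linear kernel falls steadily below 1%,
# and the polynomial kernel never drops below roughly 29%. Since these are training
# errors, the radial kernel's edge at large cost/gamma likely reflects overfitting.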

Problem 8: SVM on OJ Dataset
library(ISLR2)
library(e1071)
set.seed(123)
data(OJ)
# (a) Create training (800) and test sets
train_idx <- sample(1:nrow(OJ), 800)
train <- OJ[train_idx, ]
test <- OJ[-train_idx, ]
# (b) Fit a support vector classifier (linear kernel) with cost = 0.01
svm_fit <- svm(Purchase ~ ., data = train, kernel = "linear", cost = 0.01)
summary(svm_fit)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 442
##
## ( 220 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
# (c) Training and test error rates
train_pred <- predict(svm_fit, train)
test_pred <- predict(svm_fit, test)
train_err <- mean(train_pred != train$Purchase)
test_err <- mean(test_pred != test$Purchase)
cat("Training error:", train_err, "\nTest error:", test_err, "\n")
## Training error: 0.165
## Test error: 0.1777778
# (d) Tune cost parameter from 0.01 to 10 using tune()
tune_out <- tune(svm, Purchase ~ ., data = train, kernel = "linear",
ranges = list(cost = seq(0.01, 10, length.out = 10)))
summary(tune_out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5.56
##
## - best performance: 0.16375
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17625 0.03143004
## 2 1.12 0.16875 0.03596391
## 3 2.23 0.16625 0.03537988
## 4 3.34 0.16500 0.02934469
## 5 4.45 0.16750 0.02898755
## 6 5.56 0.16375 0.02972676
## 7 6.67 0.16625 0.02949223
## 8 7.78 0.17125 0.02829041
## 9 8.89 0.17125 0.02829041
## 10 10.00 0.17250 0.02751262
# (e) Use best cost from tuning to fit and evaluate
best_cost <- tune_out$best.parameters$cost
svm_best <- svm(Purchase ~ ., data = train, kernel = "linear", cost = best_cost)
train_pred_best <- predict(svm_best, train)
test_pred_best <- predict(svm_best, test)
cat("Training error (best cost):", mean(train_pred_best != train$Purchase), "\n")
## Training error (best cost): 0.1625
cat("Test error (best cost):", mean(test_pred_best != test$Purchase), "\n")
## Test error (best cost): 0.1666667
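Note that tune() also refits the model at the best parameters on the full training set; a minimal equivalent sketch using tune_out$best.model:
# Equivalent: tune() stores the refit at the best cost in $best.model
svm_best_alt <- tune_out$best.model
mean(predict(svm_best_alt, test) != test$Purchase) # should reproduce the test error above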
# (f) Repeat (b)-(e) for radial kernel (default gamma)
svm_rbf_fit <- svm(Purchase ~ ., data = train, kernel = "radial", cost = 0.01)
summary(svm_rbf_fit)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "radial", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 629
##
## ( 313 316 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train_pred_rbf <- predict(svm_rbf_fit, train)
test_pred_rbf <- predict(svm_rbf_fit, test)
cat("RBF Training error:", mean(train_pred_rbf != train$Purchase), "\n")
## RBF Training error: 0.39125
cat("RBF Test error:", mean(test_pred_rbf != test$Purchase), "\n")
## RBF Test error: 0.3851852
tune_rbf <- tune(svm, Purchase ~ ., data = train, kernel = "radial",
ranges = list(cost = seq(0.01, 10, length.out = 10)))
summary(tune_rbf)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1.12
##
## - best performance: 0.1625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39125 0.04411554
## 2 1.12 0.16250 0.03486083
## 3 2.23 0.16500 0.03216710
## 4 3.34 0.16750 0.03343734
## 5 4.45 0.16750 0.03184162
## 6 5.56 0.17000 0.03545341
## 7 6.67 0.17125 0.03955042
## 8 7.78 0.17000 0.03961621
## 9 8.89 0.16625 0.03586723
## 10 10.00 0.16500 0.03622844
best_cost_rbf <- tune_rbf$best.parameters$cost
svm_rbf_best <- svm(Purchase ~ ., data = train, kernel = "radial", cost = best_cost_rbf)
train_pred_rbf_best <- predict(svm_rbf_best, train)
test_pred_rbf_best <- predict(svm_rbf_best, test)
cat("RBF Training error (best cost):", mean(train_pred_rbf_best != train$Purchase), "\n")
## RBF Training error (best cost): 0.1375
cat("RBF Test error (best cost):", mean(test_pred_rbf_best != test$Purchase), "\n")
## RBF Test error (best cost): 0.1851852
# (g) Repeat (b)-(e) for polynomial kernel with degree = 2
svm_poly_fit <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = 0.01)
summary(svm_poly_fit)
##
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "polynomial",
## degree = 2, cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 631
##
## ( 313 318 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train_pred_poly <- predict(svm_poly_fit, train)
test_pred_poly <- predict(svm_poly_fit, test)
cat("Polynomial Training error:", mean(train_pred_poly != train$Purchase), "\n")
## Polynomial Training error: 0.3725
cat("Polynomial Test error:", mean(test_pred_poly != test$Purchase), "\n")
## Polynomial Test error: 0.3740741
tune_poly <- tune(svm, Purchase ~ ., data = train, kernel = "polynomial", degree = 2,
ranges = list(cost = seq(0.01, 10, length.out = 10)))
summary(tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 6.67
##
## - best performance: 0.16375
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.38750 0.04289846
## 2 1.12 0.18125 0.03294039
## 3 2.23 0.17250 0.03717451
## 4 3.34 0.17000 0.03827895
## 5 4.45 0.17000 0.04005205
## 6 5.56 0.16875 0.04093101
## 7 6.67 0.16375 0.03884174
## 8 7.78 0.16750 0.03736085
## 9 8.89 0.17000 0.03827895
## 10 10.00 0.17125 0.03729108
best_cost_poly <- tune_poly$best.parameters$cost
svm_poly_best <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = best_cost_poly)
train_pred_poly_best <- predict(svm_poly_best, train)
test_pred_poly_best <- predict(svm_poly_best, test)
cat("Polynomial Training error (best cost):", mean(train_pred_poly_best != train$Purchase), "\n")
## Polynomial Training error (best cost): 0.1425
cat("Polynomial Test error (best cost):", mean(test_pred_poly_best != test$Purchase), "\n")
## Polynomial Test error (best cost): 0.1962963
# (h) Overall best approach:
# Comparing the test errors at the tuned cost values: linear kernel 0.167, radial kernel
# 0.185, polynomial (degree 2) kernel 0.196. On this train/test split the linear SVM with
# the tuned cost achieves the lowest test error, so it appears to be the best approach for
# the OJ data, with the radial kernel a close second; the comparison is tabulated below.
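A minimal sketch (assuming the tuned predictions above are still in the workspace) that collects the test errors side by side:
# Test error at the tuned cost for each kernel
data.frame(
  Kernel = c("Linear", "Radial", "Polynomial (degree 2)"),
  TestError = c(mean(test_pred_best != test$Purchase),
                mean(test_pred_rbf_best != test$Purchase),
                mean(test_pred_poly_best != test$Purchase))
)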