Assignment8

Chapter 9 Questions

library(ggplot2)
library(e1071)
library(ISLR2)
library(caret)

set.seed(1)
n <- 500

x1 <- runif(n, -2, 2)
x2 <- runif(n, -2, 2)

y <- ifelse(x1^2 + x2 < 1.5, 1, 0)

quad_data <- data.frame(x1 = x1, x2 = x2, class = as.factor(y))

ggplot(quad_data, aes(x = x1, y = x2, color = class)) +
  geom_point(size = 2, alpha = 0.7) +
  scale_color_manual(values = c("red", "blue"),
                     labels = c("0", "1")) +
  labs(title = "Simulated Data with Quadratic Decision Boundary",
       x = "X1",
       y = "X2",
       color = "Class") +
  theme_minimal()

logit_model <- glm(class ~ x1 + x2, data = quad_data, family = "binomial")
summary(logit_model)

## 
## Call:
## glm(formula = class ~ x1 + x2, family = "binomial", data = quad_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.32292    0.11369   2.840  0.00451 ** 
## x1          -0.16043    0.09825  -1.633  0.10252    
## x2          -1.33911    0.11823 -11.327  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 688.53  on 499  degrees of freedom
## Residual deviance: 488.56  on 497  degrees of freedom
## AIC: 494.56
## 
## Number of Fisher Scoring iterations: 4

pred_probs <- predict(logit_model, type = "response")
pred_class <- ifelse(pred_probs > 0.5, 1, 0)
quad_data$pred_class <- as.factor(pred_class)

ggplot(quad_data, aes(x = x1, y = x2, color = pred_class)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(title = "Logistic Regression Predictions",
       subtitle = "Linear Decision Boundary",
       x = "X1", y = "X2", color = "Predicted Class") +
  scale_color_manual(values = c("red", "blue"),
                     labels = c("0", "1")) +
  theme_minimal()

logit_quad <- glm(class ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2),
                  data = quad_data, family = "binomial")
summary(logit_quad)

## 
## Call:
## glm(formula = class ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2), 
##     family = "binomial", data = quad_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1008.623  41825.007   0.024    0.981
## x1             -9.823   4592.034  -0.002    0.998
## x2           -676.769  28270.590  -0.024    0.981
## I(x1^2)      -677.016  28199.805  -0.024    0.981
## I(x2^2)         5.885   4647.415   0.001    0.999
## I(x1 * x2)    -11.103   4136.228  -0.003    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.8853e+02  on 499  degrees of freedom
## Residual deviance: 1.8513e-06  on 494  degrees of freedom
## AIC: 12
## 
## Number of Fisher Scoring iterations: 25

quad_probs <- predict(logit_quad, type = "response")
quad_preds <- ifelse(quad_probs > 0.5, 1, 0)
quad_data$quad_class <- as.factor(quad_preds)

x1_grid <- seq(min(quad_data$x1), max(quad_data$x1), length = 100)
x2_grid <- seq(min(quad_data$x2), max(quad_data$x2), length = 100)
grid <- expand.grid(x1 = x1_grid, x2 = x2_grid)

grid$x1x2 <- grid$x1 * grid$x2
grid$x1sq <- grid$x1^2
grid$x2sq <- grid$x2^2

grid_pred <- predict(logit_quad,
                     newdata = data.frame(x1 = grid$x1,
                                          x2 = grid$x2,
                                          `I(x1^2)` = grid$x1sq,
                                          `I(x2^2)` = grid$x2sq,
                                          `I(x1 * x2)` = grid$x1x2),
                     type = "response")

grid$pred_class <- as.factor(ifelse(grid_pred > 0.5, 1, 0))

ggplot() +
  geom_tile(data = grid, aes(x = x1, y = x2, fill = pred_class), alpha = 0.4) +
  geom_point(data = quad_data, aes(x = x1, y = x2, color = quad_class), size = 2) +
  scale_fill_manual(values = c("pink", "lightblue")) +
  scale_color_manual(values = c("red", "blue"),
                     labels = c("0", "1")) +
  labs(title = "Logistic Regression with Non-Linear Terms",
       subtitle = "Curved Decision Boundary",
       x = "X1", y = "X2", color = "Predicted Class") +
  theme_minimal()

svc_model <- svm(class ~ x1 + x2, data = quad_data, kernel = "linear", cost = 1, scale = TRUE)

svc_preds <- predict(svc_model, newdata = quad_data)
quad_data$svc_class <- svc_preds

ggplot(quad_data, aes(x = x1, y = x2, color = svc_class)) +
  geom_point(size = 2, alpha = 0.7) +
  labs(title = "Support Vector Classifier Predictions",
       x = "X1", y = "X2", color = "Predicted Class") +
  scale_color_manual(values = c("red", "blue")) +
  theme_minimal()

svm_rbf <- svm(class ~ x1 + x2, data = quad_data, kernel = "radial", cost = 1, gamma = 1)

rbf_preds <- predict(svm_rbf, newdata = quad_data)
quad_data$rbf_class <- rbf_preds

ggplot(quad_data, aes(x = x1, y = x2, color = rbf_class)) +
  geom_point(size = 2, alpha = 0.7) +
  scale_color_manual(values = c("red", "blue")) +
  labs(title = "SVM with RBF Kernel",
       subtitle = "Predicted Class Labels (Non-Linear Boundary)",
       x = "X1", y = "X2", color = "Predicted Class") +
  theme_minimal()

i. The real separation between the two classes follows a curved shape, so different models were tried to see which could capture this best. A simple logistic regression drew a straight line, which didn’t match the true pattern very well. When curved terms were added to the logistic model, the predictions got better and the decision boundary became more flexible and matched the shape of the data. The SVM with the curved boundary wrapped around the Class 1 points in the middle, which was expected. This shows that more flexible models are better at handling complicated shapes in data, especially when the boundary between classes isn’t just a straight line.

data(Auto)

mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)

b. A lower C means the model allows some mistakes to keep things simple, while a higher C tries to fit the training data more closely. In the test, the lowest error came when C was small. That means a simpler model actually made more accurate predictions. When C got too large, the model became too strict and didn’t perform as well.

Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto$mpg01 <- as.factor(Auto$mpg01)

Auto_svm <- Auto[, !(names(Auto) %in% c("mpg"))]

set.seed(1)
train_control <- trainControl(method = "cv", number = 10)

cost_values <- c(0.01, 0.1, 1, 10, 100)
svm_results <- data.frame(Cost = cost_values, CV_Error = NA)

for (i in seq_along(cost_values)) {
  svm_fit <- train(mpg01 ~ ., data = Auto_svm,
                   method = "svmLinear",
                   trControl = train_control,
                   tuneGrid = data.frame(C = cost_values[i]))
  
  svm_results$CV_Error[i] <- 1 - max(svm_fit$results$Accuracy)
}

print(svm_results)

##    Cost  CV_Error
## 1 1e-02 0.0894973
## 2 1e-01 0.0895108
## 3 1e+00 0.1024764
## 4 1e+01 0.0996390
## 5 1e+02 0.1045108

c. Comparing findings from using a radial kernel to a polynomial kernel. The best radial settings used a cost value of 10, and the model made its decisions using 379 support vectors. The model was correct about 92% of the time, which means it made very few mistakes. The polynomial model used a degree of 2 and a cost value of 10 to let it focus on fitting the training data closely. This version of the model used 392 support vectors to help separate the two classes of cars. It correctly predicted gas mileage in only about 48% of the cases. The polynomial kernel didn’t match the shape of the data very well and it struggled to make good predictions.

Auto$mpg01 <- as.factor(ifelse(Auto$mpg > median(Auto$mpg), 1, 0))
Auto_svm <- Auto[, !(names(Auto) %in% c("mpg"))]

set.seed(1)
tune_radial <- tune(svm, mpg01 ~ ., data = Auto_svm, kernel = "radial",
                    ranges = list(cost = c(0.1, 1, 10), gamma = c(0.01, 0.1, 1)))

best_radial <- tune_radial$best.model
summary(best_radial)

## 
## Call:
## best.tune(METHOD = svm, train.x = mpg01 ~ ., data = Auto_svm, ranges = list(cost = c(0.1, 
##     1, 10), gamma = c(0.01, 0.1, 1)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10 
## 
## Number of Support Vectors:  379
## 
##  ( 188 191 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

radial_cv_error <- 1 - tune_radial$best.performance
radial_cv_error

## [1] 0.9210256

set.seed(1)
tune_poly <- tune(svm, mpg01 ~ ., data = Auto_svm, kernel = "polynomial",
                  ranges = list(cost = c(0.1, 1, 10), degree = c(2, 3, 4)))

best_poly <- tune_poly$best.model
summary(best_poly)

## 
## Call:
## best.tune(METHOD = svm, train.x = mpg01 ~ ., data = Auto_svm, ranges = list(cost = c(0.1, 
##     1, 10), degree = c(2, 3, 4)), kernel = "polynomial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  10 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  392
## 
##  ( 196 196 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

poly_cv_error <- 1 - tune_poly$best.performance
poly_cv_error

## [1] 0.479359

Auto_sub <- Auto_svm[, c("horsepower", "weight", "mpg01")]

svc_model <- svm(mpg01 ~ ., data = Auto_sub, kernel = "linear", cost = 1, scale = TRUE)
svm_radial <- svm(mpg01 ~ ., data = Auto_sub, kernel = "radial", cost = 10, gamma = 0.1, scale = TRUE)
svm_poly <- svm(mpg01 ~ ., data = Auto_sub, kernel = "polynomial", cost = 10, degree = 2, coef0 = 0, scale = TRUE)

plot_svm <- function(model, data, title) {
  x_range <- seq(min(data$horsepower), max(data$horsepower), length = 200)
  y_range <- seq(min(data$weight), max(data$weight), length = 200)
  grid <- expand.grid(horsepower = x_range, weight = y_range)
  grid$pred <- predict(model, newdata = grid)
  sv <- data[model$index, ]

  ggplot() +
    geom_tile(data = grid, aes(x = horsepower, y = weight, fill = pred), alpha = 0.3) +
    geom_point(data = data, aes(x = horsepower, y = weight, color = mpg01), size = 2) +
    geom_point(data = sv, aes(x = horsepower, y = weight), shape = 21, color = "black", size = 2, stroke = 1.2) +
    labs(title = title, x = "Horsepower", y = "Weight") +
    scale_fill_manual(values = c("lightpink", "lightblue")) +
    scale_color_manual(values = c("red", "blue")) +
    theme_minimal()
}

plot_svm(svc_model, Auto_sub, "Support Vector Classifier (Linear Kernel)")

plot_svm(svm_radial, Auto_sub, "SVM with Radial Kernel")

plot_svm(svm_poly, Auto_sub, "SVM with Polynomial Kernel (Degree 2)")

data(OJ)

set.seed(1)
train_index <- sample(1:nrow(OJ), 800)

train <- OJ[train_index, ]
test <- OJ[-train_index, ]

ojsvc_model <- svm(Purchase ~ ., data = train, kernel = "linear", cost = 0.01, scale = TRUE)
summary(ojsvc_model)

## 
## Call:
## svm(formula = Purchase ~ ., data = train, kernel = "linear", cost = 0.01, 
##     scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  435
## 
##  ( 219 216 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

train_pred <- predict(ojsvc_model, newdata = train)
train_error <- mean(train_pred != train$Purchase)

test_pred <- predict(ojsvc_model, newdata = test)
test_error <- mean(test_pred != test$Purchase)

cat("Training Error Rate:", round(train_error, 4), "\n")

## Training Error Rate: 0.175

cat("Test Error Rate:", round(test_error, 4), "\n")

## Test Error Rate: 0.1778

set.seed(1)
tune_result <- tune(svm, Purchase ~ ., data = train, kernel = "linear",
                    ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))

summary(tune_result)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.1725 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.17625 0.02853482
## 2  0.10 0.17250 0.03162278
## 3  1.00 0.17500 0.02946278
## 4  5.00 0.17250 0.03162278
## 5 10.00 0.17375 0.03197764

best_svc <- tune_result$best.model

train_pred_best <- predict(best_svc, newdata = train)
train_error_best <- mean(train_pred_best != train$Purchase)

test_pred_best <- predict(best_svc, newdata = test)
test_error_best <- mean(test_pred_best != test$Purchase)

cat("Test Error with Optimal C:", round(test_error_best, 4), "\n")

## Test Error with Optimal C: 0.163

cat("Train Error with Optimal C:", round(train_error_best, 4), "\n")

## Train Error with Optimal C: 0.165

set.seed(1)
tune_radial <- tune(svm, Purchase ~ ., data = train, kernel = "radial",
                    ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))  

summary(tune_radial)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.17125 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.39375 0.04007372
## 2  0.10 0.18625 0.02853482
## 3  1.00 0.17125 0.02128673
## 4  5.00 0.18000 0.02220485
## 5 10.00 0.18625 0.02853482

best_radial <- tune_radial$best.model

train_pred_radial <- predict(best_radial, newdata = train)
train_error_radial <- mean(train_pred_radial != train$Purchase)

test_pred_radial <- predict(best_radial, newdata = test)
test_error_radial <- mean(test_pred_radial != test$Purchase)

cat("Test Error with Best Radial SVM:", round(test_error_radial, 4), "\n")

## Test Error with Best Radial SVM: 0.1852

cat("Train Error with Best Radial SVM:", round(train_error_radial, 4), "\n")

## Train Error with Best Radial SVM: 0.1512

set.seed(1)
tune_poly <- tune(svm, Purchase ~ ., data = train, kernel = "polynomial",
                  ranges = list(cost = c(0.01, 0.1, 1, 5, 10)),
                  degree = 2)

summary(tune_poly)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##    10
## 
## - best performance: 0.18125 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.39125 0.04210189
## 2  0.10 0.32125 0.05001736
## 3  1.00 0.20250 0.04116363
## 4  5.00 0.18250 0.03496029
## 5 10.00 0.18125 0.02779513

best_poly <- tune_poly$best.model

train_pred_poly <- predict(best_poly, newdata = train)
train_error_poly <- mean(train_pred_poly != train$Purchase)

test_pred_poly <- predict(best_poly, newdata = test)
test_error_poly <- mean(test_pred_poly != test$Purchase)

cat("Test Error with Best Polynomial SVM (degree=2):", round(test_error_poly, 4), "\n")

## Test Error with Best Polynomial SVM (degree=2): 0.1889

cat("Train Error with Best Polynomial SVM (degree=2):", round(train_error_poly, 4), "\n")

## Train Error with Best Polynomial SVM (degree=2): 0.15

h. The linear SVM with C = 0.1 performed the best. It had the lowest test error rate at 16.3%, meaning it made the fewest mistakes predicting new outcomes.

Assignment8

Mandi Stanley

2025-07-24

Chapter 9 Questions