Assignment#8

5

library(e1071)
set.seed(1)

# (a) 
n <- 500
X1 <- rnorm(n)
X2 <- rnorm(n)
Y <- ifelse(X1^2 + X2^2 > 1, 1, 0)  
data <- data.frame(X1 = X1, X2 = X2, Y = as.factor(Y))

# (b) 
plot(data$X1, data$X2, col = ifelse(data$Y == 1, "red", "blue"),
     pch = 19, xlab = "X1", ylab = "X2", main = "True Class Labels")

# (c) 
logit_model <- glm(Y ~ X1 + X2, data = data, family = binomial)

# (d) 
pred_prob <- predict(logit_model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
plot(data$X1, data$X2, col = ifelse(pred_class == 1, "red", "blue"),
     pch = 19, xlab = "X1", ylab = "X2", main = "Logistic Regression (Linear)")

# (e) 
logit_nonlinear <- glm(Y ~ X1 + X2 + I(X1^2) + I(X2^2) + I(X1*X2),
                       data = data, family = binomial)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# (f) 
pred_prob_nl <- predict(logit_nonlinear, type = "response")
pred_class_nl <- ifelse(pred_prob_nl > 0.5, 1, 0)
plot(data$X1, data$X2, col = ifelse(pred_class_nl == 1, "red", "blue"),
     pch = 19, xlab = "X1", ylab = "X2", main = "Logistic Regression (Non-linear)")

# (g) 
svm_linear <- svm(Y ~ X1 + X2, data = data, kernel = "linear", cost = 1)
pred_svm_linear <- predict(svm_linear, data)
plot(data$X1, data$X2, col = ifelse(pred_svm_linear == 1, "red", "blue"),
     pch = 19, xlab = "X1", ylab = "X2", main = "SVM with Linear Kernel")

# (h) 
svm_radial <- svm(Y ~ X1 + X2, data = data, kernel = "radial", cost = 1, gamma = 1)
pred_svm_radial <- predict(svm_radial, data)
plot(data$X1, data$X2, col = ifelse(pred_svm_radial == 1, "red", "blue"),
     pch = 19, xlab = "X1", ylab = "X2", main = "SVM with Radial Kernel")

# (i) The linear logistic regression fails to capture the quadratic boundary.Adding non-linear transformations allows logistic regression to approximate the true boundary.

7

library(ISLR2)
library(e1071)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

set.seed(1)

Auto <- na.omit(Auto)
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- factor(Auto$mpg01, levels = c(0, 1))

Auto_features <- subset(Auto, select = -c(mpg, name))
numeric_features <- sapply(Auto_features, is.numeric)
Auto_numeric <- Auto_features[, numeric_features]
Auto_numeric$mpg01 <- Auto$mpg01

rf_model <- randomForest(mpg01 ~ ., data = Auto_numeric, importance = TRUE)
importance_df <- importance(rf_model)
top_features <- rownames(importance_df)[order(importance_df[, 1], decreasing = TRUE)][1:2]
cat("Top 2 numeric features selected:", paste(top_features, collapse = " & "), "\n")

## Top 2 numeric features selected: year & displacement

Auto_top2 <- Auto_numeric[, c(top_features, "mpg01")]

tune_linear <- tune(svm, mpg01 ~ ., data = Auto_top2, kernel = "linear",
                    ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
best_linear <- tune_linear$best.model

tune_radial <- tune(svm, mpg01 ~ ., data = Auto_top2, kernel = "radial",
                    ranges = list(cost = c(0.01, 0.1, 1, 5, 10),
                                  gamma = c(0.01, 0.1, 1)))
best_radial <- tune_radial$best.model

tune_poly <- tune(svm, mpg01 ~ ., data = Auto_top2, kernel = "polynomial", degree = 2,
                  ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
best_poly <- tune_poly$best.model

train_err_linear <- mean(predict(best_linear, Auto_top2) != Auto_top2$mpg01)
train_err_radial <- mean(predict(best_radial, Auto_top2) != Auto_top2$mpg01)
train_err_poly   <- mean(predict(best_poly, Auto_top2)   != Auto_top2$mpg01)

summary_table <- data.frame(
  Kernel = c("Linear", "Radial", "Polynomial"),
  Best_C = c(tune_linear$best.parameters$cost,
             tune_radial$best.parameters$cost,
             tune_poly$best.parameters$cost),
  Best_Gamma = c(NA,
                 tune_radial$best.parameters$gamma,
                 NA),
  Training_Error = c(train_err_linear, train_err_radial, train_err_poly),
  CV_Error = c(min(tune_linear$performances$error),
               min(tune_radial$performances$error),
               min(tune_poly$performances$error))
)
print(summary_table)

##       Kernel Best_C Best_Gamma Training_Error   CV_Error
## 1     Linear      1         NA     0.09438776 0.09423077
## 2     Radial     10          1     0.08163265 0.08647436
## 3 Polynomial     10         NA     0.37244898 0.40275641

plot_svm_decision <- function(model, data, features, title) {
  x_range <- seq(min(data[[features[1]]]), max(data[[features[1]]]), length = 200)
  y_range <- seq(min(data[[features[2]]]), max(data[[features[2]]]), length = 200)
  grid <- expand.grid(X1 = x_range, X2 = y_range)
  names(grid) <- features
  grid$pred <- predict(model, grid)
  
  ggplot(data, aes_string(x = features[1], y = features[2], color = "mpg01")) +
    geom_point(size = 2, alpha = 0.7) +
    geom_contour(data = grid, aes_string(z = "as.numeric(pred)"), breaks = 1.5, color = "black") +
    labs(title = title, color = "Class") +
    theme_minimal()
}

p1 <- plot_svm_decision(best_linear, Auto_top2, top_features, "SVM (Linear)")

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

p2 <- plot_svm_decision(best_radial, Auto_top2, top_features, "SVM (Radial)")
p3 <- plot_svm_decision(best_poly, Auto_top2, top_features, "SVM (Polynomial)")

print(p1)

print(p2)

print(p3)

cv_errors <- c(min(tune_linear$performances$error),
               min(tune_radial$performances$error),
               min(tune_poly$performances$error))

barplot(cv_errors, names.arg = c("Linear", "Radial", "Polynomial"),
        col = "skyblue", main = "CV Error Comparison",
        ylab = "CV Error Rate", ylim = c(0, max(cv_errors) + 0.05))

8

library(ISLR2)
library(e1071)
set.seed(1)

# (a)
data(OJ)
train_idx <- sample(1:nrow(OJ), 800)
train <- OJ[train_idx, ]
test <- OJ[-train_idx, ]

# (b) 
svc_linear <- svm(Purchase ~ ., data = train, kernel = "linear", cost = 0.01, scale = TRUE)
svc_linear$tot.nSV

## [1] 435

# (c) 
train_pred <- predict(svc_linear, train)
train_error <- mean(train_pred != train$Purchase)
test_pred <- predict(svc_linear, test)
test_error <- mean(test_pred != test$Purchase)
train_error; test_error

## [1] 0.175

## [1] 0.1777778

# (d) 
set.seed(2)
tune_linear <- tune(svm, Purchase ~ ., data = train, kernel = "linear",
                    ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_linear)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     5
## 
## - best performance: 0.1675 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.17625 0.04059026
## 2  0.10 0.17125 0.04168749
## 3  1.00 0.17000 0.04090979
## 4  5.00 0.16750 0.03545341
## 5 10.00 0.17000 0.03736085

best_C_linear <- tune_linear$best.parameters$cost

# (e) 
svc_best_linear <- svm(Purchase ~ ., data = train, kernel = "linear", cost = best_C_linear)
train_error_best <- mean(predict(svc_best_linear, train) != train$Purchase)
test_error_best <- mean(predict(svc_best_linear, test) != test$Purchase)
train_error_best; test_error_best

## [1] 0.16625

## [1] 0.1555556

# (f) 
svc_radial <- svm(Purchase ~ ., data = train, kernel = "radial", cost = 0.01)
svc_radial$tot.nSV

## [1] 634

train_error_radial <- mean(predict(svc_radial, train) != train$Purchase)
test_error_radial <- mean(predict(svc_radial, test) != test$Purchase)
train_error_radial; test_error_radial

## [1] 0.39375

## [1] 0.3777778

set.seed(2)
tune_radial <- tune(svm, Purchase ~ ., data = train, kernel = "radial",
                    ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_radial)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.1725 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.39375 0.03240906
## 2  0.10 0.19500 0.03782269
## 3  1.00 0.17250 0.03162278
## 4  5.00 0.18375 0.03682259
## 5 10.00 0.18750 0.03173239

best_C_radial <- tune_radial$best.parameters$cost

svc_best_radial <- svm(Purchase ~ ., data = train, kernel = "radial", cost = best_C_radial)
train_error_best_radial <- mean(predict(svc_best_radial, train) != train$Purchase)
test_error_best_radial <- mean(predict(svc_best_radial, test) != test$Purchase)
train_error_best_radial; test_error_best_radial

## [1] 0.15125

## [1] 0.1851852

# (g) 
svc_poly <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = 0.01)
svc_poly$tot.nSV

## [1] 636

train_error_poly <- mean(predict(svc_poly, train) != train$Purchase)
test_error_poly <- mean(predict(svc_poly, test) != test$Purchase)
train_error_poly; test_error_poly

## [1] 0.3725

## [1] 0.3666667

set.seed(2)
tune_poly <- tune(svm, Purchase ~ ., data = train, kernel = "polynomial", degree = 2,
                  ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_poly)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     5
## 
## - best performance: 0.18 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.39000 0.03670453
## 2  0.10 0.32125 0.03866254
## 3  1.00 0.19625 0.03910900
## 4  5.00 0.18000 0.03872983
## 5 10.00 0.18125 0.03830162

best_C_poly <- tune_poly$best.parameters$cost

svc_best_poly <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = best_C_poly)
train_error_best_poly <- mean(predict(svc_best_poly, train) != train$Purchase)
test_error_best_poly <- mean(predict(svc_best_poly, test) != test$Purchase)
train_error_best_poly; test_error_best_poly

## [1] 0.155

## [1] 0.1814815

# (h) 
results <- data.frame(
  Kernel = c("Linear", "Radial", "Polynomial"),
  Best_C = c(best_C_linear, best_C_radial, best_C_poly),
  Train_Error = c(train_error_best, train_error_best_radial, train_error_best_poly),
  Test_Error = c(test_error_best, test_error_best_radial, test_error_best_poly)
)
print(results)

##       Kernel Best_C Train_Error Test_Error
## 1     Linear      5     0.16625  0.1555556
## 2     Radial      1     0.15125  0.1851852
## 3 Polynomial      5     0.15500  0.1814815

Assignment#8

2025-07-20

5

7

8