5
library(e1071)
set.seed(1)
# (a)
n <- 500
X1 <- rnorm(n)
X2 <- rnorm(n)
Y <- ifelse(X1^2 + X2^2 > 1, 1, 0)
data <- data.frame(X1 = X1, X2 = X2, Y = as.factor(Y))
# (b)
plot(data$X1, data$X2, col = ifelse(data$Y == 1, "red", "blue"),
pch = 19, xlab = "X1", ylab = "X2", main = "True Class Labels")

# (c)
logit_model <- glm(Y ~ X1 + X2, data = data, family = binomial)
# (d)
pred_prob <- predict(logit_model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
plot(data$X1, data$X2, col = ifelse(pred_class == 1, "red", "blue"),
pch = 19, xlab = "X1", ylab = "X2", main = "Logistic Regression (Linear)")

# (e)
logit_nonlinear <- glm(Y ~ X1 + X2 + I(X1^2) + I(X2^2) + I(X1*X2),
data = data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# (f)
pred_prob_nl <- predict(logit_nonlinear, type = "response")
pred_class_nl <- ifelse(pred_prob_nl > 0.5, 1, 0)
plot(data$X1, data$X2, col = ifelse(pred_class_nl == 1, "red", "blue"),
pch = 19, xlab = "X1", ylab = "X2", main = "Logistic Regression (Non-linear)")

# (g)
svm_linear <- svm(Y ~ X1 + X2, data = data, kernel = "linear", cost = 1)
pred_svm_linear <- predict(svm_linear, data)
plot(data$X1, data$X2, col = ifelse(pred_svm_linear == 1, "red", "blue"),
pch = 19, xlab = "X1", ylab = "X2", main = "SVM with Linear Kernel")

# (h)
svm_radial <- svm(Y ~ X1 + X2, data = data, kernel = "radial", cost = 1, gamma = 1)
pred_svm_radial <- predict(svm_radial, data)
plot(data$X1, data$X2, col = ifelse(pred_svm_radial == 1, "red", "blue"),
pch = 19, xlab = "X1", ylab = "X2", main = "SVM with Radial Kernel")

# (i) The linear logistic regression fails to capture the quadratic boundary.Adding non-linear transformations allows logistic regression to approximate the true boundary.
7
library(ISLR2)
library(e1071)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
set.seed(1)
Auto <- na.omit(Auto)
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- factor(Auto$mpg01, levels = c(0, 1))
Auto_features <- subset(Auto, select = -c(mpg, name))
numeric_features <- sapply(Auto_features, is.numeric)
Auto_numeric <- Auto_features[, numeric_features]
Auto_numeric$mpg01 <- Auto$mpg01
rf_model <- randomForest(mpg01 ~ ., data = Auto_numeric, importance = TRUE)
importance_df <- importance(rf_model)
top_features <- rownames(importance_df)[order(importance_df[, 1], decreasing = TRUE)][1:2]
cat("Top 2 numeric features selected:", paste(top_features, collapse = " & "), "\n")
## Top 2 numeric features selected: year & displacement
Auto_top2 <- Auto_numeric[, c(top_features, "mpg01")]
tune_linear <- tune(svm, mpg01 ~ ., data = Auto_top2, kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
best_linear <- tune_linear$best.model
tune_radial <- tune(svm, mpg01 ~ ., data = Auto_top2, kernel = "radial",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10),
gamma = c(0.01, 0.1, 1)))
best_radial <- tune_radial$best.model
tune_poly <- tune(svm, mpg01 ~ ., data = Auto_top2, kernel = "polynomial", degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
best_poly <- tune_poly$best.model
train_err_linear <- mean(predict(best_linear, Auto_top2) != Auto_top2$mpg01)
train_err_radial <- mean(predict(best_radial, Auto_top2) != Auto_top2$mpg01)
train_err_poly <- mean(predict(best_poly, Auto_top2) != Auto_top2$mpg01)
summary_table <- data.frame(
Kernel = c("Linear", "Radial", "Polynomial"),
Best_C = c(tune_linear$best.parameters$cost,
tune_radial$best.parameters$cost,
tune_poly$best.parameters$cost),
Best_Gamma = c(NA,
tune_radial$best.parameters$gamma,
NA),
Training_Error = c(train_err_linear, train_err_radial, train_err_poly),
CV_Error = c(min(tune_linear$performances$error),
min(tune_radial$performances$error),
min(tune_poly$performances$error))
)
print(summary_table)
## Kernel Best_C Best_Gamma Training_Error CV_Error
## 1 Linear 1 NA 0.09438776 0.09423077
## 2 Radial 10 1 0.08163265 0.08647436
## 3 Polynomial 10 NA 0.37244898 0.40275641
plot_svm_decision <- function(model, data, features, title) {
x_range <- seq(min(data[[features[1]]]), max(data[[features[1]]]), length = 200)
y_range <- seq(min(data[[features[2]]]), max(data[[features[2]]]), length = 200)
grid <- expand.grid(X1 = x_range, X2 = y_range)
names(grid) <- features
grid$pred <- predict(model, grid)
ggplot(data, aes_string(x = features[1], y = features[2], color = "mpg01")) +
geom_point(size = 2, alpha = 0.7) +
geom_contour(data = grid, aes_string(z = "as.numeric(pred)"), breaks = 1.5, color = "black") +
labs(title = title, color = "Class") +
theme_minimal()
}
p1 <- plot_svm_decision(best_linear, Auto_top2, top_features, "SVM (Linear)")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2 <- plot_svm_decision(best_radial, Auto_top2, top_features, "SVM (Radial)")
p3 <- plot_svm_decision(best_poly, Auto_top2, top_features, "SVM (Polynomial)")
print(p1)

print(p2)

print(p3)

cv_errors <- c(min(tune_linear$performances$error),
min(tune_radial$performances$error),
min(tune_poly$performances$error))
barplot(cv_errors, names.arg = c("Linear", "Radial", "Polynomial"),
col = "skyblue", main = "CV Error Comparison",
ylab = "CV Error Rate", ylim = c(0, max(cv_errors) + 0.05))

8
library(ISLR2)
library(e1071)
set.seed(1)
# (a)
data(OJ)
train_idx <- sample(1:nrow(OJ), 800)
train <- OJ[train_idx, ]
test <- OJ[-train_idx, ]
# (b)
svc_linear <- svm(Purchase ~ ., data = train, kernel = "linear", cost = 0.01, scale = TRUE)
svc_linear$tot.nSV
## [1] 435
# (c)
train_pred <- predict(svc_linear, train)
train_error <- mean(train_pred != train$Purchase)
test_pred <- predict(svc_linear, test)
test_error <- mean(test_pred != test$Purchase)
train_error; test_error
## [1] 0.175
## [1] 0.1777778
# (d)
set.seed(2)
tune_linear <- tune(svm, Purchase ~ ., data = train, kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.1675
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17625 0.04059026
## 2 0.10 0.17125 0.04168749
## 3 1.00 0.17000 0.04090979
## 4 5.00 0.16750 0.03545341
## 5 10.00 0.17000 0.03736085
best_C_linear <- tune_linear$best.parameters$cost
# (e)
svc_best_linear <- svm(Purchase ~ ., data = train, kernel = "linear", cost = best_C_linear)
train_error_best <- mean(predict(svc_best_linear, train) != train$Purchase)
test_error_best <- mean(predict(svc_best_linear, test) != test$Purchase)
train_error_best; test_error_best
## [1] 0.16625
## [1] 0.1555556
# (f)
svc_radial <- svm(Purchase ~ ., data = train, kernel = "radial", cost = 0.01)
svc_radial$tot.nSV
## [1] 634
train_error_radial <- mean(predict(svc_radial, train) != train$Purchase)
test_error_radial <- mean(predict(svc_radial, test) != test$Purchase)
train_error_radial; test_error_radial
## [1] 0.39375
## [1] 0.3777778
set.seed(2)
tune_radial <- tune(svm, Purchase ~ ., data = train, kernel = "radial",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.1725
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39375 0.03240906
## 2 0.10 0.19500 0.03782269
## 3 1.00 0.17250 0.03162278
## 4 5.00 0.18375 0.03682259
## 5 10.00 0.18750 0.03173239
best_C_radial <- tune_radial$best.parameters$cost
svc_best_radial <- svm(Purchase ~ ., data = train, kernel = "radial", cost = best_C_radial)
train_error_best_radial <- mean(predict(svc_best_radial, train) != train$Purchase)
test_error_best_radial <- mean(predict(svc_best_radial, test) != test$Purchase)
train_error_best_radial; test_error_best_radial
## [1] 0.15125
## [1] 0.1851852
# (g)
svc_poly <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = 0.01)
svc_poly$tot.nSV
## [1] 636
train_error_poly <- mean(predict(svc_poly, train) != train$Purchase)
test_error_poly <- mean(predict(svc_poly, test) != test$Purchase)
train_error_poly; test_error_poly
## [1] 0.3725
## [1] 0.3666667
set.seed(2)
tune_poly <- tune(svm, Purchase ~ ., data = train, kernel = "polynomial", degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.18
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.03670453
## 2 0.10 0.32125 0.03866254
## 3 1.00 0.19625 0.03910900
## 4 5.00 0.18000 0.03872983
## 5 10.00 0.18125 0.03830162
best_C_poly <- tune_poly$best.parameters$cost
svc_best_poly <- svm(Purchase ~ ., data = train, kernel = "polynomial", degree = 2, cost = best_C_poly)
train_error_best_poly <- mean(predict(svc_best_poly, train) != train$Purchase)
test_error_best_poly <- mean(predict(svc_best_poly, test) != test$Purchase)
train_error_best_poly; test_error_best_poly
## [1] 0.155
## [1] 0.1814815
# (h)
results <- data.frame(
Kernel = c("Linear", "Radial", "Polynomial"),
Best_C = c(best_C_linear, best_C_radial, best_C_poly),
Train_Error = c(train_error_best, train_error_best_radial, train_error_best_poly),
Test_Error = c(test_error_best, test_error_best_radial, test_error_best_poly)
)
print(results)
## Kernel Best_C Train_Error Test_Error
## 1 Linear 5 0.16625 0.1555556
## 2 Radial 1 0.15125 0.1851852
## 3 Polynomial 5 0.15500 0.1814815