library(ISLR)    # Auto and OJ data sets
library(ggplot2) # plotting
library(dplyr)   # %>%, mutate(), select()
library(e1071)   # svm() and tune()

set.seed(1)
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
epsilon <- rnorm(500, mean = 0, sd = 0.05) # Small noise
y <- as.factor(1 * (x1^2 - x2^2 + epsilon > 0))
dat <- data.frame(x1 = x1, x2 = x2, y = y)
ggplot(dat, aes(x = x1, y = x2, color = y)) +
  geom_point() +
  labs(title = "Simulated Data with Quadratic Decision Boundary")
glm_lin <- glm(y ~ x1 + x2, data = dat, family = "binomial")
dat$pred_lin <- as.factor(ifelse(predict(glm_lin, type = "response") > 0.5, 1, 0))
ggplot(dat, aes(x = x1, y = x2, color = pred_lin)) +
  geom_point() +
  labs(title = "Logistic Regression with Linear Predictors")
Answer: Logistic regression with only linear terms produces a linear decision boundary, which clearly fails to capture the true quadratic separation.
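To put a number on this, the training misclassification rate of the linear fit can be checked directly (a quick added check, not part of the original exercise):
# Training error rate of the linear-predictor logistic fit
mean(as.character(dat$pred_lin) != as.character(dat$y))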
glm_nonlin <- glm(
  y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2),
  data = dat,
  family = "binomial"
)
dat$pred_nonlin <- as.factor(ifelse(predict(glm_nonlin, type = "response") > 0.5, 1, 0))
ggplot(dat, aes(x = x1, y = x2, color = pred_nonlin)) +
  geom_point() +
  labs(title = "Logistic Regression with Non-Linear Predictors")
svm_lin <- svm(y ~ x1 + x2, data = dat, kernel = "linear", cost = 1)
dat$svm_lin_pred <- predict(svm_lin)
ggplot(dat, aes(x = x1, y = x2, color = svm_lin_pred)) +
  geom_point() +
  labs(title = "SVM with Linear Kernel")
svm_rad <- svm(y ~ x1 + x2, data = dat, kernel = "radial", cost = 1)
dat$svm_rad_pred <- predict(svm_rad)
ggplot(dat, aes(x = x1, y = x2, color = svm_rad_pred)) +
  geom_point() +
  labs(title = "SVM with Radial Kernel")
Answer: Logistic regression with quadratic terms and the SVM with a radial kernel both recover the nonlinear boundary well; the purely linear models fail because the true boundary is quadratic.
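As a rough added comparison, the training error rates of all four fits can be tabulated in one pass (a sketch using the prediction columns created above):
# Training error rates: linear vs. nonlinear logistic, linear vs. radial SVM
sapply(list(
  logistic_linear    = dat$pred_lin,
  logistic_nonlinear = dat$pred_nonlin,
  svm_linear         = dat$svm_lin_pred,
  svm_radial         = dat$svm_rad_pred
), function(p) mean(as.character(p) != as.character(dat$y)))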
# Start clean and confirm mpg is numeric
data(Auto) # Reload fresh copy
Auto <- na.omit(Auto) # Remove missing values
# Check type of mpg just in case
str(Auto$mpg) # Should return "num"
## num [1:392] 18 15 18 16 17 15 14 14 14 15 ...
# Create mpg01 (1 if mpg is above its median, 0 otherwise) as a factor so svm()
# performs classification, then drop mpg and name
Auto <- Auto %>%
  mutate(mpg01 = factor(ifelse(mpg > median(mpg), 1, 0))) %>%
  select(-mpg, -name)
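As a quick added sanity check, splitting at the median should leave the two classes roughly balanced:
# Class counts for mpg01 (should be close to a 50/50 split)
table(Auto$mpg01)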
set.seed(1)
svm_tune_linear <- tune(
  svm,
  mpg01 ~ .,   # predict mpg01 from all remaining variables
  data = Auto, # use the cleaned data set
  kernel = "linear",
  ranges = list(cost = c(0.01, 0.1, 1, 10, 100))
)
summary(svm_tune_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.1053223
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.1053223 0.03162078
## 2 1e-01 0.1083165 0.03461157
## 3 1e+00 0.1100350 0.03552713
## 4 1e+01 0.1101804 0.03557982
## 5 1e+02 0.1101708 0.03556061
Answer: Cost = 0.01 gives the lowest cross-validation error (about 0.105), and larger cost values do slightly worse, so the linear SVM performs moderately well on this problem.
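The winning settings can also be pulled out of the tuning object directly rather than read off the printed summary (best.parameters, best.performance, and best.model are standard components of e1071 tune objects):
# Best cost and its cross-validation error
svm_tune_linear$best.parameters
svm_tune_linear$best.performance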
svm_tune_radial <- tune(svm, mpg01 ~ ., data = Auto, kernel = "radial",
                        ranges = list(cost = c(0.1, 1, 10), gamma = c(0.5, 1, 2)))
svm_tune_poly <- tune(svm, mpg01 ~ ., data = Auto, kernel = "polynomial",
                      ranges = list(cost = c(0.1, 1, 10), degree = c(2, 3)))
summary(svm_tune_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 0.5
##
## - best performance: 0.05955388
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.5 0.06970597 0.03349539
## 2 1.0 0.5 0.05955388 0.03307096
## 3 10.0 0.5 0.06724373 0.03548038
## 4 0.1 1.0 0.08114160 0.02662778
## 5 1.0 1.0 0.06031574 0.02879835
## 6 10.0 1.0 0.06986628 0.02813494
## 7 0.1 2.0 0.12026254 0.01759096
## 8 1.0 2.0 0.07073989 0.02240443
## 9 10.0 2.0 0.08673891 0.02796514
summary(svm_tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 3
##
## - best performance: 0.1113738
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.2269920 0.03627644
## 2 1.0 2 0.1651022 0.02910030
## 3 10.0 2 0.1592405 0.04433953
## 4 0.1 3 0.1376712 0.03070863
## 5 1.0 3 0.1228948 0.03188548
## 6 10.0 3 0.1113738 0.02617738
Answer: The radial kernel (cost = 1, gamma = 0.5) achieves the lowest cross-validation error (about 0.060), clearly better than the best linear (about 0.105) and polynomial (about 0.111) fits; radial kernels often outperform the other two when the class boundary is not linear.
plot(svm_tune_radial$best.model, Auto, horsepower ~ weight)
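plot.svm() shows only a two-dimensional slice of the fitted model (unspecified numeric predictors are held at their default slice values), so other predictor pairs can be inspected the same way; the pairs below are just illustrative choices from the Auto data:
# Additional two-variable slices of the tuned radial SVM
plot(svm_tune_radial$best.model, Auto, displacement ~ weight)
plot(svm_tune_radial$best.model, Auto, acceleration ~ horsepower)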
set.seed(1)
train_ind <- sample(1:nrow(OJ), 800)
OJ_train <- OJ[train_ind, ]
OJ_test <- OJ[-train_ind, ]
svm_oj_linear <- svm(Purchase ~ ., data = OJ_train, kernel = "linear", cost = 0.01)
summary(svm_oj_linear)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ_train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 435
##
## ( 219 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train_pred <- predict(svm_oj_linear, OJ_train)
test_pred <- predict(svm_oj_linear, OJ_test)
train_error <- mean(train_pred != OJ_train$Purchase)
test_error <- mean(test_pred != OJ_test$Purchase)
train_error
## [1] 0.175
test_error
## [1] 0.1777778
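For a closer look at where the errors fall, the confusion matrices for both sets can be added (a small supplementary check):
# Confusion matrices: rows are predictions, columns are actual purchases
table(predicted = train_pred, actual = OJ_train$Purchase)
table(predicted = test_pred, actual = OJ_test$Purchase)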
svm_tune_oj_linear <- tune(svm, Purchase ~ ., data = OJ_train, kernel = "linear",
                           ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(svm_tune_oj_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.17125
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17375 0.03884174
## 2 0.10 0.17875 0.03064696
## 3 1.00 0.17500 0.03061862
## 4 10.00 0.17125 0.03488573
best_svm_oj_linear <- svm_tune_oj_linear$best.model
mean(predict(best_svm_oj_linear, OJ_test) != OJ_test$Purchase)
## [1] 0.1481481
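For completeness, the corresponding training error of the tuned linear model can be computed the same way (an added line, not in the original solution):
# Training error rate of the tuned linear SVM
mean(predict(best_svm_oj_linear, OJ_train) != OJ_train$Purchase)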
svm_tune_oj_radial <- tune(svm, Purchase ~ ., data = OJ_train, kernel = "radial",
                           ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(svm_tune_oj_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.17625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39375 0.06568284
## 2 0.10 0.18250 0.05470883
## 3 1.00 0.17625 0.03793727
## 4 10.00 0.18125 0.04340139
svm_tune_oj_poly <- tune(svm, Purchase ~ ., data = OJ_train, kernel = "polynomial",
                         ranges = list(cost = c(0.01, 0.1, 1, 10), degree = 2))
summary(svm_tune_oj_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.18625
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.01 2 0.39000 0.08287373
## 2 0.10 2 0.32375 0.06730166
## 3 1.00 2 0.20000 0.05137012
## 4 10.00 2 0.18625 0.05185785
Answer: Based on cross-validation, the tuned linear SVM (cost = 10, CV error about 0.171) narrowly beats the radial (about 0.176) and polynomial (about 0.186) kernels on the OJ data, and its test error is about 0.148. Radial kernels tend to give the best test performance when the decision boundary is clearly nonlinear, but here a roughly linear boundary appears adequate; the sketch below scores the tuned radial and polynomial models on the test set for a direct comparison.
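To make that comparison concrete, the tuned radial and polynomial models can be evaluated on the same test set (an added sketch; exact numbers depend on the cross-validation folds):
# Test error rates of the tuned radial and polynomial SVMs
mean(predict(svm_tune_oj_radial$best.model, OJ_test) != OJ_test$Purchase)
mean(predict(svm_tune_oj_poly$best.model, OJ_test) != OJ_test$Purchase)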