set.seed(42)
n <- 500
x1 <- runif(n) - 0.5
x2 <- runif(n) - 0.5
y <- as.factor(1 * (x1^2 - x2^2 > 0))
data <- data.frame(x1, x2, y)
library(ggplot2)
ggplot(data, aes(x = x1, y = x2, color = y)) +
geom_point(alpha = 0.6) +
labs(title = "True Classes", color = "Class") +
theme_minimal()
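Because class 1 corresponds to x1^2 - x2^2 > 0, the true boundary is the pair of diagonal lines x2 = x1 and x2 = -x1. As a quick visual check (a minimal sketch reusing the plot above), we can overlay those lines:
# Overlay the true boundary lines x2 = x1 and x2 = -x1
ggplot(data, aes(x = x1, y = x2, color = y)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_abline(slope = -1, intercept = 0, linetype = "dashed") +
  labs(title = "True Classes with True Boundary", color = "Class") +
  theme_minimal()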
glm_lin <- glm(y ~ x1 + x2, data = data, family = binomial)
summary(glm_lin)
##
## Call:
## glm(formula = y ~ x1 + x2, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.09978 0.08976 1.112 0.266
## x1 -0.17659 0.30658 -0.576 0.565
## x2 -0.20067 0.30978 -0.648 0.517
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 691.79 on 499 degrees of freedom
## Residual deviance: 691.08 on 497 degrees of freedom
## AIC: 697.08
##
## Number of Fisher Scoring iterations: 3
data$pred_lin <- predict(glm_lin, type = "response") > 0.5
data$pred_lin <- factor(data$pred_lin, levels = c(FALSE, TRUE), labels = c("0","1"))
ggplot(data, aes(x = x1, y = x2, color = pred_lin)) +
geom_point(alpha = 0.6) +
labs(title = "Linear Logistic Regression Prediction",
color = "Predicted\nClass") +
theme_minimal()
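For a logistic fit, the predicted boundary is the line where the linear predictor equals zero, i.e. x2 = -(b0 + b1*x1)/b2. A sketch overlaying that line on the plot above (using coef(glm_lin)):
# Draw the fitted decision boundary b0 + b1*x1 + b2*x2 = 0
b <- coef(glm_lin)
ggplot(data, aes(x = x1, y = x2, color = pred_lin)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = -b[1] / b[3], slope = -b[2] / b[3],
              linetype = "dashed") +
  labs(title = "Linear Logistic Decision Boundary",
       color = "Predicted\nClass") +
  theme_minimal()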
glm_nl <- glm(y ~ poly(x1, 2, raw = TRUE) + poly(x2, 2, raw = TRUE) +
I(x1 * x2),
data = data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
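These warnings are informative rather than incidental: with the quadratic terms included, the classes become (almost) perfectly separable, so the maximum-likelihood coefficients diverge and the fitted probabilities are pushed to 0 or 1. That is why the estimates and standard errors in the summary below are enormous while no term is significant. One quick way to see it (a sketch reusing glm_nl):
# Under (near-)separation the fitted probabilities pile up at 0 and 1
summary(predict(glm_nl, type = "response"))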
summary(glm_nl)
##
## Call:
## glm(formula = y ~ poly(x1, 2, raw = TRUE) + poly(x2, 2, raw = TRUE) +
## I(x1 * x2), family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.41 138.85 0.075 0.940
## poly(x1, 2, raw = TRUE)1 183.96 3263.04 0.056 0.955
## poly(x1, 2, raw = TRUE)2 88648.36 844122.80 0.105 0.916
## poly(x2, 2, raw = TRUE)1 -331.24 4128.28 -0.080 0.936
## poly(x2, 2, raw = TRUE)2 -86338.24 820611.62 -0.105 0.916
## I(x1 * x2) -1396.21 15698.89 -0.089 0.929
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9179e+02 on 499 degrees of freedom
## Residual deviance: 8.2483e-05 on 494 degrees of freedom
## AIC: 12
##
## Number of Fisher Scoring iterations: 25
data$pred_nl <- predict(glm_nl, type = "response") > 0.5
data$pred_nl <- factor(data$pred_nl, levels = c(FALSE, TRUE), labels = c("0","1"))
ggplot(data, aes(x = x1, y = x2, color = pred_nl)) +
geom_point(alpha = 0.6) +
labs(title = "Non-linear Logistic Regression Prediction",
color = "Predicted\nClass") +
theme_minimal()
library(e1071)
svm_lin <- svm(y ~ x1 + x2, data = data, kernel = "linear", cost = 1, scale = FALSE)
data$pred_svm_lin <- predict(svm_lin, data)
ggplot(data, aes(x = x1, y = x2, color = pred_svm_lin)) +
geom_point(alpha = 0.6) +
labs(title = "Linear SVM Prediction",
color = "Predicted\nClass") +
theme_minimal()
svm_rbf <- svm(y ~ x1 + x2, data = data, kernel = "radial", gamma = 1, cost = 1, scale = FALSE)
data$pred_svm_rbf <- predict(svm_rbf, data)
ggplot(data, aes(x = x1, y = x2, color = pred_svm_rbf)) +
geom_point(alpha = 0.6) +
labs(title = "Non-linear (RBF) SVM Prediction",
color = "Predicted\nClass") +
theme_minimal()
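To visualize the decision regions themselves rather than just the predictions at the training points, one option (a minimal sketch using only objects defined above) is to classify a fine grid of points:
# Predict over a grid to reveal the RBF decision regions
grid <- expand.grid(x1 = seq(-0.5, 0.5, length.out = 200),
                    x2 = seq(-0.5, 0.5, length.out = 200))
grid$pred <- predict(svm_rbf, grid)
ggplot(grid, aes(x = x1, y = x2)) +
  geom_tile(aes(fill = pred), alpha = 0.3) +
  geom_point(data = data, aes(color = y), alpha = 0.6) +
  labs(title = "RBF SVM Decision Regions",
       fill = "Region", color = "Class") +
  theme_minimal()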
From these results we can see that the two linear methods (linear logistic regression and the linear SVM) can only produce a single straight decision boundary, which fails here because the true boundary is the pair of diagonal lines x2 = x1 and x2 = -x1. In contrast, the quadratic feature expansion and the RBF kernel both produce highly flexible decision boundaries that match the data's shape.
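To put numbers behind the visual impression, a quick sketch comparing the training misclassification rates of the four fits (using the prediction columns stored above):
# Training error rates; the flexible fits should be far lower
c(logit_linear    = mean(data$pred_lin != data$y),
  logit_quadratic = mean(data$pred_nl != data$y),
  svm_linear      = mean(data$pred_svm_lin != data$y),
  svm_rbf         = mean(data$pred_svm_rbf != data$y))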
library(ISLR)
library(e1071)
data(Auto)
Auto$mpg01 <- factor(ifelse(Auto$mpg > median(Auto$mpg), "High", "Low"))
set.seed(42)
tune_lin <- tune(svm,
mpg01 ~ . - mpg,
data = Auto,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10, 100)),
tunecontrol = tune.control(cross = 5))
summary(tune_lin)
##
## Parameter tuning of 'svm':
##
## - sampling method: 5-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.08932165
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.08935411 0.03618129
## 2 1e-01 0.08932165 0.04691167
## 3 1e+00 0.09948069 0.02287240
## 4 1e+01 0.11213892 0.02859924
## 5 1e+02 0.13008763 0.02081952
cv_results_lin <- data.frame(
cost = tune_lin$performances$cost,
error = tune_lin$performances$error
)
print(cv_results_lin)
## cost error
## 1 1e-02 0.08935411
## 2 1e-01 0.08932165
## 3 1e+00 0.09948069
## 4 1e+01 0.11213892
## 5 1e+02 0.13008763
Cross-validation shows that the smallest error (8.93%) occurs at very low cost values (0.01 and 0.1), so a soft margin appears optimal for these data. As the cost rises, the error rises as well, suggesting that a hard margin overfits here.
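The pattern is easier to see in a plot of CV error against cost on a log scale (a sketch reusing tune_lin$performances):
# CV error as a function of cost, log10 x-axis
ggplot(tune_lin$performances, aes(x = cost, y = error)) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  labs(title = "Linear SVM: 5-fold CV Error vs Cost",
       x = "Cost (log scale)", y = "CV error") +
  theme_minimal()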
set.seed(42)
tune_radial <- tune(svm,
mpg01 ~ . - mpg,
data = Auto,
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10, 100),
gamma = c(0.5, 1, 2)))
summary(tune_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 1
##
## - best performance: 0.07891026
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.5 0.09166667 0.05222075
## 2 1.0 0.5 0.08660256 0.04333178
## 3 10.0 0.5 0.08397436 0.04088294
## 4 100.0 0.5 0.08910256 0.03762626
## 5 0.1 1.0 0.59679487 0.05312225
## 6 1.0 1.0 0.07891026 0.03633038
## 7 10.0 1.0 0.08910256 0.04132724
## 8 100.0 1.0 0.08910256 0.04132724
## 9 0.1 2.0 0.59679487 0.05312225
## 10 1.0 2.0 0.16544872 0.07914205
## 11 10.0 2.0 0.15525641 0.06985108
## 12 100.0 2.0 0.15525641 0.06985108
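The radial grid tells a clear story: the best combination is cost = 1 and gamma = 1 (CV error 7.89%), while a small cost paired with a large gamma sends the error to roughly 60%. A heatmap of the grid (a sketch reusing tune_radial$performances) makes this easier to scan:
# Heatmap of CV error over the (cost, gamma) grid
ggplot(tune_radial$performances,
       aes(x = factor(cost), y = factor(gamma), fill = error)) +
  geom_tile() +
  labs(title = "Radial SVM: 10-fold CV Error",
       x = "Cost", y = "Gamma", fill = "CV error") +
  theme_minimal()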
tune_poly <- tune(svm,
mpg01 ~ . - mpg,
data = Auto,
kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 10),
degree = c(2, 3)))
summary(tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.5612179
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5842949 0.04703306
## 2 1.0 2 0.5842949 0.04703306
## 3 10.0 2 0.5612179 0.06909693
## 4 0.1 3 0.5842949 0.04703306
## 5 1.0 3 0.5842949 0.04703306
## 6 10.0 3 0.5842949 0.04703306
The error stays high across every combination of cost and polynomial degree, meaning the polynomial kernel performs worse than random guessing on this problem. Increasing the cost from 0.1 or 1 to 10 at degree 2 helps slightly (the error drops from 58.4% to 56.1%), but that is still unacceptable. Degree 3 does not benefit from any cost value; its error stays at 58.4%. Overall, these results show that the polynomial kernel is a poor choice for these data.
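To make the "worse than random guessing" claim concrete: mpg01 is defined by a median split, so the two classes are roughly balanced and always predicting a single class would already give about 50% error. A quick check (sketch):
# No-information baseline: class proportions after the median split
prop.table(table(Auto$mpg01))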
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Auto
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(e1071)
set.seed(42)
data(OJ)
n <- nrow(OJ)
train_idx <- sample(seq_len(n), size = 800)
OJ_train <- OJ[train_idx, ]
OJ_test <- OJ[-train_idx, ]
svc_lin01 <- svm(Purchase ~ ., data = OJ_train,
kernel = "linear", cost = 0.01, scale = TRUE)
summary(svc_lin01)
##
## Call:
## svm(formula = Purchase ~ ., data = OJ_train, kernel = "linear", cost = 0.01,
## scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 432
##
## ( 215 217 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
Such a low cost combined with 432 support vectors, more than half of the 800 training observations, indicates a very soft margin. The model is likely underfitting, so we should expect relatively high training and test errors.
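One way to quantify this (a sketch using the index component of the fitted e1071 object) is the fraction of training observations that serve as support vectors:
# More than half the training points lie on or inside the margin
length(svc_lin01$index) / nrow(OJ_train)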
pred_train_lin01 <- predict(svc_lin01, OJ_train)
pred_test_lin01 <- predict(svc_lin01, OJ_test)
train_err_lin01 <- mean(pred_train_lin01 != OJ_train$Purchase)
test_err_lin01 <- mean(pred_test_lin01 != OJ_test$Purchase)
cat("Linear SVC (cost=0.01):\n",
" Training error =", round(train_err_lin01, 4), "\n",
" Test error =", round(test_err_lin01, 4), "\n")
## Linear SVC (cost=0.01):
## Training error = 0.1713
## Test error = 0.163
The training error rate is 17.13% and the test error rate is 16.3%.
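Beyond the overall error rates, a confusion matrix shows where the mistakes fall (a sketch reusing the test-set predictions):
# Test-set confusion matrix for the cost = 0.01 linear SVC
table(Predicted = pred_test_lin01, Actual = OJ_test$Purchase)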
set.seed(42)
tune_lin <- tune(svm,
Purchase ~ .,
data = OJ_train,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)),
tunecontrol = tune.control(cross = 10))
summary(tune_lin)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.175
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17750 0.02415229
## 2 0.10 0.17625 0.03356689
## 3 1.00 0.17500 0.02886751
## 4 5.00 0.18375 0.02703521
## 5 10.00 0.18625 0.02729087
best_lin <- tune_lin$best.model
pred_train_lin <- predict(best_lin, OJ_train)
pred_test_lin <- predict(best_lin, OJ_test)
train_err_lin <- mean(pred_train_lin != OJ_train$Purchase)
test_err_lin <- mean(pred_test_lin != OJ_test$Purchase)
cat("Linear SVC (cost=", best_lin$cost, "):\n",
" Training error =", round(train_err_lin, 4), "\n",
" Test error =", round(test_err_lin, 4), "\n", sep = "")
## Linear SVC (cost=1):
## Training error =0.1675
## Test error =0.163
set.seed(42)
tune_rad <- tune(svm,
Purchase ~ .,
data = OJ_train,
kernel = "radial",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_rad)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.18
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.38500 0.04199868
## 2 0.10 0.18125 0.03784563
## 3 1.00 0.18000 0.03343734
## 4 5.00 0.18625 0.03701070
## 5 10.00 0.19375 0.03738408
best_rad <- tune_rad$best.model
pred_train_rad <- predict(best_rad, OJ_train)
pred_test_rad <- predict(best_rad, OJ_test)
train_err_rad <- mean(pred_train_rad != OJ_train$Purchase)
test_err_rad <- mean(pred_test_rad != OJ_test$Purchase)
cat("RBF SVM (cost=", best_rad$cost, "):\n",
" Training error =", round(train_err_rad, 4), "\n",
" Test error =", round(test_err_rad, 4), "\n", sep = "")
## RBF SVM (cost=1):
## Training error =0.15
## Test error =0.1593
set.seed(42)
tune_poly <- tune(svm,
Purchase ~ .,
data = OJ_train,
kernel = "polynomial",
degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.18375
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.38625 0.04308019
## 2 0.10 0.31625 0.05529278
## 3 1.00 0.19250 0.04216370
## 4 5.00 0.18375 0.04041881
## 5 10.00 0.19000 0.03425801
best_poly <- tune_poly$best.model
pred_train_poly <- predict(best_poly, OJ_train)
pred_test_poly <- predict(best_poly, OJ_test)
train_err_poly <- mean(pred_train_poly != OJ_train$Purchase)
test_err_poly <- mean(pred_test_poly != OJ_test$Purchase)
cat("Poly SVM (deg=2, cost=", best_poly$cost, "):\n",
" Training error =", round(train_err_poly, 4), "\n",
" Test error =", round(test_err_poly, 4), "\n", sep = "")
## Poly SVM (deg=2, cost=5):
## Training error =0.1475
## Test error =0.1667
results <- data.frame(
Method = c("Linear SVC (tuned)", "RBF SVM (tuned)", "Poly SVM deg2"),
Cost = c(best_lin$cost, best_rad$cost, best_poly$cost),
TrainErr = c(train_err_lin, train_err_rad, train_err_poly),
TestErr = c(test_err_lin, test_err_rad, test_err_poly)
)
print(results)
## Method Cost TrainErr TestErr
## 1 Linear SVC (tuned) 1 0.1675 0.1629630
## 2 RBF SVM (tuned) 1 0.1500 0.1592593
## 3 Poly SVM deg2 5 0.1475 0.1666667
We can see here that the tuned RBF SVM offers the best performance, with the lowest test error (15.93%). Although the degree-2 polynomial SVM has the lowest training error, it has the highest test error (16.67%), a sign of mild overfitting.
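As a final check on the winner, we can inspect the tuned RBF model's test-set confusion matrix (a sketch reusing pred_test_rad):
# Where the tuned RBF SVM's remaining test errors occur
table(Predicted = pred_test_rad, Actual = OJ_test$Purchase)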