set.seed(42)
x1 <- runif (500) - 0.5
x2 <- runif (500) - 0.5
y <- 1 * ( x1 ^2 - x2 ^2 > 0)
df <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
library(ggplot2)
ggplot(df, aes(x = x1, y = x2, color = y)) +
geom_point(size = 2) +
theme_minimal()
log_model <- glm(y ~ x1 + x2, data = df, family = 'binomial')
summary(log_model)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial", data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.09978 0.08976 1.112 0.266
## x1 -0.17659 0.30658 -0.576 0.565
## x2 -0.20067 0.30978 -0.648 0.517
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 691.79 on 499 degrees of freedom
## Residual deviance: 691.08 on 497 degrees of freedom
## AIC: 697.08
##
## Number of Fisher Scoring iterations: 3
log_model_preds <- predict(log_model, newdata = df, type = 'response')
log_model_class <- ifelse(log_model_preds >= 0.5, 1, 0)
ggplot(df, aes(x = x1, y = x2, color = as.factor(log_model_class))) +
geom_point(size = 2) +
theme_minimal()
df$x1_sq <- x1^2
df$x2_sq <- x2^2
df$x1x2 <- x1 * x2
log_model2 <- glm(y ~ x1 + x2 + x1_sq + x2_sq + x1x2, data = df, family = 'binomial')
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_model2)
##
## Call:
## glm(formula = y ~ x1 + x2 + x1_sq + x2_sq + x1x2, family = "binomial",
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.41 138.85 0.075 0.940
## x1 183.96 3263.04 0.056 0.955
## x2 -331.24 4128.28 -0.080 0.936
## x1_sq 88648.36 844122.80 0.105 0.916
## x2_sq -86338.24 820611.62 -0.105 0.916
## x1x2 -1396.21 15698.89 -0.089 0.929
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9179e+02 on 499 degrees of freedom
## Residual deviance: 8.2483e-05 on 494 degrees of freedom
## AIC: 12
##
## Number of Fisher Scoring iterations: 25
log_model2_preds <- predict(log_model2, newdata = df, type = 'response')
log_model2_class <- ifelse(log_model2_preds >= 0.5, 1, 0)
ggplot(df, aes(x = x1, y = x2, color = log_model2_class)) +
geom_point(size = 2) +
theme_minimal()
library(caret)
## Loading required package: lattice
svm_linear <- train(y ~ x1 + x2,
data = df,
method = 'svmLinear',
trControl = trainControl(method = 'none'),
preProcess = c('center', 'scale'))
svm_preds <- predict(svm_linear, newdata = df)
ggplot(df, aes(x = x1, y = x2, color = as.factor(svm_preds))) +
geom_point(size = 2) +
theme_minimal()
svm_radial <- train(y ~ x1 + x2,
data = df,
method = 'svmRadial',
trControl = trainControl(method = 'none'),
preProcess = c('center', 'scale'))
svm_preds2 <- predict(svm_radial, newdata = df)
ggplot(df, aes(x = x1, y = x2, color = as.factor(svm_preds2))) +
geom_point(size = 2) +
theme_minimal()
Logistic regression without non-linear terms was not appropriate for this dataset, as it attempts to separate the classes using a linear decision boundary. Given the true decision boundary is quadratic in nature, the model performed poorly, and the prediction plot failed to resemble the actual class structure.
In contrast, logistic regression with non-linear terms was able to approximate the non-linear boundary effectively. This model’s predictions closely aligned with the true class labels, demonstrating how incorporating feature transformations can significantly improve performance.
The linear SVM performed even worse, as it completely failed to capture the non-linear structure in the data. It effectively defaulted to predicting a single class for most observations, which indicates severe underfitting due to model rigidity.
The radial (RBF) kernel SVM performed the best among all models. It was able to learn the complex non-linear boundary by projecting the data into a higher-dimensional space, where a linear separation becomes possible. This flexibility made it well-suited to the intrinsic pattern in our data, and its prediction plot closely matched the true class distribution.
library(ISLR2)
data(Auto)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Auto$mpg_class <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto$mpg_class <- as.factor(Auto$mpg_class)
#grid to use various values of cost
linear_grid <- expand.grid(C = c(0.001, 0.01, 0.1, 1, 10, 50, 100))
#train model
set.seed(42)
svm_linear2 <- train(mpg_class ~ . -mpg -name,
data = Auto,
method = 'svmLinear',
preProcess = c('center', 'scale'),
tuneGrid = linear_grid,
trControl = trainControl(method = 'cv', number = 10))
svm_linear2$results
## C Accuracy Kappa AccuracySD KappaSD
## 1 1e-03 0.8771964 0.7544406 0.05819952 0.11642342
## 2 1e-02 0.9103408 0.8206238 0.05912642 0.11824248
## 3 1e-01 0.9102767 0.8204991 0.05804753 0.11611848
## 4 1e+00 0.9050877 0.8100434 0.04353627 0.08716338
## 5 1e+01 0.9156781 0.8312751 0.02990227 0.05984062
## 6 5e+01 0.9183097 0.8365383 0.02899887 0.05804311
## 7 1e+02 0.9183097 0.8365383 0.02899887 0.05804311
Different values of cost don’t affect the results too much. The model is quite stable. Best performance was achieved when c = 50 and it sort of plateaus there. This is likely because our model can’t separate the data any more given it is already fitting it as tightly as possible
radial_grid <- expand.grid(sigma = c(0.001, 0.01, 0.05, 0.1, 1),
C = c(0.1, 1, 10, 50))
set.seed(42)
svm_radial2 <- train(mpg_class ~ . -mpg -name, data = Auto,
method = 'svmRadial',
preProcess = c('center', 'scale'),
tuneGrid = radial_grid,
trControl = trainControl(method = 'cv', number = 10))
svm_radial2$results
## sigma C Accuracy Kappa AccuracySD KappaSD
## 1 0.001 0.1 0.6600034 0.3302632 0.15461812 0.29652042
## 2 0.001 1.0 0.8875202 0.7749444 0.05338606 0.10689152
## 3 0.001 10.0 0.9128408 0.8256238 0.05798358 0.11595936
## 4 0.001 50.0 0.9052767 0.8104991 0.05637504 0.11276885
## 5 0.010 0.1 0.8901518 0.7802075 0.04999846 0.10013000
## 6 0.010 1.0 0.9103408 0.8206238 0.05912642 0.11824248
## 7 0.010 10.0 0.9027800 0.8055126 0.04258081 0.08518201
## 8 0.010 50.0 0.9129791 0.8259142 0.03541157 0.07080386
## 9 0.050 0.1 0.9076451 0.8152493 0.05966767 0.11933346
## 10 0.050 1.0 0.9078408 0.8156238 0.05274987 0.10548509
## 11 0.050 10.0 0.9154757 0.8309142 0.04122680 0.08242737
## 12 0.050 50.0 0.9182422 0.8364406 0.03776914 0.07552700
## 13 0.100 0.1 0.9102092 0.8203606 0.06212898 0.12424797
## 14 0.100 1.0 0.9077092 0.8153606 0.05851215 0.11701096
## 15 0.100 10.0 0.9179791 0.8358601 0.04055563 0.08113586
## 16 0.100 50.0 0.9129791 0.8258736 0.04439192 0.08881480
## 17 1.000 0.1 0.9129723 0.8258870 0.05243549 0.10486252
## 18 1.000 1.0 0.9333671 0.8666767 0.03942033 0.07886230
## 19 1.000 10.0 0.9181748 0.8362882 0.02720024 0.05434653
## 20 1.000 50.0 0.9106073 0.8211231 0.04602215 0.09200446
poly_grid <- expand.grid(degree = c(2, 3),
scale = 1,
C = c(0.001, 0.01, 0.05, 0.1, 1, 10))
svm_poly <- train(mpg_class ~ . -name -mpg, data = Auto,
method = 'svmPoly',
preProcess = c('center', 'scale'),
tuneGrid = poly_grid,
trControl = trainControl(method = 'cv', number = 10))
svm_poly$results
## degree scale C Accuracy Kappa AccuracySD KappaSD
## 1 2 1 1e-03 0.8983839 0.7968092 0.05311059 0.10601754
## 2 2 1 1e-02 0.9032490 0.8065744 0.03502055 0.06990063
## 3 2 1 5e-02 0.9084447 0.8169488 0.03586553 0.07154721
## 4 2 1 1e-01 0.9133772 0.8268510 0.03400320 0.06785424
## 5 2 1 1e+00 0.9006208 0.8013922 0.03822127 0.07636324
## 6 2 1 1e+01 0.8930567 0.7863482 0.04069299 0.08130793
## 7 3 1 1e-03 0.9155567 0.8313752 0.03250719 0.06464142
## 8 3 1 1e-02 0.9131140 0.8263924 0.03262535 0.06503851
## 9 3 1 5e-02 0.9183097 0.8367263 0.03566551 0.07109646
## 10 3 1 1e-01 0.9208063 0.8416729 0.03298683 0.06589784
## 11 3 1 1e+00 0.8952159 0.7905359 0.05337558 0.10678334
## 12 3 1 1e+01 0.8775270 0.7553084 0.06082213 0.12146594
Although all kernels perform very well, and find a pretty good accuracy with the different parameters, the Radial kernel is the best model out of the 3. It has the highest accuracy by about 1-2%, highest Kappa and a relatively consistent AccuracySD and KappaSD
ggplot(svm_linear2$results, aes(x = C, y = Accuracy)) +
geom_line() + geom_point() +
ggtitle('Linear SVM CV Accuracy') +
theme_minimal()
ggplot(svm_radial2$results, aes(x = C, y = Accuracy, color = as.factor(sigma))) +
geom_line() + geom_point() +
ggtitle('Radial SVM CV Accuracy') +
theme_minimal()
ggplot(svm_poly$results, aes(x = C, y = Accuracy, color = as.factor(degree))) +
geom_line() + geom_point() +
ggtitle('Polynomial SVM CV Accuracy') +
theme_minimal()
We can see on that Radial plot that we get a line above all else, at around 93%. This shows where our best performing model is, and the parameters it used which in this case were C = 1 and sigma = 1.
All models do find a very similar high accuracy.
library(e1071)
data(OJ)
set.seed(42)
train_index <- createDataPartition(OJ$Purchase, p = 799/nrow(OJ), list = FALSE)
train_OJ <- OJ[train_index, ]
test_OJ <- OJ[-train_index, ]
svm_linear3 <- svm(Purchase ~ ., data = train_OJ, kernel = 'linear', cost = 0.01, scale = TRUE)
summary(svm_linear3)
##
## Call:
## svm(formula = Purchase ~ ., data = train_OJ, kernel = "linear", cost = 0.01,
## scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 447
##
## ( 224 223 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
The number of support vectors in the model to predict Purchase is 447. This is technically a lot, out of the 800 observations which suggests the decision boundary is not clean, or the cost is low enough that many points are close to or within the margin.
train_preds <- predict(svm_linear3, newdata = train_OJ)
test_preds <- predict(svm_linear3, newdata = test_OJ)
train_error <- mean(train_preds != train_OJ$Purchase)
test_error <- mean(test_preds != test_OJ$Purchase)
cat('Train error: ', train_error, '\n')
## Train error: 0.16875
cat('Test error: ', test_error)
## Test error: 0.162963
set.seed(42)
tuned_svm <- tune(svm, Purchase ~ ., data = train_OJ, kernel = 'linear',
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tuned_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.17
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17625 0.01904855
## 2 0.10 0.17250 0.02108185
## 3 1.00 0.17625 0.02079162
## 4 5.00 0.17125 0.02433134
## 5 10.00 0.17000 0.02838231
train_pred_best <- predict(tuned_svm$best.model, train_OJ)
test_pred_best <- predict(tuned_svm$best.model, test_OJ)
cat('Train error: ', mean(train_pred_best != train_OJ$Purchase), '\n')
## Train error: 0.16
cat('Test error: ', mean(test_pred_best != test_OJ$Purchase))
## Test error: 0.1703704
svm_radial3 <- svm(Purchase ~ ., data = train_OJ, kernel = 'radial', cost = 0.01, scale = TRUE)
summary(svm_radial3)
##
## Call:
## svm(formula = Purchase ~ ., data = train_OJ, kernel = "radial", cost = 0.01,
## scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 625
##
## ( 313 312 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
train_pred_radial <- predict(svm_radial3, newdata = train_OJ)
test_pred_radial <- predict(svm_radial3, newdata = test_OJ)
mean(train_pred_radial != train_OJ$Purchase)
## [1] 0.39
mean(test_pred_radial != test_OJ$Purchase)
## [1] 0.3888889
set.seed(42)
tuned_radial <- tune(svm, Purchase ~ ., data = train_OJ, kernel = 'radial',
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tuned_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.185
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.03525699
## 2 0.10 0.19875 0.03197764
## 3 1.00 0.18500 0.02993047
## 4 5.00 0.19000 0.02687419
## 5 10.00 0.19750 0.03322900
train_pred_radial <- predict(tuned_radial$best.model, train_OJ)
test_pred_radial <- predict(tuned_radial$best.model, test_OJ)
cat('Train error: ', mean(train_pred_radial != train_OJ$Purchase), '\n')
## Train error: 0.15125
cat('Test error: ', mean(test_pred_radial != test_OJ$Purchase))
## Test error: 0.1518519
svm_poly2 <- svm(Purchase ~ ., data = train_OJ, kernel = 'polynomial', degree = 2, cost = 0.01)
summary(svm_poly2)
##
## Call:
## svm(formula = Purchase ~ ., data = train_OJ, kernel = "polynomial",
## degree = 2, cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 631
##
## ( 319 312 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
mean(predict(svm_poly2, train_OJ) != train_OJ$Purchase)
## [1] 0.39
mean(predict(svm_poly2, test_OJ) != test_OJ$Purchase)
## [1] 0.3888889
set.seed(42)
tuned_poly <- tune(svm, Purchase ~ ., data = train_OJ, kernel = "polynomial", degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))
summary(tuned_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.18625
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.39000 0.03525699
## 2 0.10 0.34000 0.03216710
## 3 1.00 0.20750 0.02371708
## 4 5.00 0.18750 0.02700309
## 5 10.00 0.18625 0.02161050
mean(predict(tuned_poly$best.model, train_OJ) != train_OJ$Purchase)
## [1] 0.16125
mean(predict(tuned_poly$best.model, test_OJ) != test_OJ$Purchase)
## [1] 0.162963
The radial kernel seems to have the lowest training and testing error out of all the kernels we tried. Specifically the tuned model, since it finds the best performance model based on the lowest error.