We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
(a) Generate a data set with n = 500 and p = 2, such that the observations belong to two classes with a quadratic decision boundary between them. For instance, you can do this as follows:
set.seed(123)
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- 1 * (x1^2 - x2^2 > 0)
# Combine into a data frame
df <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
# View first few rows
head(df)
##            x1         x2 y
## 1 -0.21242248 -0.1463939 1
## 2  0.28830514 -0.1335586 1
## 3 -0.09102308 -0.2128999 0
## 4  0.38301740 -0.4200271 0
## 5  0.44046728 -0.1345457 1
## 6 -0.45444350 -0.3219862 1
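As a quick sanity check (not part of the exercise itself), the class balance can be inspected; because x1 and x2 are i.i.d. uniform, roughly half of the observations should fall in each class:
# Class counts and proportions for the simulated response
table(df$y)
prop.table(table(df$y))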
(b) Plot the observations, colored according to their class labels. Your plot should display X1 on the x-axis, and X2 on the y-axis.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
ggplot(df, aes(x = x1, y = x2, color = y)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_manual(values = c("red", "blue"), labels = c("Class 0", "Class 1")) +
  labs(
    title = "Observations Colored by Class",
    x = "X1",
    y = "X2",
    color = "Class Label"
  ) +
  theme_minimal()
(c) Fit a logistic regression model to the data, using X1 and X2 as predictors.
# Fit logistic regression model
glm.fit <- glm(y ~ x1 + x2, data = df, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial", data = df)
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.04792    0.08949   0.535    0.592
## x1          -0.03999    0.31516  -0.127    0.899
## x2           0.11509    0.30829   0.373    0.709
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.86 on 499 degrees of freedom
## Residual deviance: 692.71 on 497 degrees of freedom
## AIC: 698.71
##
## Number of Fisher Scoring iterations: 3
(d) Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be linear.
df$pred_prob <- predict(glm.fit, newdata = df, type = "response")
# Convert probabilities to class labels using 0.5 threshold
df$pred_class <- as.factor(ifelse(df$pred_prob > 0.5, 1, 0))
library(ggplot2)
ggplot(df, aes(x = x1, y = x2, color = pred_class)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_manual(values = c("red", "blue"), labels = c("Class 0", "Class 1")) +
  labs(
    title = "Predicted Class Labels from Logistic Regression on Training Data",
    x = "X1",
    y = "X2",
    color = "Predicted Class"
  ) +
  theme_minimal()
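To make the linearity of the boundary explicit, the line where the fitted log-odds equal zero can be overlaid on the plot. This is a minimal sketch using the coefficients of glm.fit from part (c); because those coefficients are essentially noise, the line may sit near the edge of (or outside) the plotting region:
# Decision boundary of the logistic fit: b0 + b1*x1 + b2*x2 = 0,
# i.e. x2 = -(b0 + b1*x1) / b2
b <- coef(glm.fit)
ggplot(df, aes(x = x1, y = x2, color = pred_class)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_abline(intercept = -b[1] / b[3], slope = -b[2] / b[3], linetype = "dashed") +
  scale_color_manual(values = c("red", "blue"), labels = c("Class 0", "Class 1")) +
  labs(title = "Linear Boundary Implied by the Logistic Fit",
       x = "X1", y = "X2", color = "Predicted Class") +
  theme_minimal()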
(e) Now fit a logistic regression model to the data using non-linear functions of X1 and X2 as predictors (e.g. X1^2, X1 × X2, log(X2), and so forth).
# Add non-linear features to the data
df$x1_sq <- df$x1^2
df$x2_sq <- df$x2^2
df$x1x2 <- df$x1 * df$x2
# Now fit a logistic regression model to the data using non-linear functions
logit_model <- glm(y ~ x1 + x2 + x1_sq + x2_sq + x1x2, data = df, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit_model)
##
## Call:
## glm(formula = y ~ x1 + x2 + x1_sq + x2_sq + x1x2, family = binomial,
## data = df)
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)     -7.12    1170.02  -0.006    0.995
## x1            -146.56   10983.22  -0.013    0.989
## x2              55.75   11290.45   0.005    0.996
## x1_sq        12828.87  479505.56   0.027    0.979
## x2_sq       -12842.99  481258.81  -0.027    0.979
## x1x2          566.19    36927.06   0.015    0.988
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9286e+02 on 499 degrees of freedom
## Residual deviance: 2.3990e-06 on 494 degrees of freedom
## AIC: 12
##
## Number of Fisher Scoring iterations: 25
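The two warnings (non-convergence and fitted probabilities numerically 0 or 1) indicate (quasi-)complete separation: because the model includes x1^2 and x2^2, it can reproduce the true boundary x1^2 - x2^2 = 0 exactly, so the coefficient estimates diverge, their standard errors are enormous, and the residual deviance is essentially zero. The same model can also be specified directly in the formula with I(), without creating extra columns; a sketch of this equivalent fit (the same warnings are expected):
# Equivalent specification using I() in the formula; produces the same
# separation warnings because the quadratic terms separate the classes
logit_model_formula <- glm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2),
                           data = df, family = binomial)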
(f) Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be obviously non-linear. If it is not, then repeat (a)-(e) until you come up with an example in which the predicted class labels are obviously non-linear.
# Predict probabilities on training data
df$prob <- predict(logit_model, newdata = df, type = "response")
# Convert to predicted class labels (factor format)
df$pred <- as.factor(ifelse(df$prob > 0.5, 1, 0))
ggplot(df, aes(x = x1, y = x2, color = pred)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_manual(values = c("red", "blue"), labels = c("Class 0", "Class 1")) +
  labs(
    title = "Predicted Class Labels Using Non-linear Logistic Regression",
    x = "X1",
    y = "X2",
    color = "Predicted Class"
  ) +
  theme_minimal()
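For reference, the true boundary x1^2 = x2^2 consists of the two lines x2 = x1 and x2 = -x1. Overlaying them (an optional sketch reusing df$pred from above) shows how closely the quadratic logistic fit recovers the boundary:
# Predicted classes with the true boundary x2 = x1 and x2 = -x1 overlaid
ggplot(df, aes(x = x1, y = x2, color = pred)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_abline(slope = -1, intercept = 0, linetype = "dashed") +
  scale_color_manual(values = c("red", "blue"), labels = c("Class 0", "Class 1")) +
  labs(title = "Non-linear Logistic Predictions with the True Boundary",
       x = "X1", y = "X2", color = "Predicted Class") +
  theme_minimal()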
(g) Fit a support vector classifier to the data with X1 and X2 as predictors. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.
library(e1071)
library(ggplot2)
dat.svm <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
# Fit a support vector classifier (linear kernel)
svmfit <- svm(y ~ x1 + x2, data = dat.svm, kernel = "linear", cost = 10, scale = FALSE)
# Get predicted class labels (already factors)
dat.svm$preds <- predict(svmfit, newdata = dat.svm)
# Plot predicted classes
ggplot(dat.svm, aes(x = x1, y = x2, color = preds)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_manual(values = c("red", "blue"), labels = c("Class 0", "Class 1")) +
  labs(
    title = "Predicted Class Labels Using Linear SVM",
    x = "X1",
    y = "X2",
    color = "Predicted Class"
  ) +
  theme_minimal()
(h) Fit a SVM using a non-linear kernel to the data. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.
library(e1071)
library(ggplot2)
# Reuse the same dataset
dat.svm <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
# Fit non-linear SVM (radial kernel)
svmfit_radial <- svm(y ~ x1 + x2, data = dat.svm, kernel = "radial", cost = 10, gamma = 1)
# Predict class labels
dat.svm$preds_radial <- predict(svmfit_radial, newdata = dat.svm)
ggplot(dat.svm, aes(x = x1, y = x2, color = preds_radial)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_manual(values = c("red", "blue"), labels = c("Class 0", "Class 1")) +
  labs(
    title = "Predicted Class Labels Using SVM with Radial Kernel",
    x = "X1",
    y = "X2",
    color = "Predicted Class"
  ) +
  theme_minimal()
(i) Comment on your results.
The linear SVM fails by predicting only one class while the radial
kernel SVM successfully predicts both classes. The linear SVM cannot
capture the non-linear decision boundary needed to separate the data,
while the radial kernel SVM maps the data to a higher-dimensional space
where it becomes linearly separable.
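These claims can be backed up numerically with confusion tables on the training data (a short check, assuming svmfit, svmfit_radial, and dat.svm from parts (g) and (h) are still in the workspace):
# Linear SVM: the predictions collapse (almost) entirely onto one class
table(predicted = predict(svmfit, newdata = dat.svm), true = dat.svm$y)
# Radial-kernel SVM: both classes are recovered with few training errors
table(predicted = dat.svm$preds_radial, true = dat.svm$y)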
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.
(a) Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
# Load the Auto dataset
data("Auto")
# Compute the median of the mpg variable
median_mpg <- median(Auto$mpg)
# Create a binary variable based on median mpg
Auto$mpg_binary <- ifelse(Auto$mpg > median_mpg, 1, 0)
Auto$mpg_binary <- as.factor(Auto$mpg_binary)
(b) Fit a support vector classifier to the data with various values of cost, in order to predict whether a car gets high or low gas mileage. Report the cross-validation errors associated with different values of this parameter. Comment on your results. Note you will need to fit the classifier without the gas mileage variable to produce sensible results.
library(e1071)
set.seed(123)
# Grid of cost values
cost_values <- c(0.01, 0.1, 1, 5, 10, 100)
# Tune SVM model with linear kernel
svm_model <- tune(svm, mpg_binary ~ . - mpg, data = Auto, kernel = "linear",
                  ranges = list(cost = cost_values))
summary(svm_model)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.08910256
##
## - Detailed performance results:
##    cost      error dispersion
## 1 1e-02 0.08910256 0.03791275
## 2 1e-01 0.09403846 0.04842472
## 3 1e+00 0.09147436 0.05817937
## 4 5e+00 0.10423077 0.06425850
## 5 1e+01 0.10673077 0.06562804
## 6 1e+02 0.12724359 0.06371052
From this output, the lowest cross-validation error, about 8.9%, is achieved at cost = 0.01. In other words, under 10-fold cross-validation the support vector classifier with cost = 0.01 misclassifies roughly 8.9% of the cars (about 91.1% cross-validated accuracy), and larger values of cost do slightly worse.
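The pattern is easier to see graphically; tune() stores the per-cost results in the performances component of its return value (a sketch using svm_model from above):
# 10-fold CV error as a function of cost (log scale on the x-axis)
plot(svm_model$performances$cost, svm_model$performances$error,
     type = "b", log = "x",
     xlab = "cost (log scale)", ylab = "10-fold CV error",
     main = "Support Vector Classifier: CV Error vs. Cost")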
(c) Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of gamma and degree and cost. Comment on your results.
# Radial kernel with different gamma and cost values
set.seed(123)
svm_radial <- tune(svm, mpg_binary ~ . - mpg,
                   data = Auto,
                   kernel = "radial",
                   ranges = list(cost = c(0.01, 0.1, 1),
                                 gamma = c(0.5, 1)),
                   tunecontrol = tune.control(cross = 10))
summary(svm_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 1
##
## - best performance: 0.08660256
##
## - Detailed performance results:
##   cost gamma      error dispersion
## 1 0.01   0.5 0.58173077 0.04740051
## 2 0.10   0.5 0.08910256 0.03979296
## 3 1.00   0.5 0.08903846 0.04774716
## 4 0.01   1.0 0.58173077 0.04740051
## 5 0.10   1.0 0.58173077 0.04740051
## 6 1.00   1.0 0.08660256 0.04466420
Radial kernel SVM results: the best performance is at cost = 1 and gamma = 1, with a cross-validation error of about 8.66% (roughly 91.3% accuracy), slightly better than the best linear classifier. The smallest cost values perform very poorly, with errors near 58%.
# Polynomial kernel with different degree and cost values
set.seed(123)
svm_poly <- tune(svm, mpg_binary ~ . - mpg,
                 data = Auto,
                 kernel = "polynomial",
                 ranges = list(cost = c(0.1, 1, 10, 100),
                               degree = c(2, 3, 4)),
                 tunecontrol = tune.control(cross = 10))
summary(svm_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 100 2
##
## - best performance: 0.3086538
##
## - Detailed performance results:
##     cost degree     error dispersion
## 1    0.1      2 0.5817308 0.04740051
## 2    1.0      2 0.5817308 0.04740051
## 3   10.0      2 0.5714744 0.04575370
## 4  100.0      2 0.3086538 0.10382736
## 5    0.1      3 0.5817308 0.04740051
## 6    1.0      3 0.5817308 0.04740051
## 7   10.0      3 0.5817308 0.04740051
## 8  100.0      3 0.4159615 0.12008716
## 9    0.1      4 0.5817308 0.04740051
## 10   1.0      4 0.5817308 0.04740051
## 11  10.0      4 0.5817308 0.04740051
## 12 100.0      4 0.5817308 0.04740051
Polynomial kernel SVM results: the best performance is at cost = 100 and degree = 2, with a cross-validation error of about 30.87% (roughly 69.1% accuracy). This is far worse than both the linear and radial kernels; within the grid considered, only cost = 100 with degree 2 or 3 improves meaningfully on the degenerate error of about 58%.
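Pulling the best cross-validation errors together makes the comparison explicit (a short sketch using the three tuning objects fitted above):
# Best 10-fold CV error for each kernel, taken from the tuning runs above
data.frame(kernel = c("linear", "radial", "polynomial"),
           best_cv_error = c(svm_model$best.performance,
                             svm_radial$best.performance,
                             svm_poly$best.performance))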
(d) Make some plots to back up your assertions in (b) and (c).
# Create dataset without the name variable
Auto_clean <- Auto[, !names(Auto) %in% c("mpg", "name")]
# Refit models with clean data (excluding mpg and name)
set.seed(123)
best_linear <- svm(mpg_binary ~ ., data = Auto_clean, kernel = "linear", cost = 0.01)
best_radial <- svm(mpg_binary ~ ., data = Auto_clean, kernel = "radial", cost = 1, gamma = 1)
best_poly <- svm(mpg_binary ~ ., data = Auto_clean, kernel = "polynomial", cost = 100, degree = 2)
# Now try plotting
plot(best_linear, Auto_clean, displacement ~ horsepower)
plot(best_radial, Auto_clean, displacement ~ horsepower)
plot(best_poly, Auto_clean, displacement ~ horsepower)
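The plot.svm() interface accepts any pair of predictors, so further views of the same fitted models can be produced if desired; for example (optional, shown for the radial model, which had the lowest CV error):
# Two additional predictor pairs for the radial-kernel model
plot(best_radial, Auto_clean, weight ~ horsepower)
plot(best_radial, Auto_clean, acceleration ~ weight)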
This problem involves the OJ data set which is part of the ISLR2 package.
(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
# Load necessary libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Auto
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(e1071)
# Set seed for reproducibility
set.seed(123)
# Load the OJ dataset
data(OJ)
# Create a training set with 800 random samples
train_indices <- sample(1:nrow(OJ), 800)
train_set <- OJ[train_indices, ]
# Create a test set with the remaining samples
test_set <- OJ[-train_indices, ]
(b) Fit a support vector classifier to the training data using cost = 0.01, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics, and describe the results obtained.
set.seed(123)
# Fit a support vector classifier with cost = 0.01
svm_linear <- svm(Purchase ~ ., data = train_set, kernel = "linear", cost = 0.01)
# Produce summary statistics
summary(svm_linear)
##
## Call:
## svm(formula = Purchase ~ ., data = train_set, kernel = "linear",
## cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 442
##
## ( 220 222 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
(c) What are the training and test error rates?
# Predict on training set
train_predictions <- predict(svm_linear, train_set)
# Calculate training error rate
train_error_rate <- mean(train_predictions != train_set$Purchase)
# Predict on test set
test_predictions <- predict(svm_linear, test_set)
# Calculate test error rate
test_error_rate <- mean(test_predictions != test_set$Purchase)
train_error_rate
## [1] 0.165
test_error_rate
## [1] 0.1777778
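The error rates can also be broken down by class with confusion matrices (optional, using the predictions computed above):
# Rows are predicted labels, columns are true labels
table(predicted = train_predictions, true = train_set$Purchase)
table(predicted = test_predictions, true = test_set$Purchase)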
(d) Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10.
# Tune the SVM model to find the optimal cost
tune_result <- tune(svm, Purchase ~ ., data = train_set, kernel = "linear",
                    ranges = list(cost = seq(0.01, 10, length.out = 100)))
# Get the best model
best_model <- tune_result$best.model
# Summary of the best model
summary(best_model)
##
## Call:
## best.tune(METHOD = svm, train.x = Purchase ~ ., data = train_set,
## ranges = list(cost = seq(0.01, 10, length.out = 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1.422727
##
## Number of Support Vectors: 337
##
## ( 168 169 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
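The full cross-validation profile over cost is available in the tuning object and can be plotted (a sketch using tune_result from above):
# 10-fold CV error across the grid of 100 cost values
plot(tune_result$performances$cost, tune_result$performances$error,
     type = "l", xlab = "cost", ylab = "10-fold CV error",
     main = "Linear SVM on OJ: CV Error vs. Cost")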
(e) Compute the training and test error rates using this new value for cost.
set.seed(123)
# Predict on training set with the best model
best_train_predictions <- predict(best_model, train_set)
# Calculate training error rate
best_train_error_rate <- mean(best_train_predictions != train_set$Purchase)
# Predict on test set with the best model
best_test_predictions <- predict(best_model, test_set)
# Calculate test error rate
best_test_error_rate <- mean(best_test_predictions != test_set$Purchase)
best_train_error_rate
## [1] 0.15875
best_test_error_rate
## [1] 0.1555556
(f) Repeat parts (b) through (e) using a support vector machine with a radial kernel. Use the default value for gamma.
set.seed(123)
# Fit a support vector machine with a radial kernel
svm_radial <- svm(Purchase ~ ., data = train_set, kernel = "radial")
# Tune the SVM model with a radial kernel
tune_radial <- tune(svm, Purchase ~ ., data = train_set, kernel = "radial",
                    ranges = list(cost = seq(0.01, 10, length.out = 100)))
# Get the best radial model
best_radial_model <- tune_radial$best.model
# Calculate error rates for the best radial model
radial_train_predictions <- predict(best_radial_model, train_set)
radial_train_error_rate <- mean(radial_train_predictions != train_set$Purchase)
radial_test_predictions <- predict(best_radial_model, test_set)
radial_test_error_rate <- mean(radial_test_predictions != test_set$Purchase)
radial_train_error_rate
## [1] 0.13625
radial_test_error_rate
## [1] 0.1851852
(g) Repeat parts (b) through (e) using a support vector machine with a polynomial kernel. Set degree = 2.
# Fit a support vector machine with a polynomial kernel
svm_poly <- svm(Purchase ~ ., data = train_set, kernel = "polynomial", degree = 2)
# Tune the SVM model with a polynomial kernel
tune_poly <- tune(svm, Purchase ~ ., data = train_set, kernel = "polynomial", degree = 2,
                  ranges = list(cost = seq(0.01, 10, length.out = 100)))
# Get the best polynomial model
best_poly_model <- tune_poly$best.model
# Calculate error rates for the best polynomial model
poly_train_predictions <- predict(best_poly_model, train_set)
poly_train_error_rate <- mean(poly_train_predictions != train_set$Purchase)
poly_test_predictions <- predict(best_poly_model, test_set)
poly_test_error_rate <- mean(poly_test_predictions != test_set$Purchase)
poly_train_error_rate
## [1] 0.14
poly_test_error_rate
## [1] 0.2
(h) Overall, which approach seems to give the best results on this data?
The tuned linear support vector classifier gives the best overall results on this data set: it achieves the lowest test error rate (15.56%), compared with 18.52% for the tuned radial-kernel SVM and 20.00% for the tuned polynomial-kernel SVM (degree = 2).
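A compact summary, collecting the error rates computed above into a single table (assuming those objects are still in the workspace):
# Side-by-side comparison of the three tuned models on the OJ data
data.frame(kernel = c("linear (tuned)", "radial (tuned)", "polynomial, degree 2 (tuned)"),
           train_error = c(best_train_error_rate, radial_train_error_rate, poly_train_error_rate),
           test_error = c(best_test_error_rate, radial_test_error_rate, poly_test_error_rate))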