##Question 5)
#We have seen that we can fit an SVM with a non-linear kernel in order
#to perform classification using a non-linear decision boundary. We will
#now see that we can also obtain a non-linear decision boundary by
#performing logistic regression using non-linear transformations of the
#features.
#(a) Generate a data set with n = 500 and p = 2, such that the observations
#belong to two classes with a quadratic decision boundary between them.
#For instance, you can do this as follows:
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- 1 * (x1^2 - x2^2 > 0)
df <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
#(b) Plot the observations, colored according to their class labels.
#Your plot should display X1 on the x-axis, and X2 on the y-axis.
library(ggplot2)
ggplot(df, aes(x = x1, y = x2, color = y)) +
geom_point() +
labs(title = "Original Data Colored by True Class",
x = "X1", y = "X2") +
theme_minimal()
#(c) Fit a logistic regression model to the data, using X1 and X2 as
#predictors.
glm1 <- glm(y ~ x1 + x2, data = df, family = "binomial")
#(d) Apply this model to the training data in order to obtain a predicted
#class label for each training observation. Plot the observations,
#colored according to the predicted class labels. The decision boundary
#should be linear.
df$pred1 <- ifelse(predict(glm1, type = "response") > 0.5, 1, 0)
ggplot(df, aes(x = x1, y = x2, color = as.factor(pred1))) +
geom_point() +
labs(title = "Logistic Regression (Linear Boundary)",
x = "X1", y = "X2", color = "Predicted") +
theme_minimal()
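# Optional check (not required by the question): predicting over a grid of
# (x1, x2) values makes the fitted boundary itself visible. The grid object
# below is a new name introduced only for this illustration.
grid <- expand.grid(x1 = seq(-0.5, 0.5, length.out = 200),
                    x2 = seq(-0.5, 0.5, length.out = 200))
grid$pred <- as.factor(ifelse(predict(glm1, newdata = grid, type = "response") > 0.5, 1, 0))
ggplot() +
  geom_tile(data = grid, aes(x = x1, y = x2, fill = pred), alpha = 0.3) +
  geom_point(data = df, aes(x = x1, y = x2, color = y)) +
  labs(title = "Fitted Linear Boundary (Grid of Predictions)",
       x = "X1", y = "X2", fill = "Predicted", color = "True") +
  theme_minimal()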
#(e) Now fit a logistic regression model to the data using non-linear
#functions of X1 and X2 as predictors (e.g. X1^2, X1×X2, log(X2), and
#so forth).
glm2 <- glm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2), data = df, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#(f) Apply this model to the training data in order to obtain a predicted
#class label for each training observation. Plot the observations,
#colored according to the predicted class labels. The decision boundary
#should be obviously non-linear. If it is not, then repeat (a)-(e) until
#you come up with an example in which the predicted class labels are
#obviously non-linear.
df$pred2 <- ifelse(predict(glm2, type = "response") > 0.5, 1, 0)
ggplot(df, aes(x = x1, y = x2, color = as.factor(pred2))) +
geom_point() +
labs(title = "Logistic Regression (Non-Linear Boundary)",
x = "X1", y = "X2", color = "Predicted") +
theme_minimal()
#The convergence warnings above occur because the quadratic terms separate the
#two classes perfectly, so the fitted probabilities are pushed to 0 or 1. The
#resulting decision boundary is clearly non-linear and closely matches the true
#quadratic split.
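# To quantify the improvement (a minimal sketch; err_lr_linear and err_lr_poly
# are new names introduced here), compare the training error rates of the two fits:
err_lr_linear <- mean(df$pred1 != as.numeric(as.character(df$y)))
err_lr_poly   <- mean(df$pred2 != as.numeric(as.character(df$y)))
c(linear = err_lr_linear, quadratic = err_lr_poly)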
#(g) Fit a support vector classifier to the data with X1 and X2 as
#predictors. Obtain a class prediction for each training observation.
#Plot the observations, colored according to the predicted class labels.
library(e1071)
svm_linear <- svm(y ~ x1 + x2, data = df, kernel = "linear", cost = 10, scale = FALSE)
df$svm_pred_linear <- predict(svm_linear)
ggplot(df, aes(x = x1, y = x2, color = svm_pred_linear)) +
geom_point() +
labs(title = "SVM with Linear Kernel",
x = "X1", y = "X2") +
theme_minimal()
#(h) Fit an SVM using a non-linear kernel to the data. Obtain a class
#prediction for each training observation. Plot the observations,
#colored according to the predicted class labels.
svm_radial <- svm(y ~ x1 + x2, data = df, kernel = "radial", gamma = 1, cost = 10, scale = FALSE)
df$svm_pred_rbf <- predict(svm_radial)
ggplot(df, aes(x = x1, y = x2, color = svm_pred_rbf)) +
geom_point() +
labs(title = "SVM with RBF Kernel",
x = "X1", y = "X2") +
theme_minimal()
#(i) Comment on your results.
#Linear logistic regression and the linear SVM both fail to capture the curved
#boundary, producing straight-line separations.
#Non-linear logistic regression (via the quadratic terms) captures the
#curvature well.
#The SVM with the RBF kernel performs best: its decision boundary is flexible
#and adapts to the non-linear shape of the true boundary.
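# A quick numeric summary supports these comments (a sketch using the prediction
# columns created above; all four models are evaluated on the training data):
sapply(list(logistic_linear = df$pred1,
            logistic_poly   = df$pred2,
            svm_linear      = df$svm_pred_linear,
            svm_radial      = df$svm_pred_rbf),
       function(p) mean(as.character(p) != as.character(df$y)))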
##Question 7)
#In this problem, you will use support vector approaches in order to
#predict whether a given car gets high or low gas mileage based on the
#Auto data set.
#(a) Create a binary variable that takes on a 1 for cars with gas
#mileage above the median, and a 0 for cars with gas mileage below
#the median.
library(ISLR2)
library(e1071)
# Remove rows with missing data
Auto <- na.omit(Auto)
# Create binary response variable
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- as.factor(Auto$mpg01)
# Remove mpg to avoid leakage
Auto_svm <- Auto[, !(names(Auto) %in% c("mpg"))]
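# Quick sanity check: the median split should give roughly balanced classes.
table(Auto_svm$mpg01)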
#(b) Fit a support vector classifier to the data with various values
#of cost, in order to predict whether a car gets high or low gas
#mileage. Report the cross-validation errors associated with different
#values of this parameter. Comment on your results.
#Note: you will need to fit the classifier without the gas mileage
#variable to produce sensible results.
set.seed(1)
# Tune cost parameter using 10-fold cross-validation
tune_linear <- tune(svm, mpg01 ~ ., data = Auto_svm,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(tune_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.08673077
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.08923077 0.04698309
## 2 1e-01 0.08673077 0.04040897
## 3 1e+00 0.09961538 0.04923181
## 4 1e+01 0.11237179 0.05701890
## 5 1e+02 0.11750000 0.06208951
#Best CV error: 0.0867 at cost = 0.1. Performance is fairly flat for small cost
#values, while larger costs (10, 100) give somewhat higher error, suggesting a
#wider margin generalizes better here.
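# The CV error profile can also be inspected graphically with e1071's plot
# method for tune objects:
plot(tune_linear)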
#(c) Now repeat (b), this time using SVMs with radial and polynomial
#basis kernels, with different values of gamma and degree and cost.
#Comment on your results.
#Radial Kernel
set.seed(1)
tune_radial <- tune(svm, mpg01 ~ ., data = Auto_svm,
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10), gamma = c(0.5, 1, 2)))
summary(tune_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 1
##
## - best performance: 0.07897436
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.5 0.08410256 0.04164179
## 2 1.0 0.5 0.08673077 0.04708817
## 3 10.0 0.5 0.09173077 0.04008042
## 4 0.1 1.0 0.55115385 0.04366593
## 5 1.0 1.0 0.07903846 0.04891067
## 6 10.0 1.0 0.07897436 0.04869339
## 7 0.1 2.0 0.55115385 0.04366593
## 8 1.0 2.0 0.13769231 0.06926822
## 9 10.0 2.0 0.13512821 0.06692968
#Best CV error: 0.0790 at cost = 10 and gamma = 1, slightly better than the
#linear kernel. Note that gamma of 1 or 2 combined with cost = 0.1 gives about
#0.55 error, close to what always predicting a single class would achieve.
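# With two tuning parameters, the plot method for tune objects shows CV error
# over the cost/gamma grid:
plot(tune_radial)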
#Polynomial Kernel
set.seed(1)
tune_poly <- tune(svm, mpg01 ~ ., data = Auto_svm,
kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 10), degree = c(2, 3, 4)))
summary(tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.520641
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5511538 0.04366593
## 2 1.0 2 0.5511538 0.04366593
## 3 10.0 2 0.5206410 0.08505283
## 4 0.1 3 0.5511538 0.04366593
## 5 1.0 3 0.5511538 0.04366593
## 6 10.0 3 0.5511538 0.04366593
## 7 0.1 4 0.5511538 0.04366593
## 8 1.0 4 0.5511538 0.04366593
## 9 10.0 4 0.5511538 0.04366593
#Best CV error: 0.52, barely better than always predicting a single class. With
#these settings the polynomial kernel underfits badly; a wider cost grid might
#help (see the sketch below).
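# One possible follow-up (a hedged sketch, not required by the question): widen
# the cost grid, since the polynomial kernel may simply need a much larger cost.
# tune_poly_wide is a new object name introduced for this illustration.
set.seed(1)
tune_poly_wide <- tune(svm, mpg01 ~ ., data = Auto_svm,
                       kernel = "polynomial",
                       ranges = list(cost = c(1, 10, 100, 1000), degree = c(2, 3)))
summary(tune_poly_wide)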
#(d) Make some plots to back up your assertions in (b) and (c).
#Hint: In the lab, we used the plot() function for svm objects
#only in cases with p = 2. When p > 2, you can use the plot()
#function to create plots displaying pairs of variables at a time.
#Essentially, instead of typing
#> plot(svmfit, dat)
#where svmfit contains your fitted model and dat is a data frame
#containing your data, you can type
#> plot(svmfit, dat, x1 ~ x4)
#in order to plot just the first and fourth variables. However, you
#must replace x1 and x4 with the correct variable names. To find
#out more, type ?plot.svm.
best_radial <- tune_radial$best.model
plot(best_radial, Auto_svm, horsepower ~ weight)
plot(best_radial, Auto_svm, acceleration ~ displacement)
plot(tune_linear$best.model, Auto_svm, horsepower ~ weight)
#The linear kernel works decently; however, the tuned radial kernel achieves a
#slightly lower CV error (0.079 vs. 0.087).
#Radial SVM is flexible and adapts better to non-linear boundaries.
#The polynomial kernel performed poorly at these settings; in general it can
#also overfit if the degree or cost is too large.
#Cross-validation helps prevent overfitting by selecting the optimal complexity.
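# Collecting the best CV errors from the three tuning runs summarizes the comparison:
c(linear     = tune_linear$best.performance,
  radial     = tune_radial$best.performance,
  polynomial = tune_poly$best.performance)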
##Question 8)
#This problem involves the OJ data set which is part of the ISLR2 package.
#(a) Create a training set containing a random sample of 800
#observations, and a test set containing the remaining observations.
library(ISLR2)
library(e1071)
set.seed(1)
train_indices <- sample(1:nrow(OJ), 800)
oj_train <- OJ[train_indices, ]
oj_test <- OJ[-train_indices, ]
#(b) Fit a support vector classifier to the training data using
#cost = 0.01, with Purchase as the response and the other variables
#as predictors. Use the summary() function to produce summary
#statistics, and describe the results obtained.
svm_linear <- svm(Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01, scale = TRUE)
summary(svm_linear)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01,
## scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 435
##
## ( 219 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
#The model is a C-classification SVM with a linear kernel and cost = 0.01. It
#uses 435 support vectors (219 and 216 from the two classes), more than half of
#the training observations; such a high count reflects the wide margin at this
#small cost and may indicate underfitting.
#(c) What are the training and test error rates?
# Training error
train_pred_linear <- predict(svm_linear, oj_train)
train_error_linear <- mean(train_pred_linear != oj_train$Purchase)
# Test error
test_pred_linear <- predict(svm_linear, oj_test)
test_error_linear <- mean(test_pred_linear != oj_test$Purchase)
train_error_linear #: 0.175
## [1] 0.175
test_error_linear #: 0.1777778
## [1] 0.1777778
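# Optional: a confusion matrix on the test set gives more detail than the
# overall error rate.
table(Predicted = test_pred_linear, Actual = oj_test$Purchase)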
#(d) Use the tune() function to select an optimal cost. Consider values
#in the range 0.01 to 10.
set.seed(2)
tune_linear <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(tune_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.17
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17625 0.04059026
## 2 0.10 0.17125 0.04168749
## 3 1.00 0.17000 0.04090979
## 4 10.00 0.17000 0.03736085
#(e) Compute the training and test error rates using this new value
#for cost.
best_linear <- tune_linear$best.model
train_pred_best_linear <- predict(best_linear, oj_train)
train_error_best_linear <- mean(train_pred_best_linear != oj_train$Purchase)
test_pred_best_linear <- predict(best_linear, oj_test)
test_error_best_linear <- mean(test_pred_best_linear != oj_test$Purchase)
train_error_best_linear#: 0.16375
## [1] 0.16375
test_error_best_linear#: 0.1555556
## [1] 0.1555556
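# Side-by-side comparison with the cost = 0.01 fit from part (c):
c(train_cost0.01 = train_error_linear, train_tuned = train_error_best_linear,
  test_cost0.01  = test_error_linear,  test_tuned  = test_error_best_linear)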
#(f) Repeat parts (b) through (e) using a support vector machine
#with a radial kernel. Use the default value for gamma.
# Initial model with default gamma and cost = 0.01
svm_radial <- svm(Purchase ~ ., data = oj_train, kernel = "radial", cost = 0.01)
summary(svm_radial)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 634
##
## ( 319 315 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
# Errors
train_pred_radial <- predict(svm_radial, oj_train)
test_pred_radial <- predict(svm_radial, oj_test)
train_error_radial <- mean(train_pred_radial != oj_train$Purchase)
test_error_radial <- mean(test_pred_radial != oj_test$Purchase)
# Tune RBF model
set.seed(3)
tune_radial <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "radial",
ranges = list(cost = c(0.01, 0.1, 1, 10)))
best_radial <- tune_radial$best.model
# Best errors
train_pred_best_radial <- predict(best_radial, oj_train)
test_pred_best_radial <- predict(best_radial, oj_test)
train_error_best_radial <- mean(train_pred_best_radial != oj_train$Purchase)
test_error_best_radial <- mean(test_pred_best_radial != oj_test$Purchase)
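# The radial-kernel error rates are collected in the part (h) table below; to
# inspect them here:
c(train_default = train_error_radial, test_default = test_error_radial,
  train_tuned   = train_error_best_radial, test_tuned = test_error_best_radial)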
#(g) Repeat parts (b) through (e) using a support vector machine
#with a polynomial kernel. Set degree = 2.
# Fit SVM with polynomial kernel
svm_poly <- svm(Purchase ~ ., data = oj_train, kernel = "polynomial", degree = 2, cost = 0.01)
summary(svm_poly)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "polynomial",
## degree = 2, cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 636
##
## ( 321 315 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
# Errors
train_pred_poly <- predict(svm_poly, oj_train)
test_pred_poly <- predict(svm_poly, oj_test)
train_error_poly <- mean(train_pred_poly != oj_train$Purchase)
test_error_poly <- mean(test_pred_poly != oj_test$Purchase)
# Tune polynomial kernel (degree fixed at 2)
set.seed(4)
tune_poly <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "polynomial",
degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 10)))
best_poly <- tune_poly$best.model
train_pred_best_poly <- predict(best_poly, oj_train)
test_pred_best_poly <- predict(best_poly, oj_test)
train_error_best_poly <- mean(train_pred_best_poly != oj_train$Purchase)
test_error_best_poly <- mean(test_pred_best_poly != oj_test$Purchase)
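# Polynomial-kernel error rates (also summarized in part (h)):
c(train_default = train_error_poly, test_default = test_error_poly,
  train_tuned   = train_error_best_poly, test_tuned = test_error_best_poly)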
#(h) Overall, which approach seems to give the best results on this
#data?
results <- data.frame(
Model = c("Linear (cost=0.01)", "Linear (tuned)",
"Radial (cost=0.01)", "Radial (tuned)",
"Poly (cost=0.01)", "Poly (tuned)"),
Train_Error = c(train_error_linear, train_error_best_linear,
train_error_radial, train_error_best_radial,
train_error_poly, train_error_best_poly),
Test_Error = c(test_error_linear, test_error_best_linear,
test_error_radial, test_error_best_radial,
test_error_poly, test_error_best_poly)
)
print(results)
## Model Train_Error Test_Error
## 1 Linear (cost=0.01) 0.17500 0.1777778
## 2 Linear (tuned) 0.16375 0.1555556
## 3 Radial (cost=0.01) 0.39375 0.3777778
## 4 Radial (tuned) 0.15125 0.1851852
## 5 Poly (cost=0.01) 0.37250 0.3666667
## 6 Poly (tuned) 0.15000 0.1888889
#The best-performing model on the OJ data set is the SVM with a linear kernel
#and a tuned cost, which achieves the lowest test error rate (0.156). The tuned
#radial and polynomial SVMs fit the training data slightly better but have
#higher test error, suggesting the decision boundary between CH and MM is
#approximately linear and does not benefit from non-linear kernels.
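# Sorting the summary table by test error makes the ranking explicit:
results[order(results$Test_Error), ]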