##Question 5)
#We have seen that we can fit an SVM with a non-linear kernel in order
#to perform classification using a non-linear decision boundary. We will
#now see that we can also obtain a non-linear decision boundary by
#performing logistic regression using non-linear transformations of the
#features.
#(a) Generate a data set with n = 500 and p = 2, such that the observations
#belong to two classes with a quadratic decision boundary between them.
#For instance, you can do this as follows:
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- 1 * (x1^2 - x2^2 > 0)
df <- data.frame(x1 = x1, x2 = x2, y = as.factor(y))
#(b) Plot the observations, colored according to their class labels.
#Your plot should display X1 on the x-axis, and X2 on the y-axis.
library(ggplot2)
ggplot(df, aes(x = x1, y = x2, color = y)) +
geom_point() +
labs(title = "Original Data Colored by True Class",
x = "X1", y = "X2") +
theme_minimal()
#(c) Fit a logistic regression model to the data, using X1 and X2 as
#predictors.
glm1 <- glm(y ~ x1 + x2, data = df, family = "binomial")
#(d) Apply this model to the training data in order to obtain a predicted
#class label for each training observation. Plot the observations,
#colored according to the predicted class labels. The decision boundary
#should be linear.
df$pred1 <- ifelse(predict(glm1, type = "response") > 0.5, 1, 0)
ggplot(df, aes(x = x1, y = x2, color = as.factor(pred1))) +
geom_point() +
labs(title = "Logistic Regression (Linear Boundary)",
x = "X1", y = "X2", color = "Predicted") +
theme_minimal()
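# Optional check (not required by the question): predicting over a grid of
# (x1, x2) values makes the fitted boundary itself visible. The grid object
# below is a new name introduced only for this illustration.
grid <- expand.grid(x1 = seq(-0.5, 0.5, length.out = 200),
                    x2 = seq(-0.5, 0.5, length.out = 200))
grid$pred <- as.factor(ifelse(predict(glm1, newdata = grid, type = "response") > 0.5, 1, 0))
ggplot() +
  geom_tile(data = grid, aes(x = x1, y = x2, fill = pred), alpha = 0.3) +
  geom_point(data = df, aes(x = x1, y = x2, color = y)) +
  labs(title = "Fitted Linear Boundary (Grid of Predictions)",
       x = "X1", y = "X2", fill = "Predicted", color = "True") +
  theme_minimal()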
#(e) Now fit a logistic regression model to the data using non-linear
#functions of X1 and X2 as predictors (e.g. X1^2, X1×X2, log(X2), and
#so forth).
glm2 <- glm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2), data = df, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#(f) Apply this model to the training data in order to obtain a predicted
#class label for each training observation. Plot the observations,
#colored according to the predicted class labels. The decision boundary
#should be obviously non-linear. If it is not, then repeat (a)-(e) until
#you come up with an example in which the predicted class labels are
#obviously non-linear.
df$pred2 <- ifelse(predict(glm2, type = "response") > 0.5, 1, 0)
ggplot(df, aes(x = x1, y = x2, color = as.factor(pred2))) +
geom_point() +
labs(title = "Logistic Regression (Non-Linear Boundary)",
x = "X1", y = "X2", color = "Predicted") +
theme_minimal()
#The convergence warnings above occur because the quadratic terms separate the
#two classes perfectly, so the fitted probabilities are pushed to 0 or 1. The
#resulting decision boundary is clearly non-linear and closely matches the true
#quadratic split.
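# To quantify the improvement (a minimal sketch; err_lr_linear and err_lr_poly
# are new names introduced here), compare the training error rates of the two fits:
err_lr_linear <- mean(df$pred1 != as.numeric(as.character(df$y)))
err_lr_poly   <- mean(df$pred2 != as.numeric(as.character(df$y)))
c(linear = err_lr_linear, quadratic = err_lr_poly)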
#(g) Fit a support vector classifier to the data with X1 and X2 as
#predictors. Obtain a class prediction for each training observation.
#Plot the observations, colored according to the predicted class labels.
library(e1071)
svm_linear <- svm(y ~ x1 + x2, data = df, kernel = "linear", cost = 10, scale = FALSE)
df$svm_pred_linear <- predict(svm_linear)
ggplot(df, aes(x = x1, y = x2, color = svm_pred_linear)) +
geom_point() +
labs(title = "SVM with Linear Kernel",
x = "X1", y = "X2") +
theme_minimal()
#(h) Fit an SVM using a non-linear kernel to the data. Obtain a class
#prediction for each training observation. Plot the observations,
#colored according to the predicted class labels.
svm_radial <- svm(y ~ x1 + x2, data = df, kernel = "radial", gamma = 1, cost = 10, scale = FALSE)
df$svm_pred_rbf <- predict(svm_radial)
ggplot(df, aes(x = x1, y = x2, color = svm_pred_rbf)) +
geom_point() +
labs(title = "SVM with RBF Kernel",
x = "X1", y = "X2") +
theme_minimal()
#(i) Comment on your results.
#Linear logistic regression and the linear SVM both fail to capture the curved
#boundary, producing straight-line separations.
#Non-linear logistic regression (via the quadratic terms) captures the
#curvature well.
#The SVM with the RBF kernel performs best: its decision boundary is flexible
#and adapts to the non-linear shape of the true boundary.
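# A quick numeric summary supports these comments (a sketch using the prediction
# columns created above; all four models are evaluated on the training data):
sapply(list(logistic_linear = df$pred1,
            logistic_poly   = df$pred2,
            svm_linear      = df$svm_pred_linear,
            svm_radial      = df$svm_pred_rbf),
       function(p) mean(as.character(p) != as.character(df$y)))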
##Question 7)
#In this problem, you will use support vector approaches in order to
#predict whether a given car gets high or low gas mileage based on the
#Auto data set.
#(a) Create a binary variable that takes on a 1 for cars with gas
#mileage above the median, and a 0 for cars with gas mileage below
#the median.
library(ISLR2)
library(e1071)
# Remove rows with missing data
Auto <- na.omit(Auto)
# Create binary response variable
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- as.factor(Auto$mpg01)
# Remove mpg to avoid leakage
Auto_svm <- Auto[, !(names(Auto) %in% c("mpg"))]
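# Quick sanity check: the median split should give roughly balanced classes.
table(Auto_svm$mpg01)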
#(b) Fit a support vector classifier to the data with various values
#of cost, in order to predict whether a car gets high or low gas
#mileage. Report the cross-validation errors associated with different
#values of this parameter. Comment on your results.
#Note: you will need to fit the classifier without the gas mileage
#variable to produce sensible results.
set.seed(1)
# Tune cost parameter using 10-fold cross-validation
tune_linear <- tune(svm, mpg01 ~ ., data = Auto_svm,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(tune_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.08673077
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.08923077 0.04698309
## 2 1e-01 0.08673077 0.04040897
## 3 1e+00 0.09961538 0.04923181
## 4 1e+01 0.11237179 0.05701890
## 5 1e+02 0.11750000 0.06208951
#Best CV error: 0.0867 at cost = 0.1. Performance is fairly flat for small cost
#values, while larger costs (10, 100) give somewhat higher error, suggesting a
#wider margin generalizes better here.
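# The CV error profile can also be inspected graphically with e1071's plot
# method for tune objects:
plot(tune_linear)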
#(c) Now repeat (b), this time using SVMs with radial and polynomial
#basis kernels, with different values of gamma and degree and cost.
#Comment on your results.
#Radial Kernel
set.seed(1)
tune_radial <- tune(svm, mpg01 ~ ., data = Auto_svm,
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10), gamma = c(0.5, 1, 2)))
summary(tune_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 1
##
## - best performance: 0.07897436
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.5 0.08410256 0.04164179
## 2 1.0 0.5 0.08673077 0.04708817
## 3 10.0 0.5 0.09173077 0.04008042
## 4 0.1 1.0 0.55115385 0.04366593
## 5 1.0 1.0 0.07903846 0.04891067
## 6 10.0 1.0 0.07897436 0.04869339
## 7 0.1 2.0 0.55115385 0.04366593
## 8 1.0 2.0 0.13769231 0.06926822
## 9 10.0 2.0 0.13512821 0.06692968
#Best CV error: 0.0790 at cost = 10 and gamma = 1, slightly better than the
#linear kernel. Note that gamma of 1 or 2 combined with cost = 0.1 gives about
#0.55 error, close to what always predicting a single class would achieve.
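# With two tuning parameters, the plot method for tune objects shows CV error
# over the cost/gamma grid:
plot(tune_radial)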
#Polynomial Kernel
set.seed(1)
tune_poly <- tune(svm, mpg01 ~ ., data = Auto_svm,
kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 10), degree = c(2, 3, 4)))
summary(tune_poly)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.520641
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5511538 0.04366593
## 2 1.0 2 0.5511538 0.04366593
## 3 10.0 2 0.5206410 0.08505283
## 4 0.1 3 0.5511538 0.04366593
## 5 1.0 3 0.5511538 0.04366593
## 6 10.0 3 0.5511538 0.04366593
## 7 0.1 4 0.5511538 0.04366593
## 8 1.0 4 0.5511538 0.04366593
## 9 10.0 4 0.5511538 0.04366593
#Best CV error: 0.52, barely better than always predicting a single class. With
#these settings the polynomial kernel underfits badly; a wider cost grid might
#help (see the sketch below).
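# One possible follow-up (a hedged sketch, not required by the question): widen
# the cost grid, since the polynomial kernel may simply need a much larger cost.
# tune_poly_wide is a new object name introduced for this illustration.
set.seed(1)
tune_poly_wide <- tune(svm, mpg01 ~ ., data = Auto_svm,
                       kernel = "polynomial",
                       ranges = list(cost = c(1, 10, 100, 1000), degree = c(2, 3)))
summary(tune_poly_wide)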
#(d) Make some plots to back up your assertions in (b) and (c).
#Hint: In the lab, we used the plot() function for svm objects
#only in cases with p = 2. When p > 2, you can use the plot()
#function to create plots displaying pairs of variables at a time.
#Essentially, instead of typing
#> plot(svmfit, dat)
#where svmfit contains your fitted model and dat is a data frame
#containing your data, you can type
#> plot(svmfit, dat, x1 ~ x4)
#in order to plot just the first and fourth variables. However, you
#must replace x1 and x4 with the correct variable names. To find
#out more, type ?plot.svm.
best_radial <- tune_radial$best.model
plot(best_radial, Auto_svm, horsepower ~ weight)
plot(best_radial, Auto_svm, acceleration ~ displacement)
plot(tune_linear$best.model, Auto_svm, horsepower ~ weight)
#The linear kernel works decently; however, the tuned radial kernel achieves a
#slightly lower CV error (0.079 vs. 0.087).
#Radial SVM is flexible and adapts better to non-linear boundaries.
#The polynomial kernel performed poorly at these settings; in general it can
#also overfit if the degree or cost is too large.
#Cross-validation helps prevent overfitting by selecting the optimal complexity.
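# Collecting the best CV errors from the three tuning runs summarizes the comparison:
c(linear     = tune_linear$best.performance,
  radial     = tune_radial$best.performance,
  polynomial = tune_poly$best.performance)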
##Question 8)
#This problem involves the OJ data set which is part of the ISLR2 package.
#(a) Create a training set containing a random sample of 800
#observations, and a test set containing the remaining observations.
library(ISLR2)
library(e1071)
set.seed(1)
train_indices <- sample(1:nrow(OJ), 800)
oj_train <- OJ[train_indices, ]
oj_test <- OJ[-train_indices, ]
#(b) Fit a support vector classifier to the training data using
#cost = 0.01, with Purchase as the response and the other variables
#as predictors. Use the summary() function to produce summary
#statistics, and describe the results obtained.
svm_linear <- svm(Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01, scale = TRUE)
summary(svm_linear)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01,
## scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 435
##
## ( 219 216 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
#The model is a C-classification SVM with a linear kernel and cost = 0.01. It
#uses 435 support vectors (219 and 216 from the two classes), more than half of
#the training observations; such a high count reflects the wide margin at this
#small cost and may indicate underfitting.
#(c) What are the training and test error rates?
# Training error
train_pred_linear <- predict(svm_linear, oj_train)
train_error_linear <- mean(train_pred_linear != oj_train$Purchase)
# Test error
test_pred_linear <- predict(svm_linear, oj_test)
test_error_linear <- mean(test_pred_linear != oj_test$Purchase)
train_error_linear #: 0.175
## [1] 0.175
test_error_linear #: 0.1777778
## [1] 0.1777778
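# Optional: a confusion matrix on the test set gives more detail than the
# overall error rate.
table(Predicted = test_pred_linear, Actual = oj_test$Purchase)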
#(d) Use the tune() function to select an optimal cost. Consider values
#in the range 0.01 to 10.
set.seed(2)
tune_linear <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(tune_linear)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.17
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17625 0.04059026
## 2 0.10 0.17125 0.04168749
## 3 1.00 0.17000 0.04090979
## 4 10.00 0.17000 0.03736085
#(e) Compute the training and test error rates using this new value
#for cost.
best_linear <- tune_linear$best.model
train_pred_best_linear <- predict(best_linear, oj_train)
train_error_best_linear <- mean(train_pred_best_linear != oj_train$Purchase)
test_pred_best_linear <- predict(best_linear, oj_test)
test_error_best_linear <- mean(test_pred_best_linear != oj_test$Purchase)
train_error_best_linear#: 0.16375
## [1] 0.16375
test_error_best_linear#: 0.1555556
## [1] 0.1555556
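# Side-by-side comparison with the cost = 0.01 fit from part (c):
c(train_cost0.01 = train_error_linear, train_tuned = train_error_best_linear,
  test_cost0.01  = test_error_linear,  test_tuned  = test_error_best_linear)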
#(f) Repeat parts (b) through (e) using a support vector machine
#with a radial kernel. Use the default value for gamma.
# Initial model with default gamma and cost = 0.01
svm_radial <- svm(Purchase ~ ., data = oj_train, kernel = "radial", cost = 0.01)
summary(svm_radial)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.01
##
## Number of Support Vectors: 634
##
## ( 319 315 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
# Errors
train_pred_radial <- predict(svm_radial, oj_train)
test_pred_radial <- predict(svm_radial, oj_test)
train_error_radial <- mean(train_pred_radial != oj_train$Purchase)
test_error_radial <- mean(test_pred_radial != oj_test$Purchase)
# Tune RBF model
set.seed(3)
tune_radial <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "radial",
ranges = list(cost = c(0.01, 0.1, 1, 10)))
best_radial <- tune_radial$best.model
# Best errors
train_pred_best_radial <- predict(best_radial, oj_train)
test_pred_best_radial <- predict(best_radial, oj_test)
train_error_best_radial <- mean(train_pred_best_radial != oj_train$Purchase)
test_error_best_radial <- mean(test_pred_best_radial != oj_test$Purchase)
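# The radial-kernel error rates are collected in the part (h) table below; to
# inspect them here:
c(train_default = train_error_radial, test_default = test_error_radial,
  train_tuned   = train_error_best_radial, test_tuned = test_error_best_radial)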
#(g) Repeat parts (b) through (e) using a support vector machine
#with a polynomial kernel. Set degree = 2.
# Fit SVM with polynomial kernel
svm_poly <- svm(Purchase ~ ., data = oj_train, kernel = "polynomial", degree = 2, cost = 0.01)
summary(svm_poly)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "polynomial",
## degree = 2, cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 0.01
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 636
##
## ( 321 315 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
# Errors
train_pred_poly <- predict(svm_poly, oj_train)
test_pred_poly <- predict(svm_poly, oj_test)
train_error_poly <- mean(train_pred_poly != oj_train$Purchase)
test_error_poly <- mean(test_pred_poly != oj_test$Purchase)
# Tune polynomial kernel (degree fixed at 2)
set.seed(4)
tune_poly <- tune(svm, Purchase ~ ., data = oj_train,
kernel = "polynomial",
degree = 2,
ranges = list(cost = c(0.01, 0.1, 1, 10)))
best_poly <- tune_poly$best.model
train_pred_best_poly <- predict(best_poly, oj_train)
test_pred_best_poly <- predict(best_poly, oj_test)
train_error_best_poly <- mean(train_pred_best_poly != oj_train$Purchase)
test_error_best_poly <- mean(test_pred_best_poly != oj_test$Purchase)
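# Polynomial-kernel error rates (also summarized in part (h)):
c(train_default = train_error_poly, test_default = test_error_poly,
  train_tuned   = train_error_best_poly, test_tuned = test_error_best_poly)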
#(h) Overall, which approach seems to give the best results on this
#data?
results <- data.frame(
Model = c("Linear (cost=0.01)", "Linear (tuned)",
"Radial (cost=0.01)", "Radial (tuned)",
"Poly (cost=0.01)", "Poly (tuned)"),
Train_Error = c(train_error_linear, train_error_best_linear,
train_error_radial, train_error_best_radial,
train_error_poly, train_error_best_poly),
Test_Error = c(test_error_linear, test_error_best_linear,
test_error_radial, test_error_best_radial,
test_error_poly, test_error_best_poly)
)
print(results)
## Model Train_Error Test_Error
## 1 Linear (cost=0.01) 0.17500 0.1777778
## 2 Linear (tuned) 0.16375 0.1555556
## 3 Radial (cost=0.01) 0.39375 0.3777778
## 4 Radial (tuned) 0.15125 0.1851852
## 5 Poly (cost=0.01) 0.37250 0.3666667
## 6 Poly (tuned) 0.15000 0.1888889
#The best-performing model on the OJ data set is the SVM with a linear kernel
#and a tuned cost, which achieves the lowest test error rate (0.156). The tuned
#radial and polynomial SVMs fit the training data slightly better but have
#higher test error, suggesting the decision boundary between CH and MM is
#approximately linear and does not benefit from non-linear kernels.
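# Sorting the summary table by test error makes the ranking explicit:
results[order(results$Test_Error), ]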