Question 4:

library(e1071)
# Simulate 100 points scattered around the parabola y = 4x^2 + 1, then split
# them into two groups of 50 by moving one group up by 3 and the other down
# by 3, so the class boundary is visibly non-linear.
set.seed(123)
x <- rnorm(100)
y <- 4 * x^2 + 1 + rnorm(100)
class <- sample(100, 50) # row indices of the "red" group
y <- y + ifelse(seq_along(y) %in% class, 3, -3)

# Red = shifted up, blue = shifted down.
plot(x[class], y[class], col = "red", xlab = "X", ylab = "Y", ylim = c(-6, 30))
points(x[-class], y[-class], col = "blue")

a. Linear Kernel:

# On training data:
# Encode the two groups as -1 / +1 and hold out half of the rows for testing.
z <- ifelse(seq_len(100) %in% class, 1, -1)
data <- data.frame(x = x, y = y, z = factor(z))
train <- sample(100, 50)
data.train <- data[train, ]
data.test <- data[-train, ]
svm.linear <- svm(z ~ ., data = data.train, kernel = "linear", cost = 10)
plot(svm.linear, data.train)

# Making prediction (confusion matrix on the training half):
table(predict = predict(svm.linear, data.train), truth = data.train[["z"]])
##        truth
## predict -1  1
##      -1 18  1
##      1   6 25
# On testing data:
plot(svm.linear, data.test)

# Making prediction (confusion matrix on the held-out half):
table(predict = predict(svm.linear, data.test), truth = data.test[["z"]])
##        truth
## predict -1  1
##      -1 20  2
##      1   6 22

b. Polynomial Kernel:

# On training data:
# Polynomial kernel; the degree is not set here, so the package default applies.
svm.poly <- svm(z ~ ., data = data.train, kernel = "polynomial", cost = 10)
plot(svm.poly, data.train)

# Making prediction (confusion matrix on the training half):
table(predict = predict(svm.poly, data.train), truth = data.train[["z"]])
##        truth
## predict -1  1
##      -1 18  0
##      1   6 26
# On testing data:
plot(svm.poly, data.test)

# Making predictions (confusion matrix on the held-out half):
table(predict = predict(svm.poly, data.test), truth = data.test[["z"]])
##        truth
## predict -1  1
##      -1 16  0
##      1  10 24

c. Radial Kernel:

# On training data:
# Radial (RBF) kernel with gamma fixed at 1.
svm.radial <- svm(
  z ~ .,
  data = data.train, kernel = "radial", gamma = 1, cost = 10
)
plot(svm.radial, data.train)

# Making prediction (confusion matrix on the training half):
table(predict = predict(svm.radial, data.train), truth = data.train[["z"]])
##        truth
## predict -1  1
##      -1 24  0
##      1   0 26
# On testing data:
plot(svm.radial, data.test)

# Making predictions (confusion matrix on the held-out half):
table(predict = predict(svm.radial, data.test), truth = data.test[["z"]])
##        truth
## predict -1  1
##      -1 25  0
##      1   1 24

Interpretation:

From the above results, we can clearly see that on the training data the linear, polynomial and radial kernels have training errors of 7, 6 and 0 misclassified observations (out of 50), respectively. This shows that the radial kernel outperforms both the linear and polynomial kernels on the training data.

Similarly, on the testing data the linear, polynomial and radial kernels have testing errors of 8, 10 and 1 misclassified observations (out of 50), respectively, which indicates that the radial kernel also outperforms the others on the test data. All of these statements are evidenced by the plots and confusion tables presented above.

Question 7:

a. Binary Variable:

library(ISLR)
library(e1071)

set.seed(1)

# Creating binary mpg variable: "High" when mpg is above the median,
# "Low" otherwise.
data(Auto)
Auto$high_mpg <- factor(
  ifelse(Auto$mpg > median(Auto$mpg), "High", "Low"),
  levels = c("Low", "High")
)

# Dropping mpg from predictors:
# The models below are fitted with `high_mpg ~ ., data = Auto`, so the columns
# must be removed from `Auto` itself. Leaving `mpg` in leaks the response
# (high_mpg is a deterministic function of mpg), and `name` is a per-car
# identifier rather than a predictor.
# NOTE(review): the console output echoed later in this document was recorded
# with mpg and name still among the predictors; re-run to regenerate it.
Auto <- subset(Auto, select = -c(mpg, name))
predictors <- subset(Auto, select = -high_mpg)
y <- Auto$high_mpg

b. Support Vector Classifiers:

# Support Vector Classifier:
# 10-fold cross-validation over cost = 0.01, 0.1, 1, 10, 100.
cost_grid <- 10^(-2:2)
tune_lin <- tune(
  svm, high_mpg ~ .,
  data = Auto,
  kernel = "linear",
  ranges = list(cost = cost_grid),
  tunecontrol = tune.control(cross = 10)
)
print(tune_lin) # best cost and its CV error
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.01025641
best_lin <- tune_lin$best.model

# Plotting CV error vs cost
plot(tune_lin)

# Decision boundary of the best linear model in the (weight, horsepower) plane.
# plot.svm holds every predictor not named in `slice` at 0, which is far
# outside the observed data, so the remaining numeric predictors are fixed at
# their medians instead.
plot(best_lin,
  data = Auto, horsepower ~ weight,
  slice = list(
    cylinders = median(Auto$cylinders),
    displacement = median(Auto$displacement),
    acceleration = median(Auto$acceleration),
    year = median(Auto$year),
    origin = median(Auto$origin)
  )
)

From the above results, the cost of 1 seems to be the best in achieving the lowest cross validation error.

c. Using Kernels:

# Radial kernels:
# Joint 10-fold CV grid over cost and gamma = 0.001, 0.01, 0.1, 1.
gamma_grid <- 10^(-3:0)
tune_rbf <- tune(
  svm, high_mpg ~ .,
  data = Auto,
  kernel = "radial",
  ranges = list(cost = cost_grid, gamma = gamma_grid),
  tunecontrol = tune.control(cross = 10)
)
print(tune_rbf)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   100  0.01
## 
## - best performance: 0.01019231
# Polynomial Kernel:
# Joint 10-fold CV grid over cost and polynomial degree 2, 3, 4.
degree_grid <- seq(2L, 4L)
tune_poly <- tune(
  svm, high_mpg ~ .,
  data = Auto,
  kernel = "polynomial",
  ranges = list(cost = cost_grid, degree = degree_grid),
  tunecontrol = tune.control(cross = 10)
)
print(tune_poly)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree
##   100      2
## 
## - best performance: 0.3167308
# Plotting CV error surfaces
# One panel per tuned kernel: radial first, then polynomial.
par(mfrow = c(1, 2))

plot(tune_rbf, main = "RBF SVM: CV Error")
plot(tune_poly, main = "Poly SVM: CV Error")

For the radial kernel, the lowest cross-validation error (0.0102) is obtained with gamma = 0.01 and cost = 100. For the polynomial kernel, the lowest cross-validation error (0.3167) is obtained with degree = 2 and cost = 100. The radial kernel's error is essentially the same as the linear classifier's (0.0103), and the polynomial kernel is far worse. This suggests that the linear kernel is good enough for this dataset, since the relationship is not complex enough to require non-linear kernels such as radial or polynomial.

d. Plots:

# 2D decision-boundary plots for the best models
# picking two features, e.g., horsepower vs. weight:

best_rbf <- tune_rbf$best.model
best_poly <- tune_poly$best.model

# Values for the predictors that are not on the plot axes. plot.svm holds any
# predictor missing from `slice` at 0, so `cylinders` must be listed too or
# the boundary is drawn for cars with zero cylinders. The list is built once
# so both plots are guaranteed to use the same slice.
median_slice <- list(
  cylinders = median(Auto$cylinders),
  displacement = median(Auto$displacement),
  acceleration = median(Auto$acceleration),
  year = median(Auto$year),
  origin = median(Auto$origin)
)

par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
plot(best_rbf, data = Auto, horsepower ~ weight, slice = median_slice)
title("RBF SVM Decision Boundary")

plot(best_poly, data = Auto, horsepower ~ weight, slice = median_slice)
title("Poly SVM Decision Boundary")

Question 8:

a. Training set:

set.seed(222)

# splitting into train (800) and test (remaining):
# 800 row indices are drawn without replacement; every other row is test data.
n <- nrow(OJ)
train_idx <- sample(seq_len(n), 800)
OJ.train <- OJ[train_idx, ]
OJ.test <- OJ[-train_idx, ]

#' Misclassification rate of a fitted classifier
#'
#' @param model A fitted model object that has a `predict()` method.
#' @param data Data frame to predict on; must contain the `response` column.
#' @param response Name of the column in `data` holding the true labels.
#'   Defaults to "Purchase", so existing two-argument calls are unchanged.
#' @return The proportion of rows whose prediction differs from the true label.
err_rate <- function(model, data, response = "Purchase") {
  # Fail loudly on a missing column: comparing against NULL would otherwise
  # produce an empty vector and a silent NaN error rate.
  stopifnot(
    is.character(response),
    length(response) == 1L,
    response %in% names(data)
  )
  preds <- predict(model, data)
  mean(preds != data[[response]])
}

b. Support vector classifier:

# Linear SVC with cost=0.01:
# Argument order is kept as-is because summary() echoes the call.
svc_lin01 <- svm(
  Purchase ~ .,
  data = OJ.train,
  kernel = "linear", cost = 0.01, scale = TRUE
)

summary(svc_lin01)
## 
## Call:
## svm(formula = Purchase ~ ., data = OJ.train, kernel = "linear", cost = 0.01, 
##     scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  431
## 
##  ( 216 215 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

With a cost of 0.01 and a linear kernel, 431 support vectors are used to define the hyperplane separating the CH and MM classes. Of these 431, 216 belong to class CH (Citrus Hill) and 215 belong to class MM (Minute Maid).

c. Training and Test error rates:

# Misclassification rates of the cost = 0.01 linear classifier.
train_err_lin01 <- err_rate(model = svc_lin01, data = OJ.train)
test_err_lin01 <- err_rate(model = svc_lin01, data = OJ.test)

cat(
  "Linear SVC (C=0.01):\n",
  "  Train error =", round(train_err_lin01, 3), "\n",
  "  Test  error =", round(test_err_lin01, 3), "\n\n"
)
## Linear SVC (C=0.01):
##    Train error = 0.16 
##    Test  error = 0.178

With linear kernel, train error and test error are 16% and 17.8% respectively.

d. Optimal cost:

# 10-fold cross-validation over a small grid of costs for the linear kernel.
tune_lin <- tune(
  svm, Purchase ~ .,
  data = OJ.train,
  kernel = "linear",
  ranges = list(cost = c(0.01, 0.1, 1, 5, 10)),
  tunecontrol = tune.control(cross = 10)
)
best_cost_lin <- tune_lin$best.parameters[["cost"]]
cat("Best linear cost:", best_cost_lin, "\n\n")
## Best linear cost: 0.1

e. New Training and Test error rates:

# Refit the linear classifier at the cross-validated cost and report errors.
svc_lin_best <- svm(
  Purchase ~ .,
  data = OJ.train, kernel = "linear", cost = best_cost_lin
)
cat(
  "Linear SVC (C=", best_cost_lin, "):\n",
  "  Train error =", round(err_rate(svc_lin_best, OJ.train), 3), "\n",
  "  Test  error =", round(err_rate(svc_lin_best, OJ.test), 3), "\n\n"
)
## Linear SVC (C= 0.1 ):
##    Train error = 0.16 
##    Test  error = 0.181

With optimal cost value and linear kernel, train error and test error are 16% and 18.1% respectively.

f. Using Radial Kernel:

library(e1071)
set.seed(1)

# RBF SVM with default gamma and cost = 0.01:
svm_rbf_001 <- svm(
  Purchase ~ .,
  data = OJ.train, kernel = "radial", cost = 0.01
)

# train & test error (via the err_rate() helper defined earlier):
train_err_001 <- err_rate(svm_rbf_001, OJ.train)
test_err_001 <- err_rate(svm_rbf_001, OJ.test)

cat(
  "RBF SVM (cost=0.01):\n",
  sprintf("  Train error = %.3f\n", train_err_001),
  sprintf("  Test  error = %.3f\n\n", test_err_001)
)
## RBF SVM (cost=0.01):
##    Train error = 0.384
##    Test  error = 0.407
# Tuning cost (gamma left at its default = 1/(#features)):
cost.grid <- c(0.01, 0.1, 1, 5, 10)
tune_rbf <- tune(
  svm, Purchase ~ .,
  data = OJ.train,
  kernel = "radial",
  ranges = list(cost = cost.grid),
  tunecontrol = tune.control(cross = 10)
)

best_cost_rbf <- tune_rbf$best.parameters[["cost"]]
cat("Optimal cost (RBF):", best_cost_rbf, "\n\n")
## Optimal cost (RBF): 1
# Refitting at optimal cost and recompute errors:
svm_rbf_best <- svm(
  Purchase ~ .,
  data = OJ.train, kernel = "radial", cost = best_cost_rbf
)

train_err_best <- err_rate(svm_rbf_best, OJ.train)
test_err_best <- err_rate(svm_rbf_best, OJ.test)

cat(
  sprintf("RBF SVM (cost=%.2f) final:\n", best_cost_rbf),
  sprintf("  Train error = %.3f\n", train_err_best),
  sprintf("  Test  error = %.3f\n", test_err_best)
)
## RBF SVM (cost=1.00) final:
##    Train error = 0.145
##    Test  error = 0.193

With radial kernel, train error and test error are 38.4% and 40.7% respectively at cost = 0.01. However after finding optimal cost value of 1, train error and test error are 14.5% and 19.3% respectively.

g. Using Polynomial Kernel:

library(e1071)
set.seed(1)

# Poly-2 SVM with cost = 0.01:
svm_poly_001 <- svm(
  Purchase ~ .,
  data = OJ.train,
  kernel = "polynomial", degree = 2, cost = 0.01, scale = TRUE
)

# train & test error (via the err_rate() helper defined earlier):
train_err_p001 <- err_rate(svm_poly_001, OJ.train)
test_err_p001 <- err_rate(svm_poly_001, OJ.test)

cat(
  "Poly-2 SVM (cost=0.01):\n",
  sprintf("  Train error = %.3f\n", train_err_p001),
  sprintf("  Test  error = %.3f\n\n", test_err_p001)
)
## Poly-2 SVM (cost=0.01):
##    Train error = 0.360
##    Test  error = 0.396
# Tuning cost for Poly-2 kernel:
cost.grid <- c(0.01, 0.1, 1, 5, 10)
tune_poly <- tune(
  svm, Purchase ~ .,
  data = OJ.train,
  kernel = "polynomial",
  degree = 2,
  ranges = list(cost = cost.grid),
  tunecontrol = tune.control(cross = 10)
)

best_cost_poly <- tune_poly$best.parameters[["cost"]]
cat("Optimal cost (Poly-2):", best_cost_poly, "\n\n")
## Optimal cost (Poly-2): 5
# Refitting at optimal cost and recompute errors:
svm_poly_best <- svm(
  Purchase ~ .,
  data = OJ.train,
  kernel = "polynomial", degree = 2, cost = best_cost_poly, scale = TRUE
)

train_err_pbest <- err_rate(svm_poly_best, OJ.train)
test_err_pbest <- err_rate(svm_poly_best, OJ.test)

cat(
  sprintf("Poly-2 SVM (cost=%.2f) final:\n", best_cost_poly),
  sprintf("  Train error = %.3f\n", train_err_pbest),
  sprintf("  Test  error = %.3f\n", test_err_pbest)
)
## Poly-2 SVM (cost=5.00) final:
##    Train error = 0.145
##    Test  error = 0.207

With polynomial kernel, train error and test error are 36% and 39.6% respectively at cost = 0.01. However after finding optimal cost value of 5, train error and test error are 14.5% and 20.7% respectively.

h. Best Results:

| Kernel | Training & Test Error Rate at C = 0.01 | Training & Test Error Rate at Optimal Cost |
|---|---|---|
| 1. Linear Kernel | 16% & 17.8% | 16% & 18.1% (C = 0.1) |
| 2. Radial Kernel | 38.4% & 40.7% | 14.5% & 19.3% (C = 1) |
| 3. Polynomial Kernel | 36% & 39.6% | 14.5% & 20.7% (C = 5) |

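From the above table, we can conclude that at the optimal cost the radial and polynomial kernels achieve the lowest training error (14.5%), while the linear kernel achieves the lowest test error (18.1%, versus 19.3% for the radial kernel and 20.7% for the polynomial kernel). Among the non-linear kernels the radial kernel performs best, but overall the linear kernel generalizes best on this test set.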