R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load Libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
# Problem 5 We have seen that we can fit an SVM with a non-linear
# kernel in order to perform classification using a non-linear decision
# boundary. We will now see that we can also obtain a non-linear decision
# boundary by performing logistic regression using non-linear transformations
# of the features.

# Part a) Generate a data set with n = 500 and p = 2, such that the observations
# belong to two classes with a quadratic decision boundary
# between them. 

set.seed(7)
x1 <- runif(500) - 0.5 
x2 <- runif(500) - 0.5
y <- 1*(x1^2 - x2^2>0)
# Part (b) Plot the observations, colored according to their class labels.
# Your plot should display X1 on the x-axis, and X2 on the yaxis.

plot(x1[y==0], x2[y==0], col="red", xlab="X1", ylab="X2")
points(x1[y==1], x2[y==1], col="blue")

# Part (c) Fit a logistic regression model to the data, using X1 and X2 as
# predictors.

glm_fit = glm(y ~ x1 + x2, family = binomial)
summary(glm_fit)
## 
## Call:
## glm(formula = y ~ x1 + x2, family = binomial)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.07024    0.08986  -0.782    0.434
## x1           0.29100    0.32001   0.909    0.363
## x2          -0.49403    0.30840  -1.602    0.109
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 692.50  on 499  degrees of freedom
## Residual deviance: 688.94  on 497  degrees of freedom
## AIC: 694.94
## 
## Number of Fisher Scoring iterations: 3
# Part d) Apply this model to the training data in order to obtain a predicted
# class label for each training observation. Plot the observations,
# colored according to the predicted class labels. The
# decision boundary should be linear.

data = data.frame(x1 = x1, x2 = x2, y = y)
lm_prob = predict(glm_fit, data, type = "response")
lm_pred = ifelse(lm_prob > 0.50, 1, 0)
data_pos = data[lm_pred == 1, ]
data_neg = data[lm_pred == 0, ]
plot(data_pos$x1, data_pos$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data_neg$x1, data_neg$x2, col = "red")

# Part e) Now fit a logistic regression model to the data using non-linear
# functions of X1 and X2 as predictors (e.g. X2, X1×X2, log(X2),
# and so forth).

glm_fit2 = glm(y ~ poly(x1, 2) + poly(x2, 2), data = data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Part f) Apply this model to the training data in order to obtain a predicted
# class label for each training observation. Plot the observations,
# colored according to the predicted class labels. The
# decision boundary should be obviously non-linear. If it is not,
# then repeat (a)-(e) until you come up with an example in which
# the predicted class labels are obviously non-linear.

lm_prob2 = predict(glm_fit2, data, type = "response")
lm_pred2 = ifelse(lm_prob2 > 0.5, 1, 0)
data_pos2 = data[lm_pred2 == 1, ]
data_neg2 = data[lm_pred2 == 0, ]
plot(data_pos2$x1, data_pos2$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data_neg2$x1, data_neg2$x2, col = "red")

# Part g) Fit a support vector classifier to the data with X1 and X2 as
# predictors. Obtain a class prediction for each training observation.
# Plot the observations, colored according to the predicted
# class labels.

svm_fit = svm(as.factor(y) ~ x1 + x2, data, kernel = "linear", cost = 0.1)
svm_pred = predict(svm_fit, data)
data_pos = data[svm_pred == 1, ]
data_neg = data[svm_pred == 0, ]
plot(data_pos$x1, data_pos$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data_neg$x1, data_neg$x2, col = "red")

# Part h) Fit a SVM using a non-linear kernel to the data. Obtain a class
# prediction for each training observation. Plot the observations,
# colored according to the predicted class labels.

svm_fit2 = svm(as.factor(y) ~ x1 + x2, data, gamma = 1)
svm_pred2 = predict(svm_fit2, data)
data.pos = data[svm_pred2 == 1, ]
data.neg = data[svm_pred2 == 0, ]
plot(data.pos$x1, data.pos$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data.neg$x1, data.neg$x2, col = "red")

# Part (i) Comment on your results.
# The non-linear boundaries can be seen in most results except
# from part d) which shows the parameters that separate
# plots form linear vs non linear. 
# Problem 7 In this problem, you will use support vector approaches in order to
# predict whether a given car gets high or low gas mileage based on the
# Auto data set.

str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Part a) Create a binary variable that takes on a 1 for cars with gas
# mileage above the median, and a 0 for cars with gas mileage
# below the median.

gas_med = median(Auto$mpg)
new_bi_var = ifelse(Auto$mpg > gas_med, 1, 0)
Auto$mpglevel = as.factor(new_bi_var)
# Part b) (b) Fit a support vector classifier to the data with various values
# of cost, in order to predict whether a car gets high or low gas
# mileage. Report the cross-validation errors associated with different
# values of this parameter. Comment on your results. Note
# you will need to fit the classifier without the gas mileage variable
# to produce sensible results.

set.seed(1)
tune_out = tune(svm, mpglevel ~ ., data = Auto, kernel = "linear",
                ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune_out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.01025641 
## 
## - Detailed performance results:
##    cost      error dispersion
## 1 1e-02 0.07653846 0.03617137
## 2 1e-01 0.04596154 0.03378238
## 3 1e+00 0.01025641 0.01792836
## 4 5e+00 0.02051282 0.02648194
## 5 1e+01 0.02051282 0.02648194
## 6 1e+02 0.03076923 0.03151981
# Based on the results when cost is 1 then the error rate is at its lowest
# in comparison to other cost. 
# Part (c) Now repeat (b), this time using SVMs with radial and polynomial
# basis kernels, with different values of gamma and degree and
# cost. Comment on your results.

set.seed(7)
tune_out2 = tune(svm, mpglevel ~ ., data = Auto, kernel = "polynomial",
                 ranges = list(cost = c(0.1, 1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out2)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree
##    10      2
## 
## - best performance: 0.5207051 
## 
## - Detailed performance results:
##    cost degree     error dispersion
## 1   0.1      2 0.5585897 0.03870200
## 2   1.0      2 0.5585897 0.03870200
## 3   5.0      2 0.5585897 0.03870200
## 4  10.0      2 0.5207051 0.09208784
## 5   0.1      3 0.5585897 0.03870200
## 6   1.0      3 0.5585897 0.03870200
## 7   5.0      3 0.5585897 0.03870200
## 8  10.0      3 0.5585897 0.03870200
## 9   0.1      4 0.5585897 0.03870200
## 10  1.0      4 0.5585897 0.03870200
## 11  5.0      4 0.5585897 0.03870200
## 12 10.0      4 0.5585897 0.03870200
# Based on the results when cost is 10 and degree is 2
# then the error rate is at its lowest in comparison to other cost. 
set.seed(7)
tune_out3 = tune(svm, mpglevel ~ ., data = Auto, kernel = "radial",
                 ranges = list(cost = c(0.1, 1, 5, 10),
                               gamma = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune_out3)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10   0.1
## 
## - best performance: 0.02794872 
## 
## - Detailed performance results:
##    cost gamma      error dispersion
## 1   0.1 1e-02 0.09198718 0.04227017
## 2   1.0 1e-02 0.07403846 0.03704323
## 3   5.0 1e-02 0.05352564 0.03286051
## 4  10.0 1e-02 0.02801282 0.03036301
## 5   0.1 1e-01 0.07660256 0.04179222
## 6   1.0 1e-01 0.05871795 0.02711605
## 7   5.0 1e-01 0.03307692 0.02397013
## 8  10.0 1e-01 0.02794872 0.02199962
## 9   0.1 1e+00 0.55858974 0.03870200
## 10  1.0 1e+00 0.06897436 0.04669593
## 11  5.0 1e+00 0.06897436 0.04669593
## 12 10.0 1e+00 0.06897436 0.04669593
## 13  0.1 5e+00 0.55858974 0.03870200
## 14  1.0 5e+00 0.50256410 0.04642607
## 15  5.0 5e+00 0.49743590 0.04482497
## 16 10.0 5e+00 0.49743590 0.04482497
## 17  0.1 1e+01 0.55858974 0.03870200
## 18  1.0 1e+01 0.51782051 0.05141170
## 19  5.0 1e+01 0.51782051 0.05141170
## 20 10.0 1e+01 0.51782051 0.05141170
## 21  0.1 1e+02 0.55858974 0.03870200
## 22  1.0 1e+02 0.55858974 0.03870200
## 23  5.0 1e+02 0.55858974 0.03870200
## 24 10.0 1e+02 0.55858974 0.03870200
# Based on the results when cost is 10 and gamma is 0.1
# then the error rate is at its lowest in comparison to other cost. 
# Part(d) Make some plots to back up your assertions in (b) and (c).
svm_lin = svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm_poly = svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 10, 
    degree = 2)
svm_radial = svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10,
                 gamma = 0.1)
plotpairs = function(fit) {
    for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
        plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
    }
}
plotpairs(svm_lin)

plotpairs(svm_poly)

plotpairs(svm_radial)

# Problem 8 This problem involves the OJ data set which is part of the ISLR2
# package.
str(OJ)
## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...
# Part (a) Create a training set containing a random sample of 800
# observations, and a test set containing the remaining
# observations.

set.seed(7)
data_Train = sample(nrow(OJ), 800)
oj_train = OJ[data_Train,]
oj_test = OJ[-data_Train,]
# Part b) Fit a support vector classifier to the training data using
# cost = 0.01, with Purchase as the response and the other variables
# as predictors. Use the summary() function to produce summary
# statistics, and describe the results obtained.

svc = svm(Purchase~., data = oj_train, kernel = 'linear',cost = 0.01)
summary(svc)
## 
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  438
## 
##  ( 218 220 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
# From the results generated using the summary function, 438 vectors
# have been created. From these vectors 218 belong to CH (Citrus Hill)
# and 220 belong to Minute Maid (MM).
# Part (c) What are the training and test error rates?

train_pred = predict(svc, oj_train)
table(oj_train$Purchase, train_pred)
##     train_pred
##       CH  MM
##   CH 424  60
##   MM  76 240
136/800
## [1] 0.17
# Train error is 0.17
test_pred = predict(svc, oj_test)
table(oj_test$Purchase, test_pred)
##     test_pred
##       CH  MM
##   CH 154  15
##   MM  29  72
44/270
## [1] 0.162963
# Test error is 0.162963
# Part (d) Use the tune() function to select an optimal cost. Consider values
# in the range 0.01 to 10.

set.seed(7)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "linear",
                ranges = list(cost = c(0.01,0.1,1,10)))
summary(tune_svc)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##    10
## 
## - best performance: 0.17375 
## 
## - Detailed performance results:
##    cost   error dispersion
## 1  0.01 0.17750 0.03106892
## 2  0.10 0.17750 0.03270236
## 3  1.00 0.17625 0.02531057
## 4 10.00 0.17375 0.02316157
# The optimal cost is 10 
# Part (e) Compute the training and test error rates using this new value
# for cost.

svm_lin_1 = svm(Purchase ~ ., kernel = "linear", data = oj_train,
                cost = tune_out$best.parameters$cost)
pred_train_1 = predict(svm_lin_1, oj_train)
table(oj_train$Purchase, pred_train_1)
##     pred_train_1
##       CH  MM
##   CH 423  61
##   MM  73 243
134/800
## [1] 0.1675
# The train error is 0.1675
test_pred_1 = predict(svm_lin_1, oj_test)
table(oj_test$Purchase, test_pred_1)
##     test_pred_1
##       CH  MM
##   CH 153  16
##   MM  25  76
41/270
## [1] 0.1518519
# Test error is 0.1518519
# Part (f) Repeat parts (b) through (e) using a support vector machine
# with a radial kernel. Use the default value for gamma.

set.seed(7)
svm_rad1 = svm(Purchase ~ ., data = oj_train, kernel = "radial")
summary(svm_rad1)
## 
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  381
## 
##  ( 194 187 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
pred_train_2 = predict(svm_rad1, oj_train)
table(oj_train$Purchase, pred_train_2)
##     pred_train_2
##       CH  MM
##   CH 433  51
##   MM  75 241
# Train error calculation
(51+75)/(433+51+75+241)
## [1] 0.1575
# Train error is 0.1575
test_pred2 = predict(svm_rad1, oj_test)
table(oj_test$Purchase, test_pred2)
##     test_pred2
##       CH  MM
##   CH 158  11
##   MM  29  72
# Test error calculation 
(11+29)/(158+11+29+72)
## [1] 0.1481481
# Test error is 0.1481481
svm_rad1 = svm(Purchase ~ ., data = oj_train, kernel = "radial",
                cost = tune_svc$best.parameters$cost)
pred_train_1 = predict(svm_rad1, oj_train)
table(oj_train$Purchase, pred_train_1)
##     pred_train_1
##       CH  MM
##   CH 439  45
##   MM  73 243
(45+73)/(439+45+73+243)
## [1] 0.1475
# Train error 0.1475
test_pred_3 = predict(svm_rad1, oj_test)
table(oj_test$Purchase, test_pred_3)
##     test_pred_3
##       CH  MM
##   CH 157  12
##   MM  31  70
(12+31)/(157+12+31+70)
## [1] 0.1592593
# Test error 0.1592593
# Part (g) Repeat parts (b) through (e) using a support vector machine
# with a polynomial kernel. Set degree = 2.

set.seed(7)
svm_pol1 = svm(Purchase ~ ., kernel = "poly", data = oj_train, degree=2)
summary(svm_pol1)
## 
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "poly", degree = 2)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  460
## 
##  ( 234 226 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
pred_train4 = predict(svm_pol1, oj_train)
table(oj_train$Purchase, pred_train4)
##     pred_train4
##       CH  MM
##   CH 447  37
##   MM 115 201
(37+115)/(447+37+115+201)
## [1] 0.19
# Train error 0.19
test_pred4 = predict(svm_pol1, oj_test)
table(oj_test$Purchase, test_pred4)
##     test_pred4
##       CH  MM
##   CH 162   7
##   MM  39  62
(39+7)/(162+7+39+62)
## [1] 0.1703704
# Test error is 0.1703704
# Part (h) Overall, which approach seems to give the best results on this
# data?
# Based on the results it appears that radial kernal is the best.