This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load Libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
# Problem 5 We have seen that we can fit an SVM with a non-linear
# kernel in order to perform classification using a non-linear decision
# boundary. We will now see that we can also obtain a non-linear decision
# boundary by performing logistic regression using non-linear transformations
# of the features.
# Part a) Generate a data set with n = 500 and p = 2, such that the observations
# belong to two classes with a quadratic decision boundary
# between them.
set.seed(7)
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- 1*(x1^2 - x2^2>0)
# Part (b) Plot the observations, colored according to their class labels.
# Your plot should display X1 on the x-axis, and X2 on the yaxis.
plot(x1[y==0], x2[y==0], col="red", xlab="X1", ylab="X2")
points(x1[y==1], x2[y==1], col="blue")
# Part (c) Fit a logistic regression model to the data, using X1 and X2 as
# predictors.
glm_fit = glm(y ~ x1 + x2, family = binomial)
summary(glm_fit)
##
## Call:
## glm(formula = y ~ x1 + x2, family = binomial)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.07024 0.08986 -0.782 0.434
## x1 0.29100 0.32001 0.909 0.363
## x2 -0.49403 0.30840 -1.602 0.109
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.50 on 499 degrees of freedom
## Residual deviance: 688.94 on 497 degrees of freedom
## AIC: 694.94
##
## Number of Fisher Scoring iterations: 3
# Part d) Apply this model to the training data in order to obtain a predicted
# class label for each training observation. Plot the observations,
# colored according to the predicted class labels. The
# decision boundary should be linear.
data = data.frame(x1 = x1, x2 = x2, y = y)
lm_prob = predict(glm_fit, data, type = "response")
lm_pred = ifelse(lm_prob > 0.50, 1, 0)
data_pos = data[lm_pred == 1, ]
data_neg = data[lm_pred == 0, ]
plot(data_pos$x1, data_pos$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data_neg$x1, data_neg$x2, col = "red")
# Part e) Now fit a logistic regression model to the data using non-linear
# functions of X1 and X2 as predictors (e.g. X2, X1×X2, log(X2),
# and so forth).
glm_fit2 = glm(y ~ poly(x1, 2) + poly(x2, 2), data = data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Part f) Apply this model to the training data in order to obtain a predicted
# class label for each training observation. Plot the observations,
# colored according to the predicted class labels. The
# decision boundary should be obviously non-linear. If it is not,
# then repeat (a)-(e) until you come up with an example in which
# the predicted class labels are obviously non-linear.
lm_prob2 = predict(glm_fit2, data, type = "response")
lm_pred2 = ifelse(lm_prob2 > 0.5, 1, 0)
data_pos2 = data[lm_pred2 == 1, ]
data_neg2 = data[lm_pred2 == 0, ]
plot(data_pos2$x1, data_pos2$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data_neg2$x1, data_neg2$x2, col = "red")
# Part g) Fit a support vector classifier to the data with X1 and X2 as
# predictors. Obtain a class prediction for each training observation.
# Plot the observations, colored according to the predicted
# class labels.
svm_fit = svm(as.factor(y) ~ x1 + x2, data, kernel = "linear", cost = 0.1)
svm_pred = predict(svm_fit, data)
data_pos = data[svm_pred == 1, ]
data_neg = data[svm_pred == 0, ]
plot(data_pos$x1, data_pos$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data_neg$x1, data_neg$x2, col = "red")
# Part h) Fit a SVM using a non-linear kernel to the data. Obtain a class
# prediction for each training observation. Plot the observations,
# colored according to the predicted class labels.
svm_fit2 = svm(as.factor(y) ~ x1 + x2, data, gamma = 1)
svm_pred2 = predict(svm_fit2, data)
data.pos = data[svm_pred2 == 1, ]
data.neg = data[svm_pred2 == 0, ]
plot(data.pos$x1, data.pos$x2, col = "blue", xlab = "X1", ylab = "X2")
points(data.neg$x1, data.neg$x2, col = "red")
# Part (i) Comment on your results.
# The non-linear boundaries can be seen in most results except
# from part d) which shows the parameters that separate
# plots form linear vs non linear.
# Problem 7 In this problem, you will use support vector approaches in order to
# predict whether a given car gets high or low gas mileage based on the
# Auto data set.
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Part a) Create a binary variable that takes on a 1 for cars with gas
# mileage above the median, and a 0 for cars with gas mileage
# below the median.
gas_med = median(Auto$mpg)
new_bi_var = ifelse(Auto$mpg > gas_med, 1, 0)
Auto$mpglevel = as.factor(new_bi_var)
# Part b) (b) Fit a support vector classifier to the data with various values
# of cost, in order to predict whether a car gets high or low gas
# mileage. Report the cross-validation errors associated with different
# values of this parameter. Comment on your results. Note
# you will need to fit the classifier without the gas mileage variable
# to produce sensible results.
set.seed(1)
tune_out = tune(svm, mpglevel ~ ., data = Auto, kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune_out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.01025641
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-02 0.07653846 0.03617137
## 2 1e-01 0.04596154 0.03378238
## 3 1e+00 0.01025641 0.01792836
## 4 5e+00 0.02051282 0.02648194
## 5 1e+01 0.02051282 0.02648194
## 6 1e+02 0.03076923 0.03151981
# Based on the results when cost is 1 then the error rate is at its lowest
# in comparison to other cost.
# Part (c) Now repeat (b), this time using SVMs with radial and polynomial
# basis kernels, with different values of gamma and degree and
# cost. Comment on your results.
set.seed(7)
tune_out2 = tune(svm, mpglevel ~ ., data = Auto, kernel = "polynomial",
ranges = list(cost = c(0.1, 1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out2)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost degree
## 10 2
##
## - best performance: 0.5207051
##
## - Detailed performance results:
## cost degree error dispersion
## 1 0.1 2 0.5585897 0.03870200
## 2 1.0 2 0.5585897 0.03870200
## 3 5.0 2 0.5585897 0.03870200
## 4 10.0 2 0.5207051 0.09208784
## 5 0.1 3 0.5585897 0.03870200
## 6 1.0 3 0.5585897 0.03870200
## 7 5.0 3 0.5585897 0.03870200
## 8 10.0 3 0.5585897 0.03870200
## 9 0.1 4 0.5585897 0.03870200
## 10 1.0 4 0.5585897 0.03870200
## 11 5.0 4 0.5585897 0.03870200
## 12 10.0 4 0.5585897 0.03870200
# Based on the results when cost is 10 and degree is 2
# then the error rate is at its lowest in comparison to other cost.
set.seed(7)
tune_out3 = tune(svm, mpglevel ~ ., data = Auto, kernel = "radial",
ranges = list(cost = c(0.1, 1, 5, 10),
gamma = c(0.01, 0.1, 1, 5, 10, 100)))
summary(tune_out3)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.1
##
## - best performance: 0.02794872
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 1e-02 0.09198718 0.04227017
## 2 1.0 1e-02 0.07403846 0.03704323
## 3 5.0 1e-02 0.05352564 0.03286051
## 4 10.0 1e-02 0.02801282 0.03036301
## 5 0.1 1e-01 0.07660256 0.04179222
## 6 1.0 1e-01 0.05871795 0.02711605
## 7 5.0 1e-01 0.03307692 0.02397013
## 8 10.0 1e-01 0.02794872 0.02199962
## 9 0.1 1e+00 0.55858974 0.03870200
## 10 1.0 1e+00 0.06897436 0.04669593
## 11 5.0 1e+00 0.06897436 0.04669593
## 12 10.0 1e+00 0.06897436 0.04669593
## 13 0.1 5e+00 0.55858974 0.03870200
## 14 1.0 5e+00 0.50256410 0.04642607
## 15 5.0 5e+00 0.49743590 0.04482497
## 16 10.0 5e+00 0.49743590 0.04482497
## 17 0.1 1e+01 0.55858974 0.03870200
## 18 1.0 1e+01 0.51782051 0.05141170
## 19 5.0 1e+01 0.51782051 0.05141170
## 20 10.0 1e+01 0.51782051 0.05141170
## 21 0.1 1e+02 0.55858974 0.03870200
## 22 1.0 1e+02 0.55858974 0.03870200
## 23 5.0 1e+02 0.55858974 0.03870200
## 24 10.0 1e+02 0.55858974 0.03870200
# Based on the results when cost is 10 and gamma is 0.1
# then the error rate is at its lowest in comparison to other cost.
# Part(d) Make some plots to back up your assertions in (b) and (c).
svm_lin = svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm_poly = svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 10,
degree = 2)
svm_radial = svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10,
gamma = 0.1)
plotpairs = function(fit) {
for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
}
}
plotpairs(svm_lin)
plotpairs(svm_poly)
plotpairs(svm_radial)
# Problem 8 This problem involves the OJ data set which is part of the ISLR2
# package.
str(OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
# Part (a) Create a training set containing a random sample of 800
# observations, and a test set containing the remaining
# observations.
set.seed(7)
data_Train = sample(nrow(OJ), 800)
oj_train = OJ[data_Train,]
oj_test = OJ[-data_Train,]
# Part b) Fit a support vector classifier to the training data using
# cost = 0.01, with Purchase as the response and the other variables
# as predictors. Use the summary() function to produce summary
# statistics, and describe the results obtained.
svc = svm(Purchase~., data = oj_train, kernel = 'linear',cost = 0.01)
summary(svc)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 438
##
## ( 218 220 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
# From the results generated using the summary function, 438 vectors
# have been created. From these vectors 218 belong to CH (Citrus Hill)
# and 220 belong to Minute Maid (MM).
# Part (c) What are the training and test error rates?
train_pred = predict(svc, oj_train)
table(oj_train$Purchase, train_pred)
## train_pred
## CH MM
## CH 424 60
## MM 76 240
136/800
## [1] 0.17
# Train error is 0.17
test_pred = predict(svc, oj_test)
table(oj_test$Purchase, test_pred)
## test_pred
## CH MM
## CH 154 15
## MM 29 72
44/270
## [1] 0.162963
# Test error is 0.162963
# Part (d) Use the tune() function to select an optimal cost. Consider values
# in the range 0.01 to 10.
set.seed(7)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "linear",
ranges = list(cost = c(0.01,0.1,1,10)))
summary(tune_svc)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 10
##
## - best performance: 0.17375
##
## - Detailed performance results:
## cost error dispersion
## 1 0.01 0.17750 0.03106892
## 2 0.10 0.17750 0.03270236
## 3 1.00 0.17625 0.02531057
## 4 10.00 0.17375 0.02316157
# The optimal cost is 10
# Part (e) Compute the training and test error rates using this new value
# for cost.
svm_lin_1 = svm(Purchase ~ ., kernel = "linear", data = oj_train,
cost = tune_out$best.parameters$cost)
pred_train_1 = predict(svm_lin_1, oj_train)
table(oj_train$Purchase, pred_train_1)
## pred_train_1
## CH MM
## CH 423 61
## MM 73 243
134/800
## [1] 0.1675
# The train error is 0.1675
test_pred_1 = predict(svm_lin_1, oj_test)
table(oj_test$Purchase, test_pred_1)
## test_pred_1
## CH MM
## CH 153 16
## MM 25 76
41/270
## [1] 0.1518519
# Test error is 0.1518519
# Part (f) Repeat parts (b) through (e) using a support vector machine
# with a radial kernel. Use the default value for gamma.
set.seed(7)
svm_rad1 = svm(Purchase ~ ., data = oj_train, kernel = "radial")
summary(svm_rad1)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 381
##
## ( 194 187 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
pred_train_2 = predict(svm_rad1, oj_train)
table(oj_train$Purchase, pred_train_2)
## pred_train_2
## CH MM
## CH 433 51
## MM 75 241
# Train error calculation
(51+75)/(433+51+75+241)
## [1] 0.1575
# Train error is 0.1575
test_pred2 = predict(svm_rad1, oj_test)
table(oj_test$Purchase, test_pred2)
## test_pred2
## CH MM
## CH 158 11
## MM 29 72
# Test error calculation
(11+29)/(158+11+29+72)
## [1] 0.1481481
# Test error is 0.1481481
svm_rad1 = svm(Purchase ~ ., data = oj_train, kernel = "radial",
cost = tune_svc$best.parameters$cost)
pred_train_1 = predict(svm_rad1, oj_train)
table(oj_train$Purchase, pred_train_1)
## pred_train_1
## CH MM
## CH 439 45
## MM 73 243
(45+73)/(439+45+73+243)
## [1] 0.1475
# Train error 0.1475
test_pred_3 = predict(svm_rad1, oj_test)
table(oj_test$Purchase, test_pred_3)
## test_pred_3
## CH MM
## CH 157 12
## MM 31 70
(12+31)/(157+12+31+70)
## [1] 0.1592593
# Test error 0.1592593
# Part (g) Repeat parts (b) through (e) using a support vector machine
# with a polynomial kernel. Set degree = 2.
set.seed(7)
svm_pol1 = svm(Purchase ~ ., kernel = "poly", data = oj_train, degree=2)
summary(svm_pol1)
##
## Call:
## svm(formula = Purchase ~ ., data = oj_train, kernel = "poly", degree = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 460
##
## ( 234 226 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
pred_train4 = predict(svm_pol1, oj_train)
table(oj_train$Purchase, pred_train4)
## pred_train4
## CH MM
## CH 447 37
## MM 115 201
(37+115)/(447+37+115+201)
## [1] 0.19
# Train error 0.19
test_pred4 = predict(svm_pol1, oj_test)
table(oj_test$Purchase, test_pred4)
## test_pred4
## CH MM
## CH 162 7
## MM 39 62
(39+7)/(162+7+39+62)
## [1] 0.1703704
# Test error is 0.1703704
# Part (h) Overall, which approach seems to give the best results on this
# data?
# Based on the results it appears that radial kernal is the best.