Load Required Libraries
library(ISLR2)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.95 loaded
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
Question 13: Weekly Data Set Analysis
13(a) Summary and Visualization
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
weekly_numeric <- dplyr::select(Weekly, -Direction, -Year)
corr_matrix <- cor(weekly_numeric)
corrplot(corr_matrix, method = "circle")

pairs(Weekly[, 2:6], col = ifelse(Weekly$Direction == "Up", "blue", "red"))

ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_line(color = "darkgreen") +
  labs(title = "Volume Over Time") +
  theme_minimal()

table(Weekly$Direction)
##
## Down Up
## 484 605
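Note that "Up" is the majority class, so a trivial rule that always predicts "Up" already achieves about 55.6% accuracy, a useful baseline for the classifiers fit below.
mean(Weekly$Direction == "Up") # baseline: always predict the majority class
## [1] 0.5555556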
13(b) Logistic Regression (Full Data)
log_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Weekly, family = binomial)
summary(log_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
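Only Lag2 is statistically significant at the 5% level (p = 0.0296). The coefficients are on the log-odds scale, so exponentiating gives an odds ratio; for Lag2 this works out to roughly a 6% increase in the odds of an up week per one-unit increase:
exp(coef(log_fit)["Lag2"]) # odds ratio for Lag2, approx. 1.06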
13(c) Confusion Matrix & Accuracy
log_probs <- predict(log_fit, type = "response")
log_pred <- ifelse(log_probs > 0.5, "Up", "Down")
table(Predicted = log_pred, Actual = Weekly$Direction)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
mean(log_pred == Weekly$Direction)
## [1] 0.5610652
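The model predicts "Up" for 987 of the 1,089 weeks, so the 56.1% accuracy largely reflects guessing the majority class: sensitivity for "Up" weeks is high while specificity for "Down" weeks is poor. A short sketch of the class-wise rates:
conf_mat <- table(Predicted = log_pred, Actual = Weekly$Direction)
conf_mat["Up", "Up"] / sum(conf_mat[, "Up"])       # sensitivity: 557/605
## [1] 0.9206612
conf_mat["Down", "Down"] / sum(conf_mat[, "Down"]) # specificity: 54/484
## [1] 0.1115702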
13(d–h) Various Classifiers on Test Data
# Use 1990-2008 for training and hold out 2009-2010 for testing
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008 # logical index for the held-out weeks (equivalent to !train)
# Logistic
log_fit2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
log_probs2 <- predict(log_fit2, Weekly[!train, ], type = "response")
log_pred2 <- ifelse(log_probs2 > 0.5, "Up", "Down")
# LDA
lda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_fit, Weekly[!train, ])
# QDA
qda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_fit, Weekly[!train, ])
# KNN
train_X <- Weekly[train, "Lag2", drop = FALSE] # drop = FALSE keeps a one-column data frame for knn()
test_X <- Weekly[!train, "Lag2", drop = FALSE]
train_labels <- Weekly$Direction[train]
set.seed(1)
knn_pred <- knn(train_X, test_X, train_labels, k = 1)
# Naive Bayes
nb_fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb_pred <- predict(nb_fit, Weekly[!train, ])
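Parts (d) through (h) also ask for the confusion matrix on the held-out 2009-2010 weeks; for the logistic model it is tabulated the same way as in (c), and should agree with the 62.5% accuracy reported in (i):
table(Predicted = log_pred2, Actual = Weekly$Direction[!train])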
13(i) Accuracy Comparison
data.frame(
  Method = c("Logistic", "LDA", "QDA", "KNN (k=1)", "Naive Bayes"),
  Accuracy = c(
    mean(log_pred2 == Weekly$Direction[!train]),
    mean(lda_pred$class == Weekly$Direction[!train]),
    mean(qda_pred$class == Weekly$Direction[!train]),
    mean(knn_pred == Weekly$Direction[!train]),
    mean(nb_pred == Weekly$Direction[!train])
  )
)
## Method Accuracy
## 1 Logistic 0.6250000
## 2 LDA 0.6250000
## 3 QDA 0.5865385
## 4 KNN (k=1) 0.5000000
## 5 Naive Bayes 0.5865385
13(j) Alternative Logistic Model
log_fit3 <- glm(Direction ~ Lag1 * Lag2, data = Weekly, family = binomial, subset = train)
log_probs3 <- predict(log_fit3, Weekly[!train, ], type = "response")
log_pred3 <- ifelse(log_probs3 > 0.5, "Up", "Down")
table(Predicted = log_pred3, Actual = Weekly$Direction[!train])
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
mean(log_pred3 == Weekly$Direction[!train])
## [1] 0.5769231
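Part (j) also suggests experimenting with K for KNN. A minimal sketch over a few values of K, reusing the Lag2 train/test objects from above (the printed accuracies will vary with the seed):
set.seed(1)
for (k in c(1, 5, 10, 20)) {
  pred_k <- knn(train_X, test_X, train_labels, k = k)
  cat("K =", k, "accuracy =", mean(pred_k == Weekly$Direction[!train]), "\n")
}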
Question 14: Auto Data Set - Binary Classification on MPG
14(a–b) Create mpg01 and Explore
data("Auto")
Auto <- na.omit(Auto)
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto <- data.frame(Auto, mpg01)
boxplot(horsepower ~ mpg01, data = Auto)

boxplot(weight ~ mpg01, data = Auto)

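Beyond the boxplots, correlating each numeric variable with mpg01 gives a quick screen for useful predictors; cylinders, displacement, horsepower, and weight show the strongest (negative) associations.
round(cor(Auto[, sapply(Auto, is.numeric)])[, "mpg01"], 2) # correlation with mpg01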
14(c) Split Train/Test
set.seed(1)
train_auto <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
Auto_train <- Auto[train_auto, ]
Auto_test <- Auto[-train_auto, ]
14(d–g) LDA, QDA, Logit, Naive Bayes
lda_a <- lda(mpg01 ~ horsepower + weight, data = Auto_train)
pred_lda <- predict(lda_a, Auto_test)$class
mean(pred_lda != Auto_test$mpg01)
## [1] 0.1355932
qda_a <- qda(mpg01 ~ horsepower + weight, data = Auto_train)
pred_qda <- predict(qda_a, Auto_test)$class
mean(pred_qda != Auto_test$mpg01)
## [1] 0.1186441
log_auto <- glm(mpg01 ~ horsepower + weight, data = Auto_train, family = binomial)
probs <- predict(log_auto, Auto_test, type = "response")
pred_log <- ifelse(probs > 0.5, 1, 0)
mean(pred_log != Auto_test$mpg01)
## [1] 0.1271186
nb_auto <- naiveBayes(mpg01 ~ horsepower + weight, data = Auto_train)
pred_nb <- predict(nb_auto, Auto_test)
mean(pred_nb != Auto_test$mpg01)
## [1] 0.1186441
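Collecting the four test error rates computed above into a single table, in the style of 13(i):
data.frame(
  Method = c("LDA", "QDA", "Logistic", "Naive Bayes"),
  TestError = c(
    mean(pred_lda != Auto_test$mpg01),
    mean(pred_qda != Auto_test$mpg01),
    mean(pred_log != Auto_test$mpg01),
    mean(pred_nb != Auto_test$mpg01)
  )
)
##        Method TestError
## 1         LDA 0.1355932
## 2         QDA 0.1186441
## 3    Logistic 0.1271186
## 4 Naive Bayes 0.1186441
QDA and naive Bayes tie for the lowest test error on this split.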
14(h) KNN for Various K
# Standardize predictors; scale the test set using the training means/SDs
train_X <- scale(Auto_train[, c("horsepower", "weight")])
test_X <- scale(Auto_test[, c("horsepower", "weight")],
                center = attr(train_X, "scaled:center"),
                scale = attr(train_X, "scaled:scale"))
train_y <- Auto_train$mpg01
set.seed(1) # knn() breaks ties at random
errs <- sapply(1:20, function(k) {
  pred <- knn(train_X, test_X, train_y, k = k)
  mean(pred != Auto_test$mpg01)
})
plot(1:20, errs, type = "b", xlab = "K", ylab = "Test Error")

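The minimum of the error curve identifies the best K on this particular split (the exact value depends on the seed and the split):
which.min(errs) # K with the lowest test error
min(errs)       # the corresponding error rate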
Question 16: Boston Data Set - Crime Rate Classification
16(a) Create high_crime Variable
data("Boston")
Boston$high_crime <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
16(b) Split Train/Test
set.seed(1)
train_boston <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
Boston_train <- Boston[train_boston, ]
Boston_test <- Boston[-train_boston, ]
16(c) Logistic, LDA, Naive Bayes, KNN
log_b <- glm(high_crime ~ nox + dis + age, data = Boston_train, family = binomial)
pred_b_log <- ifelse(predict(log_b, Boston_test, type = "response") > 0.5, 1, 0)
mean(pred_b_log != Boston_test$high_crime)
## [1] 0.1776316
lda_b <- lda(high_crime ~ nox + dis + age, data = Boston_train)
pred_b_lda <- predict(lda_b, Boston_test)$class
mean(pred_b_lda != Boston_test$high_crime)
## [1] 0.1447368
nb_b <- naiveBayes(as.factor(high_crime) ~ nox + dis + age, data = Boston_train)
pred_b_nb <- predict(nb_b, Boston_test)
mean(pred_b_nb != Boston_test$high_crime)
## [1] 0.1315789
# Standardize predictors for KNN (note: train and test are scaled separately
# here; reusing the training means/SDs, as in 14(h), is the stricter protocol)
trX <- scale(Boston_train[, c("nox", "dis", "age")])
tX <- scale(Boston_test[, c("nox", "dis", "age")])
trY <- Boston_train$high_crime
knn_pred_b <- knn(trX, tX, trY, k = 5)
mean(knn_pred_b != Boston_test$high_crime)
## [1] 0.08552632
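Exercise 16 also asks for experiments with different subsets of the predictors. As a hedged sketch (the choice of rad and tax is illustrative, not tuned), the same protocol with an expanded logistic model:
log_b2 <- glm(high_crime ~ nox + dis + age + rad + tax,
              data = Boston_train, family = binomial)
pred_b2 <- ifelse(predict(log_b2, Boston_test, type = "response") > 0.5, 1, 0)
mean(pred_b2 != Boston_test$high_crime) # test error for the expanded model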