Load Required Libraries

library(ISLR2)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.95 loaded
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)

Question 13: Weekly Data Set Analysis

13(a) Summary and Visualization

data("Weekly")
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
weekly_numeric <- dplyr::select(Weekly, -Direction, -Year)
corr_matrix <- cor(weekly_numeric)
corrplot(corr_matrix, method = "circle")

pairs(Weekly[, 2:6], col = ifelse(Weekly$Direction == "Up", "blue", "red"))

ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_line(color = "darkgreen") +
  labs(title = "Volume Over Time") +
  theme_minimal()

table(Weekly$Direction)
## 
## Down   Up 
##  484  605

13(b) Logistic Regression (Full Data)

log_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Weekly, family = binomial)
summary(log_fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

13(c) Confusion Matrix & Accuracy

log_probs <- predict(log_fit, type = "response")
log_pred <- ifelse(log_probs > 0.5, "Up", "Down")
table(Predicted = log_pred, Actual = Weekly$Direction)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
mean(log_pred == Weekly$Direction)
## [1] 0.5610652

13(d–h) Various Classifiers on Test Data

train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008

# Logistic
log_fit2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
log_probs2 <- predict(log_fit2, Weekly[!train, ], type = "response")
log_pred2 <- ifelse(log_probs2 > 0.5, "Up", "Down")

# LDA
lda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_fit, Weekly[!train, ])

# QDA
qda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_fit, Weekly[!train, ])

# KNN
train_X <- Weekly[train, "Lag2", drop = FALSE]
test_X <- Weekly[!train, "Lag2", drop = FALSE]
train_labels <- Weekly$Direction[train]
set.seed(1)
knn_pred <- knn(train_X, test_X, train_labels, k = 1)

# Naive Bayes
nb_fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb_pred <- predict(nb_fit, Weekly[!train, ])

13(i) Accuracy Comparison

data.frame(
  Method = c("Logistic", "LDA", "QDA", "KNN (k=1)", "Naive Bayes"),
  Accuracy = c(
    mean(log_pred2 == Weekly$Direction[!train]),
    mean(lda_pred$class == Weekly$Direction[!train]),
    mean(qda_pred$class == Weekly$Direction[!train]),
    mean(knn_pred == Weekly$Direction[!train]),
    mean(nb_pred == Weekly$Direction[!train])
  )
)
##        Method  Accuracy
## 1    Logistic 0.6250000
## 2         LDA 0.6250000
## 3         QDA 0.5865385
## 4   KNN (k=1) 0.5000000
## 5 Naive Bayes 0.5865385

13(j) Alternative Logistic Model

log_fit3 <- glm(Direction ~ Lag1 * Lag2, data = Weekly, family = binomial, subset = train)
log_probs3 <- predict(log_fit3, Weekly[!train, ], type = "response")
log_pred3 <- ifelse(log_probs3 > 0.5, "Up", "Down")
table(Predicted = log_pred3, Actual = Weekly$Direction[!train])
##          Actual
## Predicted Down Up
##      Down    7  8
##      Up     36 53
mean(log_pred3 == Weekly$Direction[!train])
## [1] 0.5769231

Question 14: Auto Dataset - Binary Classification on MPG

14(a–b) Create mpg01 and Explore

data("Auto")
Auto <- na.omit(Auto)
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto <- data.frame(Auto, mpg01)
boxplot(horsepower ~ mpg01, data = Auto)

boxplot(weight ~ mpg01, data = Auto)

14(c) Split Train/Test

set.seed(1)
train_auto <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
Auto_train <- Auto[train_auto, ]
Auto_test <- Auto[-train_auto, ]

14(d–g) LDA, QDA, Logit, Naive Bayes

lda_a <- lda(mpg01 ~ horsepower + weight, data = Auto_train)
pred_lda <- predict(lda_a, Auto_test)$class
mean(pred_lda != Auto_test$mpg01)
## [1] 0.1355932
qda_a <- qda(mpg01 ~ horsepower + weight, data = Auto_train)
pred_qda <- predict(qda_a, Auto_test)$class
mean(pred_qda != Auto_test$mpg01)
## [1] 0.1186441
log_auto <- glm(mpg01 ~ horsepower + weight, data = Auto_train, family = binomial)
probs <- predict(log_auto, Auto_test, type = "response")
pred_log <- ifelse(probs > 0.5, 1, 0)
mean(pred_log != Auto_test$mpg01)
## [1] 0.1271186
nb_auto <- naiveBayes(mpg01 ~ horsepower + weight, data = Auto_train)
pred_nb <- predict(nb_auto, Auto_test)
mean(pred_nb != Auto_test$mpg01)
## [1] 0.1186441

14(h) KNN for Various K

train_X <- scale(Auto_train[, c("horsepower", "weight")])
test_X <- scale(Auto_test[, c("horsepower", "weight")])
train_y <- Auto_train$mpg01
errs <- sapply(1:20, function(k) {
  pred <- knn(train_X, test_X, train_y, k = k)
  mean(pred != Auto_test$mpg01)
})
plot(1:20, errs, type = "b", xlab = "K", ylab = "Test Error")

Question 16: Boston Data - Crime Rate Classification

16(a) Create HighCrime Variable

data("Boston")
Boston$high_crime <- ifelse(Boston$crim > median(Boston$crim), 1, 0)

16(b) Split Train/Test

set.seed(1)
train_boston <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
Boston_train <- Boston[train_boston, ]
Boston_test <- Boston[-train_boston, ]

16(c) Logistic, LDA, Naive Bayes, KNN

log_b <- glm(high_crime ~ nox + dis + age, data = Boston_train, family = binomial)
pred_b_log <- ifelse(predict(log_b, Boston_test, type = "response") > 0.5, 1, 0)
mean(pred_b_log != Boston_test$high_crime)
## [1] 0.1776316
lda_b <- lda(high_crime ~ nox + dis + age, data = Boston_train)
pred_b_lda <- predict(lda_b, Boston_test)$class
mean(pred_b_lda != Boston_test$high_crime)
## [1] 0.1447368
nb_b <- naiveBayes(as.factor(high_crime) ~ nox + dis + age, data = Boston_train)
pred_b_nb <- predict(nb_b, Boston_test)
mean(pred_b_nb != Boston_test$high_crime)
## [1] 0.1315789
tX <- scale(Boston_test[, c("nox", "dis", "age")])
trX <- scale(Boston_train[, c("nox", "dis", "age")])
trY <- Boston_train$high_crime
knn_pred_b <- knn(trX, tX, trY, k = 5)
mean(knn_pred_b != Boston_test$high_crime)
## [1] 0.08552632