Assignment #3

Load required packages

options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("ISLR2") 

The downloaded binary packages are in
    /var/folders/n6/kts7k_nx3v3208p01m5x0p_00000gn/T//RtmpE3d66x/downloaded_packages
library(ISLR2)
library(ggplot2)
install.packages("GGally")

The downloaded binary packages are in
    /var/folders/n6/kts7k_nx3v3208p01m5x0p_00000gn/T//RtmpE3d66x/downloaded_packages
library(GGally)
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
knitr::opts_chunk$set(echo = TRUE)
library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:ISLR2':

    Boston
library(class)
library(e1071)

==========================

Problem 13: Weekly Dataset

==========================

(a) Summary and Visualization

summary(Weekly)
      Year           Lag1               Lag2               Lag3         
 Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
 1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
 Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
 Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
 3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
 Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
      Lag4               Lag5              Volume            Today         
 Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
 1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
 Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
 Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
 3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
 Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
 Direction 
 Down:484  
 Up  :605  
           
           
           
           
pairs(Weekly[, -9])

cor(Weekly[, -c(9)])
              Year         Lag1        Lag2        Lag3         Lag4
Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
               Lag5      Volume        Today
Year   -0.030519101  0.84194162 -0.032459894
Lag1   -0.008183096 -0.06495131 -0.075031842
Lag2   -0.072499482 -0.08551314  0.059166717
Lag3    0.060657175 -0.06928771 -0.071243639
Lag4   -0.075675027 -0.06107462 -0.007825873
Lag5    1.000000000 -0.05851741  0.011012698
Volume -0.058517414  1.00000000 -0.033077783
Today   0.011012698 -0.03307778  1.000000000
plot(Weekly$Volume, type = "l", main = "Volume over Time")

(b) Logistic Regression on Full Dataset

glm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, 
               data = Weekly, family = binomial)
summary(glm_fit)

Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
    Volume, family = binomial, data = Weekly)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.26686    0.08593   3.106   0.0019 **
Lag1        -0.04127    0.02641  -1.563   0.1181   
Lag2         0.05844    0.02686   2.175   0.0296 * 
Lag3        -0.01606    0.02666  -0.602   0.5469   
Lag4        -0.02779    0.02646  -1.050   0.2937   
Lag5        -0.01447    0.02638  -0.549   0.5833   
Volume      -0.02274    0.03690  -0.616   0.5377   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1496.2  on 1088  degrees of freedom
Residual deviance: 1486.4  on 1082  degrees of freedom
AIC: 1500.4

Number of Fisher Scoring iterations: 4

(c) Confusion Matrix and Accuracy

glm_probs <- predict(glm_fit, type = "response")
glm_preds <- ifelse(glm_probs > 0.5, "Up", "Down")
table(Predicted = glm_preds, Actual = Weekly$Direction)
         Actual
Predicted Down  Up
     Down   54  48
     Up    430 557
mean(glm_preds == Weekly$Direction)
[1] 0.5610652

(d) Logistic Regression (Train: 1990–2008, Test: 2009–2010)

train <- Weekly$Year <= 2008
test <- !train
glm_train <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm_test_probs <- predict(glm_train, Weekly[test, ], type = "response")
glm_test_preds <- ifelse(glm_test_probs > 0.5, "Up", "Down")
table(Predicted = glm_test_preds, Actual = Weekly$Direction[test])
         Actual
Predicted Down Up
     Down    9  5
     Up     34 56
mean(glm_test_preds == Weekly$Direction[test])
[1] 0.625

(e) LDA

lda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_fit, Weekly[test, ])
table(Predicted = lda_pred$class, Actual = Weekly$Direction[test])
         Actual
Predicted Down Up
     Down    9  5
     Up     34 56
mean(lda_pred$class == Weekly$Direction[test])
[1] 0.625

(f) QDA

qda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_fit, Weekly[test, ])
table(Predicted = qda_pred$class, Actual = Weekly$Direction[test])
         Actual
Predicted Down Up
     Down    0  0
     Up     43 61
mean(qda_pred$class == Weekly$Direction[test])
[1] 0.5865385

(g) KNN (K = 1)

train_X <- as.matrix(Weekly[train, "Lag2", drop = FALSE])
test_X <- as.matrix(Weekly[test, "Lag2", drop = FALSE])
train_y <- Weekly$Direction[train]
knn_pred <- knn(train_X, test_X, train_y, k = 1)
table(Predicted = knn_pred, Actual = Weekly$Direction[test])
         Actual
Predicted Down Up
     Down   21 29
     Up     22 32
mean(knn_pred == Weekly$Direction[test])
[1] 0.5096154

(h) Naive Bayes

nb_fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb_pred <- predict(nb_fit, Weekly[test, ])
table(Predicted = nb_pred, Actual = Weekly$Direction[test])
         Actual
Predicted Down Up
     Down    0  0
     Up     43 61
mean(nb_pred == Weekly$Direction[test])
[1] 0.5865385

==========================

Problem 14: Auto Dataset - Predict High MPG

==========================

(a) Create mpg01

mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto2 <- data.frame(mpg01, Auto)

(b) Exploratory Analysis

pairs(Auto2[, c("mpg01", "displacement", "horsepower", "weight", "acceleration")])

boxplot(weight ~ mpg01, data = Auto2)

(c) Train/Test Split

set.seed(1)
train_auto <- sample(1:nrow(Auto2), nrow(Auto2)/2)
test_auto <- -train_auto

(d) LDA

lda_auto <- lda(mpg01 ~ weight + horsepower + displacement, data = Auto2, subset = train_auto)
lda_auto_pred <- predict(lda_auto, Auto2[test_auto, ])
mean(lda_auto_pred$class != Auto2$mpg01[test_auto])
[1] 0.127551

(e) QDA

qda_auto <- qda(mpg01 ~ weight + horsepower + displacement, data = Auto2, subset = train_auto)
qda_auto_pred <- predict(qda_auto, Auto2[test_auto, ])
mean(qda_auto_pred$class != Auto2$mpg01[test_auto])
[1] 0.127551

(f) Logistic Regression

glm_auto <- glm(mpg01 ~ weight + horsepower + displacement, data = Auto2, family = binomial, subset = train_auto)
glm_auto_probs <- predict(glm_auto, Auto2[test_auto, ], type = "response")
glm_auto_preds <- ifelse(glm_auto_probs > 0.5, 1, 0)
mean(glm_auto_preds != Auto2$mpg01[test_auto])
[1] 0.127551

(g) Naive Bayes

nb_auto <- naiveBayes(mpg01 ~ weight + horsepower + displacement, data = Auto2, subset = train_auto)
nb_auto_pred <- predict(nb_auto, Auto2[test_auto, ])
mean(nb_auto_pred != Auto2$mpg01[test_auto])
[1] 0.1326531

(h) KNN

train_X_auto <- scale(Auto2[train_auto, c("weight", "horsepower", "displacement")])
test_X_auto <- scale(Auto2[test_auto, c("weight", "horsepower", "displacement")])
train_y_auto <- Auto2$mpg01[train_auto]

for (k in c(1, 3, 5, 10)) {
  knn_pred_auto <- knn(train_X_auto, test_X_auto, train_y_auto, k = k)
  print(paste("K =", k, "Error Rate =", mean(knn_pred_auto != Auto2$mpg01[test_auto])))
}
[1] "K = 1 Error Rate = 0.158163265306122"
[1] "K = 3 Error Rate = 0.142857142857143"
[1] "K = 5 Error Rate = 0.142857142857143"
[1] "K = 10 Error Rate = 0.11734693877551"

==========================

Problem 16: Boston Dataset - Predict High Crime Areas

==========================

(a) Create Binary Crime Variable

data("Boston")
crime01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston2 <- data.frame(crime01, Boston)

(b) Split into Train/Test

set.seed(2)
train_boston <- sample(1:nrow(Boston2), nrow(Boston2)/2)
test_boston <- -train_boston

(c) Logistic Regression

glm_boston <- glm(crime01 ~ nox + dis + rad + tax + lstat, 
                  data = Boston2, family = binomial, subset = train_boston)
glm_boston_probs <- predict(glm_boston, Boston2[test_boston, ], type = "response")
glm_boston_preds <- ifelse(glm_boston_probs > 0.5, 1, 0)
mean(glm_boston_preds != Boston2$crime01[test_boston])
[1] 0.1541502

(d) LDA

lda_boston <- lda(crime01 ~ nox + dis + rad + tax + lstat, data = Boston2, subset = train_boston)
lda_boston_pred <- predict(lda_boston, Boston2[test_boston, ])
mean(lda_boston_pred$class != Boston2$crime01[test_boston])
[1] 0.1660079

(e) Naive Bayes

nb_boston <- naiveBayes(crime01 ~ nox + dis + rad + tax + lstat, data = Boston2, subset = train_boston)
nb_boston_pred <- predict(nb_boston, Boston2[test_boston, ])
mean(nb_boston_pred != Boston2$crime01[test_boston])
[1] 0.173913

(f) KNN

train_X_boston <- scale(Boston2[train_boston, c("nox", "dis", "rad", "tax", "lstat")])
test_X_boston <- scale(Boston2[test_boston, c("nox", "dis", "rad", "tax", "lstat")])
train_y_boston <- Boston2$crime01[train_boston]

for (k in c(1, 3, 5, 10)) {
  knn_pred_boston <- knn(train_X_boston, test_X_boston, train_y_boston, k = k)
  print(paste("K =", k, "Error Rate =", mean(knn_pred_boston != Boston2$crime01[test_boston])))
}
[1] "K = 1 Error Rate = 0.102766798418972"
[1] "K = 3 Error Rate = 0.0988142292490119"
[1] "K = 5 Error Rate = 0.102766798418972"
[1] "K = 10 Error Rate = 0.138339920948617"