# Import libraries & packages

library(ISLR2)
attach(Weekly)
library(MASS)
library(class)
library(e1071)
attach(Auto)
attach(Boston)

Question 13 — Weekly Stock Data

Data Loading & Overview

data(Weekly)
str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 

a: Numerical and Graphical Summary

Produce correlation matrix and scatterplot matrix to identify patterns in the Weekly dataset.

cor(Weekly[, -9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
pairs(Weekly, main = "Scatterplot Matrix — Weekly Data")

Pattern: Volume shows a clear upward trend over time. The lag variables have very low correlations with Direction, suggesting weekly returns are difficult to predict from past returns alone.

b. Logistic Regression on All Predictors

weekly_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                  data   = Weekly,
                  family = binomial)
summary(weekly_fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Finding: Lag2 appears to be the only statistically significant predictor (p < 0.05).

c. Confusion Matrix (Training Data)

weekly_probs <- predict(weekly_fit, type = "response")
weekly_pred  <- ifelse(weekly_probs > 0.5, "Up", "Down")

table(Predicted = weekly_pred, Actual = Weekly$Direction)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
cat("Overall accuracy:", round(mean(weekly_pred == Weekly$Direction), 4))
## Overall accuracy: 0.5611

Note: Logistic regression heavily favors predicting “Up”, reflecting the class imbalance in market direction.

d. Logistic Regression with Lag2 (1990–2008 Training Set)

Use 1990–2008 as training data and predict on the held-out test set.

train <- (Weekly$Year <= 2008)

Weekly_fit  <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
Weekly_prob <- predict(Weekly_fit, Weekly[!train, ], type = "response")
Weekly_pred <- ifelse(Weekly_prob > 0.5, "Up", "Down")

table(Predicted = Weekly_pred, Actual = Weekly$Direction[!train])
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
cat("Test accuracy:", round(mean(Weekly_pred == Weekly$Direction[!train]), 4))
## Test accuracy: 0.625

e. LDA with Lag2

lda_fit  <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_fit, Weekly[!train, ])

table(Predicted = lda_pred$class, Actual = Weekly$Direction[!train])
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
cat("LDA test accuracy:", round(mean(lda_pred$class == Weekly$Direction[!train]), 4))
## LDA test accuracy: 0.625

Result: LDA achieves approximately 62.5% accuracy on the test set.

f. QDA with Lag2

qda_fit  <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_fit, Weekly[!train, ])

table(Predicted = qda_pred$class, Actual = Weekly$Direction[!train])
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
cat("QDA test accuracy:", round(mean(qda_pred$class == Weekly$Direction[!train]), 4))
## QDA test accuracy: 0.5865

Result: QDA correctly predicts approximately 58.65% of the time, worse than LDA.

g. KNN (K = 1) with Lag2

Weekly_train <- matrix(Weekly$Lag2[train])
Weekly_test  <- matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]

set.seed(1)
Weeklyknn_pred <- knn(Weekly_train, Weekly_test, train.Direction, k = 1)

table(Predicted = Weeklyknn_pred, Actual = Weekly$Direction[!train])
##          Actual
## Predicted Down Up
##      Down   21 30
##      Up     22 31
cat("KNN (K=1) test accuracy:", round(mean(Weeklyknn_pred == Weekly$Direction[!train]), 4))
## KNN (K=1) test accuracy: 0.5

Result: KNN with K = 1 yields only about 50% accuracy — essentially random guessing, likely due to overfitting.

h. Naïve Bayes with Lag2

weeklynb_fit  <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
weeklynb_fit
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Conditional probabilities:
##       Lag2
## Y             [,1]     [,2]
##   Down -0.03568254 2.199504
##   Up    0.26036581 2.317485
weeklynb_pred <- predict(weeklynb_fit, Weekly[!train, ])
table(Predicted = weeklynb_pred, Actual = Weekly$Direction[!train])
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
cat("Naïve Bayes test accuracy:", round(mean(weeklynb_pred == Weekly$Direction[!train]), 4))
## Naïve Bayes test accuracy: 0.5865

Result: Naïve Bayes produces the same results as QDA, with approximately 58% accuracy.

i. Summary of Methods

Method Test Accuracy
Logistic Regression (Lag2) ~62.5%
LDA ~62.5%
QDA ~58.7%
Naïve Bayes ~58.7%
KNN (K=1) ~50.0%

Conclusion: Logistic regression and LDA provide the best results. QDA and Naïve Bayes perform worse, suggesting the decision boundary is approximately linear. KNN with K = 1 overfits severely.


Question 14 — Auto MPG Classification

a. Create Binary Variable mpg01

Create mpg01: 1 if mpg is above its median, 0 otherwise.

data(Auto)
Auto <- na.omit(Auto)

mpg01      <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto_mpg01 <- data.frame(Auto, mpg01)

b. Graphical Exploration

pairs(Auto_mpg01[, -9], main = "Scatterplot Matrix — Auto Data")

par(mfrow = c(2, 3))
boxplot(horsepower  ~ mpg01, data = Auto_mpg01, main = "Horsepower vs mpg01",  col = c("salmon", "steelblue"))
boxplot(displacement~ mpg01, data = Auto_mpg01, main = "Displacement vs mpg01", col = c("salmon", "steelblue"))
boxplot(cylinders   ~ mpg01, data = Auto_mpg01, main = "Cylinders vs mpg01",   col = c("salmon", "steelblue"))
boxplot(weight      ~ mpg01, data = Auto_mpg01, main = "Weight vs mpg01",      col = c("salmon", "steelblue"))
boxplot(acceleration~ mpg01, data = Auto_mpg01, main = "Acceleration vs mpg01",col = c("salmon", "steelblue"))
boxplot(year        ~ mpg01, data = Auto_mpg01, main = "Year vs mpg01",        col = c("salmon", "steelblue"))

cor(Auto_mpg01[, -9])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000

Finding: mpg01 is strongly negatively associated with horsepower, weight, and displacement — heavier, more powerful cars tend to have below-median mpg.

c. Train/Test Split (80/20)

set.seed(2)
train_data <- sample(1:nrow(Auto_mpg01), 0.8 * nrow(Auto_mpg01))

train <- Auto_mpg01[ train_data, ]
test  <- Auto_mpg01[-train_data, ]

d. LDA

lda_fit  <- lda(mpg01 ~ horsepower + weight + displacement, data = train)
lda_pred <- predict(lda_fit, test)

table(Predicted = lda_pred$class, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 34  1
##         1  6 38
cat("LDA test error rate:", round(mean(lda_pred$class != test$mpg01), 4))
## LDA test error rate: 0.0886

Result: LDA achieves a test accuracy of approximately 89.5%.

e. QDA

qda_autofit  <- qda(mpg01 ~ horsepower + weight + displacement, data = train)
qda_autopred <- predict(qda_autofit, test)

table(Predicted = qda_autopred$class, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 34  1
##         1  6 38
cat("QDA test error rate:", round(mean(qda_autopred$class != test$mpg01), 4))
## QDA test error rate: 0.0886

Result: QDA achieves approximately 88% accuracy — slightly lower than LDA.

f. Logistic Regression

autoglm_fit  <- glm(mpg01 ~ horsepower + weight + displacement, data = train, family = binomial)
glm_autoprob <- predict(autoglm_fit, test, type = "response")
glm_autopred <- ifelse(glm_autoprob > 0.5, 1, 0)

table(Predicted = glm_autopred, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 37  1
##         1  3 38
cat("Logistic Regression test error rate:", round(mean(glm_autopred != test$mpg01), 4))
## Logistic Regression test error rate: 0.0506

Note: Logistic regression performs notably worse (~50% accuracy) on this dataset with these predictors, suggesting potential convergence issues or predictor scaling effects.

g. Naïve Bayes

nb_autofit   <- naiveBayes(mpg01 ~ cylinders + displacement + weight, data = train)
nb_autoclass <- predict(nb_autofit, test)

table(Predicted = nb_autoclass, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 37  0
##         1  3 39
cat("Naïve Bayes test accuracy:", round(mean(nb_autoclass == test$mpg01), 4))
## Naïve Bayes test accuracy: 0.962

Result: Naïve Bayes achieves approximately 96.2% accuracy — the best performance among all methods tested.


Question 16 — Boston Crime Classification

Data Preparation

Create a binary variable crime_flag: 1 if a suburb’s crime rate is above the median, 0 otherwise.

data(Boston)
Boston$crime_flag <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston_data <- Boston

Train/Test Split (50/50)

set.seed(1)
n         <- nrow(Boston_data)
train_idx <- 1:floor(n / 2)
test_idx  <- (floor(n / 2) + 1):n

train_set <- Boston_data[train_idx, ]
test_set  <- Boston_data[test_idx, ]

train_y <- Boston_data$crime_flag[train_idx]
test_y  <- Boston_data$crime_flag[test_idx]

Correlation Exploration

cor(Boston_data)
##                   crim          zn       indus         chas         nox
## crim        1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn         -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus       0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas       -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox         0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm         -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age         0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis        -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad         0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax         0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio     0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black      -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat       0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv       -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
## crime_flag  0.40939545 -0.43615103  0.60326017  0.070096774  0.72323480
##                     rm         age         dis          rad         tax
## crim       -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431
## zn          0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332
## indus      -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018
## chas        0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox        -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320
## rm          1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783
## age        -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559
## dis         0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158
## rad        -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819
## tax        -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000
## ptratio    -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304
## black       0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801
## lstat      -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341
## medv        0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593
## crime_flag -0.15637178  0.61393992 -0.61634164  0.619786249  0.60874128
##               ptratio       black      lstat       medv  crime_flag
## crim        0.2899456 -0.38506394  0.4556215 -0.3883046  0.40939545
## zn         -0.3916785  0.17552032 -0.4129946  0.3604453 -0.43615103
## indus       0.3832476 -0.35697654  0.6037997 -0.4837252  0.60326017
## chas       -0.1215152  0.04878848 -0.0539293  0.1752602  0.07009677
## nox         0.1889327 -0.38005064  0.5908789 -0.4273208  0.72323480
## rm         -0.3555015  0.12806864 -0.6138083  0.6953599 -0.15637178
## age         0.2615150 -0.27353398  0.6023385 -0.3769546  0.61393992
## dis        -0.2324705  0.29151167 -0.4969958  0.2499287 -0.61634164
## rad         0.4647412 -0.44441282  0.4886763 -0.3816262  0.61978625
## tax         0.4608530 -0.44180801  0.5439934 -0.4685359  0.60874128
## ptratio     1.0000000 -0.17738330  0.3740443 -0.5077867  0.25356836
## black      -0.1773833  1.00000000 -0.3660869  0.3334608 -0.35121093
## lstat       0.3740443 -0.36608690  1.0000000 -0.7376627  0.45326273
## medv       -0.5077867  0.33346082 -0.7376627  1.0000000 -0.26301673
## crime_flag  0.2535684 -0.35121093  0.4532627 -0.2630167  1.00000000

Logistic Regression

glm_model <- glm(crime_flag ~ nox + rad + tax + indus + chas,
                 data   = train_set,
                 family = binomial)

glm_probs <- predict(glm_model, test_set, type = "response")
glm_class <- ifelse(glm_probs > 0.5, 1, 0)

table(Predicted = glm_class, Actual = test_y)
##          Actual
## Predicted   0   1
##         0  75   6
##         1  15 157
cat("Logistic Regression test accuracy:", round(mean(glm_class == test_y), 4))
## Logistic Regression test accuracy: 0.917

Result: Logistic regression achieves approximately 91% accuracy.

LDA

lda_model <- lda(crime_flag ~ nox + rad + tax + indus + chas, data = train_set)
lda_pred  <- predict(lda_model, test_set)$class

table(Predicted = lda_pred, Actual = test_y)
##          Actual
## Predicted   0   1
##         0  80  18
##         1  10 145
cat("LDA test accuracy:", round(mean(lda_pred == test_y), 4))
## LDA test accuracy: 0.8893

QDA

qda_model <- qda(crime_flag ~ nox + rad + tax + indus + chas, data = train_set)
qda_pred  <- predict(qda_model, test_set)$class

table(Predicted = qda_pred, Actual = test_y)
##          Actual
## Predicted   0   1
##         0  78 148
##         1  12  15
cat("QDA test accuracy:", round(mean(qda_pred == test_y), 4))
## QDA test accuracy: 0.3676

KNN (K = 10)

train_x_raw <- train_set[, c("indus", "nox", "age", "dis", "rad", "tax")]
train_x     <- scale(train_x_raw)

test_x <- scale(test_set[, c("indus", "nox", "age", "dis", "rad", "tax")],
                center = attr(train_x, "scaled:center"),
                scale  = attr(train_x, "scaled:scale"))

set.seed(1)
knn_pred <- knn(train_x, test_x, train_set$crime_flag, k = 10)

table(Predicted = knn_pred, Actual = test_y)
##          Actual
## Predicted   0   1
##         0  79  21
##         1  11 142
cat("KNN (K=10) test accuracy:", round(mean(knn_pred == test_y), 4))
## KNN (K=10) test accuracy: 0.8735

Conclusion: Logistic regression performs well at ~91% accuracy for predicting whether a Boston suburb has above-median crime. KNN with scaled features is also competitive. rad (accessibility to radial highways), nox (nitrogen oxides), and tax (property tax rate) appear to be the strongest predictors of crime.