## Warning: package 'ISLR2' was built under R version 4.5.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
## Warning: package 'rmarkdown' was built under R version 4.5.3

Question 13

a)

head(Weekly)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
cor(Weekly[, -9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
pairs(Weekly, col = ifelse(Weekly$Direction == "Up", "blue", "red"),
      main = "Pairs plot of Weekly data (Blue = Up, Red = Down)")

plot(Weekly$Volume, type = "l", col = "blue",
     xlab = "Week Index", ylab = "Volume",
     main = "Trading Volume Over Time")

boxplot(Today ~ Direction, data = Weekly, col = c("red", "blue"),
        main = "Today's Return by Direction",
        xlab = "Direction", ylab = "Today's Return (%)")

Volume of billions of trades increases over time.

b)

logistic1 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data = Weekly, family = binomial)
summary(logistic1)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Lag 2 is statistically significant.

c)

glm.probs <- predict(logistic1, type = "response")
glm.pred  <- ifelse(glm.probs > 0.5, "Up", "Down")

confusion_full <- table(Predicted = glm.pred, Actual = Weekly$Direction)
print(confusion_full)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
overall_correct <- mean(glm.pred == Weekly$Direction)
cat(sprintf("Overall fraction correct: %.4f (%.1f%%)\n",
            overall_correct, overall_correct * 100))
## Overall fraction correct: 0.5611 (56.1%)

The model is more likely to predict “Up” correctly than “Down”.It is more likely to misclassify “Down” as “Up”.

d)

train <- Weekly$Year <= 2008
test  <- !train

glm.train <- glm(Direction ~ Lag2, data = Weekly, subset = train,
                 family = binomial)

glm.test.probs <- predict(glm.train, newdata = Weekly[test, ], type = "response")
glm.test.pred  <- ifelse(glm.test.probs > 0.5, "Up", "Down")

cm_lr <- table(Predicted = glm.test.pred, Actual = Weekly$Direction[test])
print(cm_lr)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
cat(sprintf("Logistic Regression accuracy: %.4f\n", mean(glm.test.pred == Weekly$Direction[test])))
## Logistic Regression accuracy: 0.6250

e)

train <- Weekly$Year <= 2008
test  <- !train
lda.fit  <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, newdata = Weekly[test, ])

cm_lda <- table(Predicted = lda.pred$class, Actual = Weekly$Direction[test])
print(cm_lda)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
cat(sprintf("LDA accuracy: %.4f\n", mean(lda.pred$class == Weekly$Direction[test])))
## LDA accuracy: 0.6250

f)

qda.fit  <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred <- predict(qda.fit, newdata = Weekly[test, ])

cm_qda <- table(Predicted = qda.pred$class, Actual = Weekly$Direction[test])
print(cm_qda)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
cat(sprintf("QDA accuracy: %.4f\n", mean(qda.pred$class == Weekly$Direction[test])))
## QDA accuracy: 0.5865

g)

set.seed(1)
train.X <- matrix(Weekly$Lag2[train])
test.X  <- matrix(Weekly$Lag2[test])
train.Y <- Weekly$Direction[train]

knn.pred1 <- knn(train.X, test.X, train.Y, k = 1)

cm_knn1 <- table(Predicted = knn.pred1, Actual = Weekly$Direction[test])
print(cm_knn1)
##          Actual
## Predicted Down Up
##      Down   21 30
##      Up     22 31
cat(sprintf("KNN (K=1) accuracy: %.4f\n", mean(knn.pred1 == Weekly$Direction[test])))
## KNN (K=1) accuracy: 0.5000

h)

nb.fit  <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.pred <- predict(nb.fit, newdata = Weekly[test, ])

cm_nb <- table(Predicted = nb.pred, Actual = Weekly$Direction[test])
print(cm_nb)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
cat(sprintf("Naive Bayes accuracy: %.4f\n", mean(nb.pred == Weekly$Direction[test])))
## Naive Bayes accuracy: 0.5865

i)

methods <- c("Logistic Regression", "LDA", "QDA", "KNN (K=1)", "Naive Bayes")
accs <- c(
  mean(glm.test.pred          == Weekly$Direction[test]),
  mean(lda.pred$class         == Weekly$Direction[test]),
  mean(qda.pred$class         == Weekly$Direction[test]),
  mean(knn.pred1              == Weekly$Direction[test]),
  mean(nb.pred                == Weekly$Direction[test])
)

results <- data.frame(Method = methods, Accuracy = round(accs, 4))
print(results[order(-results$Accuracy), ])
##                Method Accuracy
## 1 Logistic Regression   0.6250
## 2                 LDA   0.6250
## 3                 QDA   0.5865
## 5         Naive Bayes   0.5865
## 4           KNN (K=1)   0.5000

Question 14

a)

mpg_median <- median(Auto$mpg)
mpg01      <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto2      <- data.frame(mpg01, Auto)

cat("mpg median:", mpg_median, "\n")
## mpg median: 22.75
cat("mpg01 distribution:\n")
## mpg01 distribution:
print(table(Auto2$mpg01))
## 
##   0   1 
## 196 196

b)

pairs(Auto2[, c("mpg01","cylinders","displacement","horsepower",
                "weight","acceleration","year","origin")],
      col  = ifelse(Auto2$mpg01 == 1, "blue", "red"),
      main = "Pairs Plot — Blue = High MPG, Red = Low MPG")

par(mfrow = c(2, 3))

boxplot(cylinders    ~ mpg01, data = Auto2, col = c("red","blue"),
        main = "Cylinders vs mpg01",    xlab = "mpg01", ylab = "Cylinders")
boxplot(displacement ~ mpg01, data = Auto2, col = c("red","blue"),
        main = "Displacement vs mpg01", xlab = "mpg01", ylab = "Displacement")
boxplot(horsepower   ~ mpg01, data = Auto2, col = c("red","blue"),
        main = "Horsepower vs mpg01",   xlab = "mpg01", ylab = "Horsepower")
boxplot(weight       ~ mpg01, data = Auto2, col = c("red","blue"),
        main = "Weight vs mpg01",       xlab = "mpg01", ylab = "Weight")
boxplot(acceleration ~ mpg01, data = Auto2, col = c("red","blue"),
        main = "Acceleration vs mpg01", xlab = "mpg01", ylab = "Acceleration")
boxplot(year         ~ mpg01, data = Auto2, col = c("red","blue"),
        main = "Year vs mpg01",         xlab = "mpg01", ylab = "Year")

par(mfrow = c(1, 1))

cat("\nCorrelations with mpg01:\n")
## 
## Correlations with mpg01:
num_vars <- Auto2[, c("cylinders","displacement","horsepower",
                      "weight","acceleration","year","origin")]
print(round(cor(cbind(mpg01 = Auto2$mpg01, num_vars)), 3))
##               mpg01 cylinders displacement horsepower weight acceleration
## mpg01         1.000    -0.759       -0.753     -0.667 -0.758        0.347
## cylinders    -0.759     1.000        0.951      0.843  0.898       -0.505
## displacement -0.753     0.951        1.000      0.897  0.933       -0.544
## horsepower   -0.667     0.843        0.897      1.000  0.865       -0.689
## weight       -0.758     0.898        0.933      0.865  1.000       -0.417
## acceleration  0.347    -0.505       -0.544     -0.689 -0.417        1.000
## year          0.430    -0.346       -0.370     -0.416 -0.309        0.290
## origin        0.514    -0.569       -0.615     -0.455 -0.585        0.213
##                year origin
## mpg01         0.430  0.514
## cylinders    -0.346 -0.569
## displacement -0.370 -0.615
## horsepower   -0.416 -0.455
## weight       -0.309 -0.585
## acceleration  0.290  0.213
## year          1.000  0.182
## origin        0.182  1.000

Cylinders, displacement, horsepower, and weight have a negative correlation with mpg01. Year, origin and acceleration have positive correlations with mpg01.

c)

predictors <- c("cylinders", "displacement", "horsepower",
                "weight", "acceleration", "year", "origin")

fmla <- as.formula(
  paste("mpg01 ~", paste(predictors, collapse = " + "))
)
set.seed(42)
n          <- nrow(Auto2)
train_idx  <- sample(1:n, size = floor(0.8 * n))
train_data <- Auto2[ train_idx, ]
test_data  <- Auto2[-train_idx, ]

cat(sprintf("Training rows: %d  |  Test rows: %d\n",
            nrow(train_data), nrow(test_data)))
## Training rows: 313  |  Test rows: 79

d)

lda.fit  <- lda(fmla, data = train_data)
lda.pred <- predict(lda.fit, newdata = test_data)$class

cm_lda  <- table(Predicted = lda.pred, Actual = test_data$mpg01)
lda_err <- mean(lda.pred != test_data$mpg01)

print(cm_lda)
##          Actual
## Predicted  0  1
##         0 29  2
##         1  2 46
cat(sprintf("LDA test error: %.4f (%.1f%%)\n", lda_err, lda_err * 100))
## LDA test error: 0.0506 (5.1%)

e)

qda.fit  <- qda(fmla, data = train_data)
qda.pred <- predict(qda.fit, newdata = test_data)$class

cm_qda  <- table(Predicted = qda.pred, Actual = test_data$mpg01)
qda_err <- mean(qda.pred != test_data$mpg01)

print(cm_qda)
##          Actual
## Predicted  0  1
##         0 28  3
##         1  3 45
cat(sprintf("QDA test error: %.4f (%.1f%%)\n", qda_err, qda_err * 100))
## QDA test error: 0.0759 (7.6%)

f)

glm.fit   <- glm(fmla, data = train_data, family = binomial)
glm.probs <- predict(glm.fit, newdata = test_data, type = "response")
glm.pred  <- ifelse(glm.probs > 0.5, 1, 0)

cm_glm  <- table(Predicted = glm.pred, Actual = test_data$mpg01)
glm_err <- mean(glm.pred != test_data$mpg01)

print(cm_glm)
##          Actual
## Predicted  0  1
##         0 30  5
##         1  1 43
cat(sprintf("Logistic Regression test error: %.4f (%.1f%%)\n", glm_err, glm_err * 100))
## Logistic Regression test error: 0.0759 (7.6%)

g)

nb.fit  <- naiveBayes(fmla, data = train_data)
nb.pred <- predict(nb.fit, newdata = test_data)

cm_nb  <- table(Predicted = nb.pred, Actual = test_data$mpg01)
nb_err <- mean(nb.pred != test_data$mpg01)

print(cm_nb)
##          Actual
## Predicted  0  1
##         0 28  3
##         1  3 45
cat(sprintf("Naive Bayes test error: %.4f (%.1f%%)\n", nb_err, nb_err * 100))
## Naive Bayes test error: 0.0759 (7.6%)

h)

train.X <- scale(train_data[, predictors])
test.X  <- scale(test_data[, predictors],
                 center = attr(train.X, "scaled:center"),
                 scale  = attr(train.X, "scaled:scale"))
train.Y <- train_data$mpg01

set.seed(42)
k_values <- 1:50
knn_errs <- sapply(k_values, function(k) {
  pred <- knn(train.X, test.X, train.Y, k = k)
  mean(pred != test_data$mpg01)
})

plot(k_values, knn_errs, type = "b", pch = 19, col = "blue",
     xlab = "K", ylab = "Test Error Rate",
     main = "KNN Test Error vs K (all predictors)")
best_k <- k_values[which.min(knn_errs)]
abline(v = best_k, col = "red", lty = 2)
legend("topright", legend = paste0("Best K = ", best_k), col = "red", lty = 2)

cat(sprintf("Best K = %d  |  Test error = %.4f (%.1f%%)\n",
            best_k, min(knn_errs), min(knn_errs) * 100))
## Best K = 28  |  Test error = 0.0506 (5.1%)
cat("\nErrors for selected K values:\n")
## 
## Errors for selected K values:
for (k in unique(c(1, 3, 5, 10, 15, 20, best_k))) {
  cat(sprintf("  K = %2d  ->  error = %.4f\n", k, knn_errs[k]))
}
##   K =  1  ->  error = 0.1139
##   K =  3  ->  error = 0.0633
##   K =  5  ->  error = 0.0759
##   K = 10  ->  error = 0.0759
##   K = 15  ->  error = 0.0633
##   K = 20  ->  error = 0.0633
##   K = 28  ->  error = 0.0506

K=28 performs best.

Question 16

crim_median <- median(Boston$crim)
crim01      <- ifelse(Boston$crim > crim_median, 1, 0)
Boston2     <- data.frame(crim01, Boston[, -1])  # drop original crim

cat("Median crime rate:", round(crim_median, 4), "\n")
## Median crime rate: 0.2565
cat("crim01 distribution:\n")
## crim01 distribution:
print(table(crim01))
## crim01
##   0   1 
## 253 253
cat("\nCorrelations with crim01:\n")
## 
## Correlations with crim01:
print(round(sort(cor(Boston2)[,"crim01"], decreasing = TRUE), 3))
##  crim01     nox     rad     age     tax   indus   lstat ptratio    chas      rm 
##   1.000   0.723   0.620   0.614   0.609   0.603   0.453   0.254   0.070  -0.156 
##    medv   black      zn     dis 
##  -0.263  -0.351  -0.436  -0.616
par(mfrow = c(3, 4))
for (var in names(Boston2)[-1]) {
  boxplot(Boston2[[var]] ~ Boston2$crim01,
          col  = c("blue", "red"),
          main = var,
          xlab = "crim01 (0=Low, 1=High)",
          ylab = var)
}

par(mfrow = c(1, 1))

predictors <- names(Boston2)[-1]
fmla       <- as.formula(paste("crim01 ~", paste(predictors, collapse = " + ")))
set.seed(1)
n         <- nrow(Boston2)
train_idx <- sample(1:n, size = floor(0.7 * n))
train     <- Boston2[ train_idx, ]
test      <- Boston2[-train_idx, ]
test_y    <- test$crim01

cat(sprintf("\nTraining rows: %d  |  Test rows: %d\n", nrow(train), nrow(test)))
## 
## Training rows: 354  |  Test rows: 152
eval_pred <- function(pred, actual, label) {
  cm  <- table(Predicted = pred, Actual = actual)
  err <- mean(pred != actual)
  cat(sprintf("\n[%s]  Test error: %.4f (%.1f%%)\n", label, err, err * 100))
  print(cm)
  invisible(err)
}

glm.fit  <- glm(fmla, data = train, family = binomial)
glm.prob <- predict(glm.fit, newdata = test, type = "response")
glm.pred <- ifelse(glm.prob > 0.5, 1, 0)
lr_err   <- eval_pred(glm.pred, test_y, "Logistic Regression")
## 
## [Logistic Regression]  Test error: 0.1053 (10.5%)
##          Actual
## Predicted  0  1
##         0 62  5
##         1 11 74
lda.fit  <- lda(fmla, data = train)
lda.pred <- predict(lda.fit, newdata = test)$class
lda_err  <- eval_pred(lda.pred, test_y, "LDA")
## 
## [LDA]  Test error: 0.1513 (15.1%)
##          Actual
## Predicted  0  1
##         0 70 20
##         1  3 59
nb.fit  <- naiveBayes(fmla, data = train)
nb.pred <- predict(nb.fit, newdata = test)
nb_err  <- eval_pred(as.integer(as.character(nb.pred)), test_y, "Naive Bayes")
## 
## [Naive Bayes]  Test error: 0.1645 (16.4%)
##          Actual
## Predicted  0  1
##         0 68 20
##         1  5 59
train.X <- scale(train[, predictors])
test.X  <- scale(test[,  predictors],
                 center = attr(train.X, "scaled:center"),
                 scale  = attr(train.X, "scaled:scale"))

set.seed(1)
knn_errs <- sapply(1:50, function(k) {
  pred <- knn(train.X, test.X, train$crim01, k = k)
  mean(as.integer(as.character(pred)) != test_y)
})

best_k   <- which.min(knn_errs)
best_err <- knn_errs[best_k]

plot(1:50, knn_errs, type = "b", pch = 19, col = "blue",
     xlab = "K", ylab = "Test Error Rate",
     main = "KNN Test Error vs K (all predictors)")
abline(v = best_k, col = "red", lty = 2)
legend("topright", legend = paste0("Best K = ", best_k), col = "red", lty = 2)

set.seed(1)
knn.pred <- knn(train.X, test.X, train$crim01, k = best_k)
knn_err  <- eval_pred(as.integer(as.character(knn.pred)), test_y,
                      sprintf("KNN (K=%d)", best_k))
## 
## [KNN (K=3)]  Test error: 0.0461 (4.6%)
##          Actual
## Predicted  0  1
##         0 71  5
##         1  2 74
cat("\nTest errors for selected K values:\n")
## 
## Test errors for selected K values:
for (k in unique(c(1, 3, 5, 10, 15, 20, best_k))) {
  cat(sprintf("  K = %2d  ->  error = %.4f\n", k, knn_errs[k]))
}
##   K =  1  ->  error = 0.0789
##   K =  3  ->  error = 0.0461
##   K =  5  ->  error = 0.0592
##   K = 10  ->  error = 0.1053
##   K = 15  ->  error = 0.1184
##   K = 20  ->  error = 0.1447
summary_df <- data.frame(
  Method    = c("Logistic Regression", "LDA", "Naive Bayes",
                sprintf("KNN (K=%d)", best_k)),
  TestError = round(c(lr_err, lda_err, nb_err, best_err), 4)
)
print(summary_df[order(summary_df$TestError), ], row.names = FALSE)
##               Method TestError
##            KNN (K=3)    0.0461
##  Logistic Regression    0.1053
##                  LDA    0.1513
##          Naive Bayes    0.1645