library(tidymodels)
library(ISLR2)
library(dplyr)
library(discrim)
library(kknn)
# ✅ 3. Load and Inspect Data (Weekly dataset)

data(Weekly)
glimpse(Weekly)
## Rows: 1,089
## Columns: 9
## $ Year      <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, …
## $ Lag1      <dbl> 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0…
## $ Lag2      <dbl> 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0…
## $ Lag3      <dbl> -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -…
## $ Lag4      <dbl> -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, …
## $ Lag5      <dbl> -3.484, -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514,…
## $ Volume    <dbl> 0.1549760, 0.1485740, 0.1598375, 0.1616300, 0.1537280, 0.154…
## $ Today     <dbl> -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0.041, 1…
## $ Direction <fct> Down, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up, Up…
# 🔹 Section 4.1 — Logistic Regression

## 1: Fit Logistic Regression Model

log_model <- logistic_reg() %>%
  set_engine("glm")

log_fit <- log_model %>%
  fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
      data = Weekly)

summary(log_fit$fit)
## 
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
##     Lag5 + Volume, family = stats::binomial, data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
## 2: Predictions

log_preds <- predict(log_fit, Weekly, type = "prob") %>%
  bind_cols(predict(log_fit, Weekly)) %>%
  bind_cols(Weekly)

head(log_preds)
## # A tibble: 6 × 12
##   .pred_Down .pred_Up .pred_class  Year   Lag1   Lag2   Lag3   Lag4   Lag5
##        <dbl>    <dbl> <fct>       <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1      0.391    0.609 Up           1990  0.816  1.57  -3.94  -0.229 -3.48 
## 2      0.399    0.601 Up           1990 -0.27   0.816  1.57  -3.94  -0.229
## 3      0.412    0.588 Up           1990 -2.58  -0.27   0.816  1.57  -3.94 
## 4      0.518    0.482 Down         1990  3.51  -2.58  -0.27   0.816  1.57 
## 5      0.383    0.617 Up           1990  0.712  3.51  -2.58  -0.27   0.816
## 6      0.432    0.568 Up           1990  1.18   0.712  3.51  -2.58  -0.27 
## # ℹ 3 more variables: Volume <dbl>, Today <dbl>, Direction <fct>
## 3: Confusion Matrix

log_preds %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   54  48
##       Up    430 557
## 4: Accuracy

log_preds %>%
  metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary        0.561 
## 2 kap      binary        0.0350
## 5: Train/Test Split (IMPORTANT)

weekly_split <- initial_time_split(Weekly, prop = 0.7)

train_data <- training(weekly_split)
test_data  <- testing(weekly_split)
## 6: Train on Training Data

log_fit2 <- log_model %>%
  fit(Direction ~ Lag2, data = train_data)
## 7: Test Performance

test_preds <- predict(log_fit2, test_data) %>%
  bind_cols(test_data)

test_preds %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   12   8
##       Up    134 173
test_preds %>%
  metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary        0.566 
## 2 kap      binary        0.0415
# 🔹 Section 4.2 — LDA, QDA, and KNN

## 🔸 LDA (Linear Discriminant Analysis)

lda_model <- discrim_linear() %>%
  set_engine("MASS")

lda_fit <- lda_model %>%
  fit(Direction ~ Lag2, data = train_data)

lda_preds <- predict(lda_fit, test_data) %>%
  bind_cols(test_data)

lda_preds %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   12   8
##       Up    134 173
lda_preds %>%
  metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary        0.566 
## 2 kap      binary        0.0415
## 🔸 QDA (Quadratic Discriminant Analysis)

qda_model <- discrim_quad() %>%
  set_engine("MASS")

qda_fit <- qda_model %>%
  fit(Direction ~ Lag2, data = train_data)

qda_preds <- predict(qda_fit, test_data) %>%
  bind_cols(test_data)

qda_preds %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down    0   0
##       Up    146 181
qda_preds %>%
  metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.554
## 2 kap      binary         0
## 🔸 KNN (K-Nearest Neighbors)

knn_model <- nearest_neighbor(neighbors = 5) %>%
  set_engine("kknn") %>%
  set_mode("classification")

knn_fit <- knn_model %>%
  fit(Direction ~ Lag2, data = train_data)

knn_preds <- predict(knn_fit, test_data) %>%
  bind_cols(test_data)

knn_preds %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   55  72
##       Up     91 109
knn_preds %>%
  metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary        0.502 
## 2 kap      binary       -0.0213

Results Summary

From the results, Logistic Regression and LDA perform similarly and achieve the highest accuracy among the models. This is expected because both methods assume a linear decision boundary.

QDA performs slightly worse, likely because it estimates more flexible (quadratic) boundaries, which may not be necessary for this dataset and can lead to overfitting.

KNN shows the lowest performance, which may be due to the small number of predictors (only Lag2) and lack of strong local structure in the data.

Overall, Logistic Regression and LDA are the best-performing models for predicting market direction in this case.