library(tidymodels)
library(ISLR2)
library(dplyr)
library(discrim)
library(kknn)
# ✅ 3. Load and Inspect Data (Weekly dataset)
data(Weekly)
glimpse(Weekly)
## Rows: 1,089
## Columns: 9
## $ Year <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, …
## $ Lag1 <dbl> 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0…
## $ Lag2 <dbl> 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0…
## $ Lag3 <dbl> -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -…
## $ Lag4 <dbl> -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, …
## $ Lag5 <dbl> -3.484, -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514,…
## $ Volume <dbl> 0.1549760, 0.1485740, 0.1598375, 0.1616300, 0.1537280, 0.154…
## $ Today <dbl> -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0.041, 1…
## $ Direction <fct> Down, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up, Up…
# 🔹 Section 4.1 — Logistic Regression
## 1: Fit Logistic Regression Model
log_model <- logistic_reg() %>%
set_engine("glm")
log_fit <- log_model %>%
fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly)
summary(log_fit$fit)
##
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 +
## Lag5 + Volume, family = stats::binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
## 2: Predictions
log_preds <- predict(log_fit, Weekly, type = "prob") %>%
bind_cols(predict(log_fit, Weekly)) %>%
bind_cols(Weekly)
head(log_preds)
## # A tibble: 6 × 12
## .pred_Down .pred_Up .pred_class Year Lag1 Lag2 Lag3 Lag4 Lag5
## <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.391 0.609 Up 1990 0.816 1.57 -3.94 -0.229 -3.48
## 2 0.399 0.601 Up 1990 -0.27 0.816 1.57 -3.94 -0.229
## 3 0.412 0.588 Up 1990 -2.58 -0.27 0.816 1.57 -3.94
## 4 0.518 0.482 Down 1990 3.51 -2.58 -0.27 0.816 1.57
## 5 0.383 0.617 Up 1990 0.712 3.51 -2.58 -0.27 0.816
## 6 0.432 0.568 Up 1990 1.18 0.712 3.51 -2.58 -0.27
## # ℹ 3 more variables: Volume <dbl>, Today <dbl>, Direction <fct>
## 3: Confusion Matrix
log_preds %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 54 48
## Up 430 557
## 4: Accuracy
log_preds %>%
metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.561
## 2 kap binary 0.0350
## 5: Train/Test Split (IMPORTANT)
weekly_split <- initial_time_split(Weekly, prop = 0.7)
train_data <- training(weekly_split)
test_data <- testing(weekly_split)
## 6: Train on Training Data
log_fit2 <- log_model %>%
fit(Direction ~ Lag2, data = train_data)
## 7: Test Performance
test_preds <- predict(log_fit2, test_data) %>%
bind_cols(test_data)
test_preds %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 12 8
## Up 134 173
test_preds %>%
metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.566
## 2 kap binary 0.0415
# 🔹 Section 4.2 — LDA, QDA, and KNN
## 🔸 LDA (Linear Discriminant Analysis)
lda_model <- discrim_linear() %>%
set_engine("MASS")
lda_fit <- lda_model %>%
fit(Direction ~ Lag2, data = train_data)
lda_preds <- predict(lda_fit, test_data) %>%
bind_cols(test_data)
lda_preds %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 12 8
## Up 134 173
lda_preds %>%
metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.566
## 2 kap binary 0.0415
## 🔸 QDA (Quadratic Discriminant Analysis)
qda_model <- discrim_quad() %>%
set_engine("MASS")
qda_fit <- qda_model %>%
fit(Direction ~ Lag2, data = train_data)
qda_preds <- predict(qda_fit, test_data) %>%
bind_cols(test_data)
qda_preds %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 0 0
## Up 146 181
qda_preds %>%
metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.554
## 2 kap binary 0
## 🔸 KNN (K-Nearest Neighbors)
knn_model <- nearest_neighbor(neighbors = 5) %>%
set_engine("kknn") %>%
set_mode("classification")
knn_fit <- knn_model %>%
fit(Direction ~ Lag2, data = train_data)
knn_preds <- predict(knn_fit, test_data) %>%
bind_cols(test_data)
knn_preds %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 55 72
## Up 91 109
knn_preds %>%
metrics(truth = Direction, estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.502
## 2 kap binary -0.0213
From the results, Logistic Regression and LDA perform similarly and achieve the highest accuracy among the models. This is expected because both methods assume a linear decision boundary.
QDA performs slightly worse, likely because it estimates more flexible (quadratic) boundaries, which may not be necessary for this dataset and can lead to overfitting.
KNN shows the lowest performance, which may be due to the small number of predictors (only Lag2) and lack of strong local structure in the data.
Overall, Logistic Regression and LDA are the best-performing models for predicting market direction in this case.