4.1 The Stock Market Data

First, we load tidymodels and the necessary packages to access the data and specific models.

library(tidymodels)
library(ISLR)       # For the Smarket data set
library(ISLR2)      # For the Bikeshare data set
library(discrim)
library(poissonreg)
library(corrr)
library(paletteer)

We will be examining the Smarket data set. Before proceeding to modeling, let us explore the correlation between the numeric variables. We remove the qualitative Direction variable and use the corrr package to visualize the correlation matrix.

cor_Smarket <- Smarket %>%
  select(-Direction) %>%
  correlate()

# Simple correlation visualization
rplot(cor_Smarket, colours = c("indianred2", "black", "skyblue1"))

We can also create a customized heatmap-style correlation chart:

cor_Smarket %>%
  stretch() %>%
  ggplot(aes(x, y, fill = r)) +
  geom_tile() +
  geom_text(aes(label = as.character(fashion(r)))) +
  scale_fill_paletteer_c("scico::roma", limits = c(-1, 1), direction = -1)

Plotting Year against Volume reveals an upward trend in trade volume over time:

ggplot(Smarket, aes(Year, Volume)) +
  geom_jitter(height = 0)


4.2 Logistic Regression

Now we fit a logistic regression model to predict the Direction of the stock market based on the previous 5 days’ returns and the volume. We start by specifying the model with parsnip.

lr_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

Next, we fit the model using the entire Smarket dataset:

lr_fit <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket
  )

# Viewing the model parameters
lr_fit %>%
  pluck("fit") %>%
  summary()
## 
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
##     Lag5 + Volume, family = stats::binomial, data = data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
# Tidy format output
tidy(lr_fit)
## # A tibble: 7 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.126      0.241     -0.523   0.601
## 2 Lag1        -0.0731     0.0502    -1.46    0.145
## 3 Lag2        -0.0423     0.0501    -0.845   0.398
## 4 Lag3         0.0111     0.0499     0.222   0.824
## 5 Lag4         0.00936    0.0500     0.187   0.851
## 6 Lag5         0.0103     0.0495     0.208   0.835
## 7 Volume       0.135      0.158      0.855   0.392

We can use predict() to see the class predictions and the probability distributions for the training data:

predict(lr_fit, new_data = Smarket)
## # A tibble: 1,250 × 1
##    .pred_class
##    <fct>      
##  1 Up         
##  2 Down       
##  3 Down       
##  4 Up         
##  5 Up         
##  6 Up         
##  7 Down       
##  8 Up         
##  9 Up         
## 10 Down       
## # ℹ 1,240 more rows
predict(lr_fit, new_data = Smarket, type = "prob")
## # A tibble: 1,250 × 2
##    .pred_Down .pred_Up
##         <dbl>    <dbl>
##  1      0.493    0.507
##  2      0.519    0.481
##  3      0.519    0.481
##  4      0.485    0.515
##  5      0.489    0.511
##  6      0.493    0.507
##  7      0.507    0.493
##  8      0.491    0.509
##  9      0.482    0.518
## 10      0.511    0.489
## # ℹ 1,240 more rows

To evaluate the model’s training performance, we generate a confusion matrix and calculate the accuracy using the yardstick package (via augment()):

# Confusion Matrix
augment(lr_fit, new_data = Smarket) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down  145 141
##       Up    457 507
# Visualizing the Confusion Matrix
augment(lr_fit, new_data = Smarket) %>%
  conf_mat(truth = Direction, estimate = .pred_class) %>%
  autoplot(type = "heatmap")

# Training Accuracy
augment(lr_fit, new_data = Smarket) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.522

Train / Test Split

Evaluating a model on the data it was trained on often yields overly optimistic metrics. Let’s split the data into a training set (Years prior to 2005) and a testing set (Year 2005).

Smarket_train <- Smarket %>%
  filter(Year != 2005)

Smarket_test <- Smarket %>%
  filter(Year == 2005)

We refit the model on just the training data and evaluate it on the testing data:

lr_fit2 <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket_train
  )

# Testing Confusion Matrix
augment(lr_fit2, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down Up
##       Down   77 97
##       Up     34 44
# Testing Accuracy
augment(lr_fit2, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.480

Refining the Model

The previous model performed poorly on new data. Removing predictors that do not have a strong relationship with the response can reduce variance and potentially improve performance. Let’s refit using only Lag1 and Lag2.

lr_fit3 <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2,
    data = Smarket_train
  )

# Confusion Matrix for Refined Model
augment(lr_fit3, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
# Accuracy for Refined Model
augment(lr_fit3, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560

The model’s performance on the test data increased slightly.

Finally, suppose we want to predict the stock market direction for two specific new days based on specific Lag1 and Lag2 values:

Smarket_new <- tibble(
  Lag1 = c(1.2, 1.5),
  Lag2 = c(1.1, -0.8)
)

# Predicting Probabilities for custom data points
predict(
  lr_fit3,
  new_data = Smarket_new,
  type = "prob"
)
## # A tibble: 2 × 2
##   .pred_Down .pred_Up
##        <dbl>    <dbl>
## 1      0.521    0.479
## 2      0.504    0.496