Overview

This document replicates Sections 4.1 and 4.2 of the ISLR tidymodels labs by Emil Hvitfeldt, which accompany the textbook An Introduction to Statistical Learning (James et al.).


4.1 The Stock Market Data

Load Libraries

library(tidymodels)
library(ISLR)  
library(ISLR2)  
library(discrim)
library(poissonreg)
library(corrr)

Explore the Smarket Dataset

glimpse(Smarket)
## Rows: 1,250
## Columns: 9
## $ Year      <dbl> 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, …
## $ Lag1      <dbl> 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0.403, 0.…
## $ Lag2      <dbl> -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0…
## $ Lag3      <dbl> -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1…
## $ Lag4      <dbl> -1.055, -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, …
## $ Lag5      <dbl> 5.010, -1.055, -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, …
## $ Volume    <dbl> 1.1913, 1.2965, 1.4112, 1.2760, 1.2057, 1.3491, 1.4450, 1.40…
## $ Today     <dbl> 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0.403, 0.027, 1.…
## $ Direction <fct> Up, Up, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up, …
head(Smarket)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up

Correlation Matrix

cor_Smarket <- Smarket %>%
  select(-Direction) %>%
  correlate()

cor_Smarket
## # A tibble: 8 × 9
##   term      Year     Lag1     Lag2     Lag3     Lag4     Lag5  Volume    Today
##   <chr>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
## 1 Year   NA       0.0297   0.0306   0.0332   0.0357   0.0298   0.539   0.0301 
## 2 Lag1    0.0297 NA       -0.0263  -0.0108  -0.00299 -0.00567  0.0409 -0.0262 
## 3 Lag2    0.0306 -0.0263  NA       -0.0259  -0.0109  -0.00356 -0.0434 -0.0103 
## 4 Lag3    0.0332 -0.0108  -0.0259  NA       -0.0241  -0.0188  -0.0418 -0.00245
## 5 Lag4    0.0357 -0.00299 -0.0109  -0.0241  NA       -0.0271  -0.0484 -0.00690
## 6 Lag5    0.0298 -0.00567 -0.00356 -0.0188  -0.0271  NA       -0.0220 -0.0349 
## 7 Volume  0.539   0.0409  -0.0434  -0.0418  -0.0484  -0.0220  NA       0.0146 
## 8 Today   0.0301 -0.0262  -0.0103  -0.00245 -0.00690 -0.0349   0.0146 NA

Correlation Plot (rplot)

rplot(cor_Smarket, colours = c("indianred2", "black", "skyblue1"))

Correlation Heatmap (manual ggplot2)

cor_Smarket %>%
  stretch() %>%
  ggplot(aes(x, y, fill = r)) +
  geom_tile() +
  geom_text(aes(label = as.character(fashion(r)))) +
  scale_fill_gradient2(
    low      = "darkorange",
    mid      = "lightgreen",
    high     = "darkgreen",
    midpoint = 0,
    limits   = c(-1, 1)
  ) +
  labs(title = "Correlation Matrix – Smarket",
       x = NULL, y = NULL, fill = "r") +
  theme_minimal()

Year vs Volume

ggplot(Smarket, aes(Year, Volume)) +
  geom_jitter(height = 0, alpha = 0.4, colour = "steelblue") +
  labs(title = "Trading Volume by Year",
       x = "Year", y = "Volume (billions of shares)") +
  theme_minimal()


4.2 Logistic Regression

Model Specification

lr_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

lr_spec
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm

Fit on Full Dataset

lr_fit <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket
  )

lr_fit
## parsnip model object
## 
## 
## Call:  stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
##     Lag5 + Volume, family = stats::binomial, data = data)
## 
## Coefficients:
## (Intercept)         Lag1         Lag2         Lag3         Lag4         Lag5  
##   -0.126000    -0.073074    -0.042301     0.011085     0.009359     0.010313  
##      Volume  
##    0.135441  
## 
## Degrees of Freedom: 1249 Total (i.e. Null);  1243 Residual
## Null Deviance:       1731 
## Residual Deviance: 1728  AIC: 1742

Detailed Summary

lr_fit %>%
  pluck("fit") %>%
  summary()
## 
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
##     Lag5 + Volume, family = stats::binomial, data = data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3

Tidy Coefficient Table

tidy(lr_fit)
## # A tibble: 7 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.126      0.241     -0.523   0.601
## 2 Lag1        -0.0731     0.0502    -1.46    0.145
## 3 Lag2        -0.0423     0.0501    -0.845   0.398
## 4 Lag3         0.0111     0.0499     0.222   0.824
## 5 Lag4         0.00936    0.0500     0.187   0.851
## 6 Lag5         0.0103     0.0495     0.208   0.835
## 7 Volume       0.135      0.158      0.855   0.392

Predictions on Training Data

Class Predictions

predict(lr_fit, new_data = Smarket)
## # A tibble: 1,250 × 1
##    .pred_class
##    <fct>      
##  1 Up         
##  2 Down       
##  3 Down       
##  4 Up         
##  5 Up         
##  6 Up         
##  7 Down       
##  8 Up         
##  9 Up         
## 10 Down       
## # ℹ 1,240 more rows

Probability Predictions

predict(lr_fit, new_data = Smarket, type = "prob")
## # A tibble: 1,250 × 2
##    .pred_Down .pred_Up
##         <dbl>    <dbl>
##  1      0.493    0.507
##  2      0.519    0.481
##  3      0.519    0.481
##  4      0.485    0.515
##  5      0.489    0.511
##  6      0.493    0.507
##  7      0.507    0.493
##  8      0.491    0.509
##  9      0.482    0.518
## 10      0.511    0.489
## # ℹ 1,240 more rows

Confusion Matrix (Training Data)

augment(lr_fit, new_data = Smarket) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down  145 141
##       Up    457 507
augment(lr_fit, new_data = Smarket) %>%
  conf_mat(truth = Direction, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix – Full Data (Training = Test)")

Accuracy (Training Data)

augment(lr_fit, new_data = Smarket) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.522

Train / Test Split by Year

Since the data has a temporal dimension, we split on year: train on 2001–2004, test on 2005.

Smarket_train <- Smarket %>% filter(Year != 2005)
Smarket_test  <- Smarket %>% filter(Year == 2005)

nrow(Smarket_train)
## [1] 998
nrow(Smarket_test)
## [1] 252

Fit on Training Data (All 6 Predictors)

lr_fit2 <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket_train
  )

Confusion Matrix on Test Data

augment(lr_fit2, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down Up
##       Down   77 97
##       Up     34 44
augment(lr_fit2, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix – 6 Predictors, Test Year = 2005")

Accuracy on Test Data

augment(lr_fit2, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.480

Reduced Model: Lag1 + Lag2 Only

lr_fit3 <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2,
    data = Smarket_train
  )

Confusion Matrix

augment(lr_fit3, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
augment(lr_fit3, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix – Lag1 + Lag2 Only, Test Year = 2005")

Accuracy

augment(lr_fit3, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560

Predicting New Observations

We predict Direction for two hypothetical trading days:

Scenario Lag1 Lag2
1 1.2 1.1
2 1.5 -0.8
Smarket_new <- tibble(
  Lag1 = c(1.2, 1.5),
  Lag2 = c(1.1, -0.8)
)

predict(
  lr_fit3,
  new_data = Smarket_new,
  type = "prob"
)
## # A tibble: 2 × 2
##   .pred_Down .pred_Up
##        <dbl>    <dbl>
## 1      0.521    0.479
## 2      0.504    0.496

Replicated from ISLR tidymodels labs, Ch. 4 by Emil Hvitfeldt.