Load Libraries

# Install any missing packages before loading
required_packages <- c("tidymodels", "ISLR2", "ggplot2", "corrplot")

for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(tidymodels)
library(ISLR2)
library(ggplot2)
library(corrplot)

4.1 The Stock Market Data

Correlation Matrix

# Compute correlation matrix (excluding the Direction factor column)
cor_Smarket <- cor(Smarket[, sapply(Smarket, is.numeric)])
print(round(cor_Smarket, 3))
##         Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today
## Year   1.000  0.030  0.031  0.033  0.036  0.030  0.539  0.030
## Lag1   0.030  1.000 -0.026 -0.011 -0.003 -0.006  0.041 -0.026
## Lag2   0.031 -0.026  1.000 -0.026 -0.011 -0.004 -0.043 -0.010
## Lag3   0.033 -0.011 -0.026  1.000 -0.024 -0.019 -0.042 -0.002
## Lag4   0.036 -0.003 -0.011 -0.024  1.000 -0.027 -0.048 -0.007
## Lag5   0.030 -0.006 -0.004 -0.019 -0.027  1.000 -0.022 -0.035
## Volume 0.539  0.041 -0.043 -0.042 -0.048 -0.022  1.000  0.015
## Today  0.030 -0.026 -0.010 -0.002 -0.007 -0.035  0.015  1.000

Correlation Plot

corrplot(cor_Smarket,
         method      = "color",
         type        = "upper",
         tl.col      = "black",
         addCoef.col = "black",
         number.cex  = 0.7,
         col         = colorRampPalette(c("indianred2", "white", "skyblue1"))(200),
         title       = "Smarket Correlation Matrix",
         mar         = c(0, 0, 2, 0))

Heatmap-Style Correlation Plot

# Reshape correlation matrix to long format for ggplot2
cor_long <- as.data.frame(as.table(cor_Smarket))
names(cor_long) <- c("x", "y", "r")

ggplot(cor_long, aes(x = x, y = y, fill = r)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(r, 2)), size = 3) +
  scale_fill_gradient2(
    low      = "indianred2",
    mid      = "white",
    high     = "skyblue1",
    midpoint = 0,
    limits   = c(-1, 1),
    name     = "Correlation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Smarket Heatmap-Style Correlation Plot", x = "", y = "")

Volume vs Year Plot

ggplot(Smarket, aes(x = Year, y = Volume)) +
  geom_jitter(height = 0, alpha = 0.4, color = "steelblue") +
  theme_minimal() +
  labs(title = "Volume vs Year", x = "Year", y = "Volume")


4.2 Logistic Regression

Model Specification

lr_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

Fit Full Model on Entire Dataset

lr_fit <- lr_spec %>%
  fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
      data = Smarket)

summary(lr_fit$fit)
## 
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
##     Lag5 + Volume, family = stats::binomial, data = data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
tidy(lr_fit)
## # A tibble: 7 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.126      0.241     -0.523   0.601
## 2 Lag1        -0.0731     0.0502    -1.46    0.145
## 3 Lag2        -0.0423     0.0501    -0.845   0.398
## 4 Lag3         0.0111     0.0499     0.222   0.824
## 5 Lag4         0.00936    0.0500     0.187   0.851
## 6 Lag5         0.0103     0.0495     0.208   0.835
## 7 Volume       0.135      0.158      0.855   0.392

Predictions (Full Model)

predict(lr_fit, new_data = Smarket)
## # A tibble: 1,250 × 1
##    .pred_class
##    <fct>      
##  1 Up         
##  2 Down       
##  3 Down       
##  4 Up         
##  5 Up         
##  6 Up         
##  7 Down       
##  8 Up         
##  9 Up         
## 10 Down       
## # ℹ 1,240 more rows
predict(lr_fit, new_data = Smarket, type = "prob")
## # A tibble: 1,250 × 2
##    .pred_Down .pred_Up
##         <dbl>    <dbl>
##  1      0.493    0.507
##  2      0.519    0.481
##  3      0.519    0.481
##  4      0.485    0.515
##  5      0.489    0.511
##  6      0.493    0.507
##  7      0.507    0.493
##  8      0.491    0.509
##  9      0.482    0.518
## 10      0.511    0.489
## # ℹ 1,240 more rows

Confusion Matrix (Training Data)

augment(lr_fit, new_data = Smarket) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down  145 141
##       Up    457 507

Accuracy (Training Data)

augment(lr_fit, new_data = Smarket) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.522

Train / Test Split (Time-Based)

Smarket_train <- Smarket %>%
  filter(Year != 2005)

Smarket_test <- Smarket %>%
  filter(Year == 2005)

Fit Full Model on Training Data

lr_fit2 <- lr_spec %>%
  fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
      data = Smarket_train)

Evaluate on Test Data

augment(lr_fit2, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down Up
##       Down   77 97
##       Up     34 44
augment(lr_fit2, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.480

Reduced Model (Lag1 & Lag2 Only)

lr_fit3 <- lr_spec %>%
  fit(Direction ~ Lag1 + Lag2,
      data = Smarket_train)

Evaluate Reduced Model

augment(lr_fit3, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
augment(lr_fit3, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560

Predict New Observations

Smarket_new <- tibble(
  Lag1 = c(1.2, 1.5),
  Lag2 = c(1.1, -0.8)
)

predict(lr_fit3, new_data = Smarket_new, type = "prob")
## # A tibble: 2 × 2
##   .pred_Down .pred_Up
##        <dbl>    <dbl>
## 1      0.521    0.479
## 2      0.504    0.496