2. Load and Explore Data

# Load the Smarket dataset
smarket <- ISLR2::Smarket

# Summary statistics using skimr
skimr::skim(smarket)
Data summary
Name smarket
Number of rows 1250
Number of columns 9
_______________________
Column type frequency:
factor 1
numeric 8
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Direction 0 1 FALSE 2 Up: 648, Dow: 602

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Year 0 1 2003.02 1.41 2001.00 2002.00 2003.00 2004.00 2005.00 ▇▇▇▇▇
Lag1 0 1 0.00 1.14 -4.92 -0.64 0.04 0.60 5.73 ▁▃▇▁▁
Lag2 0 1 0.00 1.14 -4.92 -0.64 0.04 0.60 5.73 ▁▃▇▁▁
Lag3 0 1 0.00 1.14 -4.92 -0.64 0.04 0.60 5.73 ▁▃▇▁▁
Lag4 0 1 0.00 1.14 -4.92 -0.64 0.04 0.60 5.73 ▁▃▇▁▁
Lag5 0 1 0.01 1.15 -4.92 -0.64 0.04 0.60 5.73 ▁▃▇▁▁
Volume 0 1 1.48 0.36 0.36 1.26 1.42 1.64 3.15 ▁▇▅▁▁
Today 0 1 0.00 1.14 -4.92 -0.64 0.04 0.60 5.73 ▁▃▇▁▁

3. Correlation Analysis

# Compute pairwise correlation and display as a gt table
smarket %>% 
  select(-Direction) %>%
  corrr::correlate(method = "pearson", quiet = TRUE) %>%
  gt(rowname_col = "term")
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
Year NA 0.029699649 0.030596422 0.033194581 0.035688718 0.029787995 0.53900647 0.030095229
Lag1 0.02969965 NA -0.026294328 -0.010803402 -0.002985911 -0.005674606 0.04090991 -0.026155045
Lag2 0.03059642 -0.026294328 NA -0.025896670 -0.010853533 -0.003557949 -0.04338321 -0.010250033
Lag3 0.03319458 -0.010803402 -0.025896670 NA -0.024051036 -0.018808338 -0.04182369 -0.002447647
Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 NA -0.027083641 -0.04841425 -0.006899527
Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641 NA -0.02200231 -0.034860083
Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246 -0.022002315 NA 0.014591823
Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527 -0.034860083 0.01459182 NA

4. Visualization: Stock Market Volume Over Time

smarket %>%
  ggplot(aes(x = factor(Year), y = Volume)) +
  geom_jitter(width = 0.3, color = "yellow") +
  geom_boxplot(alpha = 0.3, outlier.shape = NA, width = 0.2)

5. Logistic Regression Model

# Fit logistic regression model
glm_direction_fit <- 
  logistic_reg(mode = "classification", engine = "glm") %>%
  fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = smarket)

# Confusion matrix
glm_direction_conf_mat <- 
  augment(glm_direction_fit, smarket) %>%
  yardstick::conf_mat(.pred_class, Direction)

glm_direction_conf_mat
##           Truth
## Prediction Down  Up
##       Down  145 457
##       Up    141 507

6. Model Accuracy

summary(glm_direction_conf_mat)
## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.522 
##  2 kap                  binary        0.0237
##  3 sens                 binary        0.507 
##  4 spec                 binary        0.526 
##  5 ppv                  binary        0.241 
##  6 npv                  binary        0.782 
##  7 mcc                  binary        0.0277
##  8 j_index              binary        0.0329
##  9 bal_accuracy         binary        0.516 
## 10 detection_prevalence binary        0.482 
## 11 precision            binary        0.241 
## 12 recall               binary        0.507 
## 13 f_meas               binary        0.327

7. Split Data into Training and Testing Sets

# Split data manually based on year
smarket_split <- make_splits(
  x = list(
    "analysis" = which(smarket$Year < 2005),
    "assessment" = which(smarket$Year == 2005)
  ),
  data = smarket
)

# Create training and testing datasets
smarket_train <- training(smarket_split)
smarket_test <- testing(smarket_split)

8. Re-train Model on Training Data

glm_direction_fit <- 
  logistic_reg(mode = "classification", engine = "glm") %>%
  fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = smarket_train)

# Evaluate training accuracy
glm_direction_train_pred <- glm_direction_fit %>% augment(smarket_train)

glm_direction_train_pred %>%
  accuracy(truth = Direction, .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.527

9. Test Model Performance

# Evaluate on test data
glm_direction_test_pred <- glm_direction_fit %>% augment(smarket_test)

glm_direction_test_pred %>%
  accuracy(truth = Direction, .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.480

10. Try a Simpler Model

# Fit a simpler logistic regression model with fewer predictors
glm_direction_fit_simple <- 
  logistic_reg(mode = "classification", engine = "glm") %>%
  fit(Direction ~ Lag1 + Lag2, data = smarket_train)

# Evaluate new model
glm_direction_fit_simple %>%
  augment(smarket_test) %>%
  accuracy(truth = Direction, .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560