2. Load and Explore Data
# Load the Smarket dataset
smarket <- ISLR2::Smarket
# Summary statistics using skimr
skimr::skim(smarket)
Data summary
Name |
smarket |
Number of rows |
1250 |
Number of columns |
9 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
numeric |
8 |
________________________ |
|
Group variables |
None |
Variable type: factor
Direction |
0 |
1 |
FALSE |
2 |
Up: 648, Dow: 602 |
Variable type: numeric
Year |
0 |
1 |
2003.02 |
1.41 |
2001.00 |
2002.00 |
2003.00 |
2004.00 |
2005.00 |
▇▇▇▇▇ |
Lag1 |
0 |
1 |
0.00 |
1.14 |
-4.92 |
-0.64 |
0.04 |
0.60 |
5.73 |
▁▃▇▁▁ |
Lag2 |
0 |
1 |
0.00 |
1.14 |
-4.92 |
-0.64 |
0.04 |
0.60 |
5.73 |
▁▃▇▁▁ |
Lag3 |
0 |
1 |
0.00 |
1.14 |
-4.92 |
-0.64 |
0.04 |
0.60 |
5.73 |
▁▃▇▁▁ |
Lag4 |
0 |
1 |
0.00 |
1.14 |
-4.92 |
-0.64 |
0.04 |
0.60 |
5.73 |
▁▃▇▁▁ |
Lag5 |
0 |
1 |
0.01 |
1.15 |
-4.92 |
-0.64 |
0.04 |
0.60 |
5.73 |
▁▃▇▁▁ |
Volume |
0 |
1 |
1.48 |
0.36 |
0.36 |
1.26 |
1.42 |
1.64 |
3.15 |
▁▇▅▁▁ |
Today |
0 |
1 |
0.00 |
1.14 |
-4.92 |
-0.64 |
0.04 |
0.60 |
5.73 |
▁▃▇▁▁ |
3. Correlation Analysis
# Compute pairwise correlation and display as a gt table
smarket %>%
select(-Direction) %>%
corrr::correlate(method = "pearson", quiet = TRUE) %>%
gt(rowname_col = "term")
|
Year |
Lag1 |
Lag2 |
Lag3 |
Lag4 |
Lag5 |
Volume |
Today |
Year |
NA |
0.029699649 |
0.030596422 |
0.033194581 |
0.035688718 |
0.029787995 |
0.53900647 |
0.030095229 |
Lag1 |
0.02969965 |
NA |
-0.026294328 |
-0.010803402 |
-0.002985911 |
-0.005674606 |
0.04090991 |
-0.026155045 |
Lag2 |
0.03059642 |
-0.026294328 |
NA |
-0.025896670 |
-0.010853533 |
-0.003557949 |
-0.04338321 |
-0.010250033 |
Lag3 |
0.03319458 |
-0.010803402 |
-0.025896670 |
NA |
-0.024051036 |
-0.018808338 |
-0.04182369 |
-0.002447647 |
Lag4 |
0.03568872 |
-0.002985911 |
-0.010853533 |
-0.024051036 |
NA |
-0.027083641 |
-0.04841425 |
-0.006899527 |
Lag5 |
0.02978799 |
-0.005674606 |
-0.003557949 |
-0.018808338 |
-0.027083641 |
NA |
-0.02200231 |
-0.034860083 |
Volume |
0.53900647 |
0.040909908 |
-0.043383215 |
-0.041823686 |
-0.048414246 |
-0.022002315 |
NA |
0.014591823 |
Today |
0.03009523 |
-0.026155045 |
-0.010250033 |
-0.002447647 |
-0.006899527 |
-0.034860083 |
0.01459182 |
NA |
4. Visualization: Stock Market Volume Over Time
smarket %>%
ggplot(aes(x = factor(Year), y = Volume)) +
geom_jitter(width = 0.3, color = "yellow") +
geom_boxplot(alpha = 0.3, outlier.shape = NA, width = 0.2)

5. Logistic Regression Model
# Fit logistic regression model
glm_direction_fit <-
logistic_reg(mode = "classification", engine = "glm") %>%
fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = smarket)
# Confusion matrix
glm_direction_conf_mat <-
augment(glm_direction_fit, smarket) %>%
yardstick::conf_mat(.pred_class, Direction)
glm_direction_conf_mat
## Truth
## Prediction Down Up
## Down 145 457
## Up 141 507
6. Model Accuracy
summary(glm_direction_conf_mat)
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.522
## 2 kap binary 0.0237
## 3 sens binary 0.507
## 4 spec binary 0.526
## 5 ppv binary 0.241
## 6 npv binary 0.782
## 7 mcc binary 0.0277
## 8 j_index binary 0.0329
## 9 bal_accuracy binary 0.516
## 10 detection_prevalence binary 0.482
## 11 precision binary 0.241
## 12 recall binary 0.507
## 13 f_meas binary 0.327
7. Split Data into Training and Testing Sets
# Split data manually based on year
smarket_split <- make_splits(
x = list(
"analysis" = which(smarket$Year < 2005),
"assessment" = which(smarket$Year == 2005)
),
data = smarket
)
# Create training and testing datasets
smarket_train <- training(smarket_split)
smarket_test <- testing(smarket_split)
8. Re-train Model on Training Data
glm_direction_fit <-
logistic_reg(mode = "classification", engine = "glm") %>%
fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = smarket_train)
# Evaluate training accuracy
glm_direction_train_pred <- glm_direction_fit %>% augment(smarket_train)
glm_direction_train_pred %>%
accuracy(truth = Direction, .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.527
10. Try a Simpler Model
# Fit a simpler logistic regression model with fewer predictors
glm_direction_fit_simple <-
logistic_reg(mode = "classification", engine = "glm") %>%
fit(Direction ~ Lag1 + Lag2, data = smarket_train)
# Evaluate new model
glm_direction_fit_simple %>%
augment(smarket_test) %>%
accuracy(truth = Direction, .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.560