# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ infer        1.0.8     ✔ workflows    1.2.0
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.1     ✔ yardstick    1.3.2
## ✔ recipes      1.3.0     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(ISLR)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## 
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
# kknn, discrim, and poissonreg supply the model engines used below; the
# remaining packages here are already attached by tidyverse/tidymodels above
library(kknn)
library(stringr)
library(forcats)
library(discrim)
## 
## Attaching package: 'discrim'
## 
## The following object is masked from 'package:dials':
## 
##     smoothness
library(parsnip)
library(recipes)
library(workflows)
library(poissonreg)
library(broom)

# Split the data: train on years before 2005, test on 2005
Smarket_train <- Smarket %>% filter(Year < 2005)
Smarket_test <- Smarket %>% filter(Year == 2005)
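
# Optional sanity check on the split sizes (train covers 2001-2004, test is 2005)
dim(Smarket_train)
dim(Smarket_test)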

# Logistic Regression (Lag1 + Lag2)
lr_spec <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

lr_fit <- lr_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

# Summary of the underlying glm object stored in the parsnip fit
summary(lr_fit$fit)
## 
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2, family = stats::binomial, 
##     data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.03222    0.06338   0.508    0.611
## Lag1        -0.05562    0.05171  -1.076    0.282
## Lag2        -0.04449    0.05166  -0.861    0.389
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.4  on 995  degrees of freedom
## AIC: 1387.4
## 
## Number of Fisher Scoring iterations: 3
# Predictions + Confusion Matrix + Accuracy
augment(lr_fit, new_data = Smarket_test) %>% conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
augment(lr_fit, new_data = Smarket_test) %>% accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560
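# Optional: tidied coefficients and test-set class probabilities for the
# two-lag model (a sketch; tidy() and predict(type = "prob") come from
# broom and parsnip)
tidy(lr_fit)
predict(lr_fit, new_data = Smarket_test, type = "prob") %>% head()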
# Logistic Regression (all Lags + Volume)
lr_fit2 <- lr_spec %>%
  fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket_train)

augment(lr_fit2, new_data = Smarket_test) %>% conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down Up
##       Down   77 97
##       Up     34 44
augment(lr_fit2, new_data = Smarket_test) %>% accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.480
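# For context, the share of "Up" days in the 2005 test set gives a simple
# majority-class baseline to compare these accuracies against (a sketch)
Smarket_test %>%
  count(Direction) %>%
  mutate(prop = n / sum(n))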
# Linear Discriminant Analysis
lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

lda_fit <- lda_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

augment(lda_fit, new_data = Smarket_test) %>% conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
augment(lda_fit, new_data = Smarket_test) %>% accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560
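# Optional: the underlying MASS::lda object (prior probabilities, group means,
# linear discriminant coefficients) can be inspected via extract_fit_engine()
lda_fit %>% extract_fit_engine()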
# Quadratic Discriminant Analysis
qda_spec <- discrim_quad() %>%
  set_mode("classification") %>%
  set_engine("MASS")

qda_fit <- qda_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

augment(qda_fit, new_data = Smarket_test) %>% conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   30  20
##       Up     81 121
augment(qda_fit, new_data = Smarket_test) %>% accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.599
# K-Nearest Neighbors (K=3)
knn_spec <- nearest_neighbor(neighbors = 3) %>%
  set_mode("classification") %>%
  set_engine("kknn")

knn_fit <- knn_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

augment(knn_fit, new_data = Smarket_test) %>% conf_mat(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
##           Truth
## Prediction Down Up
##       Down   43 58
##       Up     68 83
augment(knn_fit, new_data = Smarket_test) %>% accuracy(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary           0.5
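# A hedged sketch of how the number of neighbors could be tuned rather than
# fixed at 3: 5-fold cross-validation on the training years with tune_grid()
# (vfold_cv, tune, and select_best come from rsample/tune; the grid values
# are illustrative)
knn_tune_wf <- workflow() %>%
  add_formula(Direction ~ Lag1 + Lag2) %>%
  add_model(nearest_neighbor(neighbors = tune()) %>%
              set_mode("classification") %>%
              set_engine("kknn"))

set.seed(1)
knn_tune_res <- tune_grid(
  knn_tune_wf,
  resamples = vfold_cv(Smarket_train, v = 5),
  grid = tibble(neighbors = c(1, 3, 5, 10, 25))
)
select_best(knn_tune_res, metric = "accuracy")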
# Caravan insurance data - KNN (K = 1, 3, 5); following ISLR, the first
# 1,000 rows are held out as the test set
Caravan_test <- Caravan[1:1000, ]
Caravan_train <- Caravan[-(1:1000), ]

# Recipe: standardize all numeric predictors so KNN distances are not
# dominated by variables on large scales
rec_spec <- recipe(Purchase ~ ., data = Caravan_train) %>%
  step_normalize(all_numeric_predictors())

Caravan_wf <- workflow() %>%
  add_recipe(rec_spec)

# Reuse the same preprocessing workflow, swapping in K = 1, 3, and 5
knn1_wf <- Caravan_wf %>% add_model(knn_spec %>% set_args(neighbors = 1))
knn3_wf <- Caravan_wf %>% add_model(knn_spec %>% set_args(neighbors = 3))
knn5_wf <- Caravan_wf %>% add_model(knn_spec %>% set_args(neighbors = 5))

knn1_fit <- fit(knn1_wf, data = Caravan_train)
knn3_fit <- fit(knn3_wf, data = Caravan_train)
knn5_fit <- fit(knn5_wf, data = Caravan_train)

augment(knn1_fit, new_data = Caravan_test) %>% conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  874  50
##        Yes  67   9
augment(knn3_fit, new_data = Caravan_test) %>% conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  875  50
##        Yes  66   9
augment(knn5_fit, new_data = Caravan_test) %>% conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  874  50
##        Yes  67   9
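# ISLR's interest in this example is precision among predicted purchasers;
# "Yes" is the second level of Purchase, hence event_level = "second"
# (a hedged sketch for the K = 1 fit)
augment(knn1_fit, new_data = Caravan_test) %>%
  precision(truth = Purchase, estimate = .pred_class, event_level = "second")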
# Poisson Regression on Bikeshare
pois_spec <- poisson_reg() %>%
  set_mode("regression") %>%
  set_engine("glm")

pois_rec_spec <- recipe(bikers ~ mnth + hr + workingday + temp + weathersit, data = Bikeshare) %>%
  step_dummy(all_nominal_predictors())

pois_wf <- workflow() %>%
  add_recipe(pois_rec_spec) %>%
  add_model(pois_spec)

pois_fit <- pois_wf %>% fit(data = Bikeshare)
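
# Overall fit statistics (deviance, AIC) for the underlying glm; a sketch
# using extract_fit_engine() to pull the engine object out of the workflow
pois_fit %>%
  extract_fit_engine() %>%
  glance()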

# Scatter plot of actual vs. predicted hourly counts
augment(pois_fit, new_data = Bikeshare, type.predict = "response") %>%
  ggplot(aes(bikers, .pred)) +
  geom_point(alpha = 0.1) +
  geom_abline(slope = 1, color = "grey40", linewidth = 1) +
  labs(title = "Predicting the number of bikers per hour using Poisson Regression",
       x = "Actual", y = "Predicted")

# Coefficient plots for months
pois_fit_coef_mnths <- tidy(pois_fit) %>%
  filter(str_detect(term, "^mnth")) %>%
  mutate(term = str_replace(term, "mnth_", ""),
         term = fct_inorder(term))

pois_fit_coef_mnths %>%
  ggplot(aes(term, estimate)) +
  geom_line(group = 1) +
  geom_point(shape = 21, size = 3, stroke = 1.5,
             fill = "black", color = "white") +
  labs(title = "Coefficient value from Poisson Regression by Month",
       x = "Month", y = "Coefficient")

# Coefficient plots for hours
pois_fit_coef_hr <- tidy(pois_fit) %>%
  filter(str_detect(term, "^hr")) %>%
  mutate(term = str_replace(term, "hr_", ""),
         term = fct_inorder(term))

pois_fit_coef_hr %>%
  ggplot(aes(term, estimate)) +
  geom_line(group = 1) +
  geom_point(shape = 21, size = 3, stroke = 1.5,
             fill = "black", color = "white") +
  labs(title = "Coefficient value from Poisson Regression by Hour",
       x = "Hour", y = "Coefficient")

# Comparing multiple models
models <- list(
  "Logistic Regression" = lr_fit,
  "LDA" = lda_fit,
  "QDA" = qda_fit,
  "KNN" = knn_fit
)

# Collect test-set predictions from all four models; .id = "model" labels
# each row with the list name
preds <- imap_dfr(models, augment, new_data = Smarket_test, .id = "model")
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
# Metrics
multi_metric <- metric_set(accuracy, sensitivity, specificity)

preds %>%
  group_by(model) %>%
  multi_metric(truth = Direction, estimate = .pred_class)
## # A tibble: 12 × 4
##    model               .metric     .estimator .estimate
##    <chr>               <chr>       <chr>          <dbl>
##  1 KNN                 accuracy    binary         0.5  
##  2 LDA                 accuracy    binary         0.560
##  3 Logistic Regression accuracy    binary         0.560
##  4 QDA                 accuracy    binary         0.599
##  5 KNN                 sensitivity binary         0.387
##  6 LDA                 sensitivity binary         0.315
##  7 Logistic Regression sensitivity binary         0.315
##  8 QDA                 sensitivity binary         0.270
##  9 KNN                 specificity binary         0.589
## 10 LDA                 specificity binary         0.752
## 11 Logistic Regression specificity binary         0.752
## 12 QDA                 specificity binary         0.858
# ROC curves; "Down" is the first level of Direction, so .pred_Down is the
# event-class probability
preds %>%
  group_by(model) %>%
  roc_curve(Direction, .pred_Down) %>%
  autoplot()
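# Numeric AUCs to accompany the curves (roc_auc from yardstick); .pred_Down
# is used because "Down" is the first level of Direction
preds %>%
  group_by(model) %>%
  roc_auc(Direction, .pred_Down)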