library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ recipes      1.3.0
## ✔ dials        1.4.0     ✔ rsample      1.3.0
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.2     ✔ tidyr        1.3.1
## ✔ infer        1.0.8     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(ISLR) # For the Smarket data set
library(ISLR2) # For the Bikeshare data set
## 
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
library(discrim)
## 
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
## 
##     smoothness
library(poissonreg)
library(corrr)

cor_Smarket <- Smarket %>%
  dplyr::select(-Direction) %>%
  correlate()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
rplot(cor_Smarket, colours = c("indianred2", "black", "skyblue1"))

library(paletteer)
cor_Smarket %>%
  stretch() %>%
  ggplot(aes(x, y, fill = r)) +
  geom_tile() +
  geom_text(aes(label = as.character(fashion(r)))) +
  scale_fill_paletteer_c("scico::roma", limits = c(-1, 1), direction = -1)

ggplot(Smarket, aes(Year, Volume)) +
  geom_jitter(height = 0)

lr_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")
lr_fit <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket
    )

lr_fit
## parsnip model object
## 
## 
## Call:  stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
##     Lag5 + Volume, family = stats::binomial, data = data)
## 
## Coefficients:
## (Intercept)         Lag1         Lag2         Lag3         Lag4         Lag5  
##   -0.126000    -0.073074    -0.042301     0.011085     0.009359     0.010313  
##      Volume  
##    0.135441  
## 
## Degrees of Freedom: 1249 Total (i.e. Null);  1243 Residual
## Null Deviance:       1731 
## Residual Deviance: 1728  AIC: 1742
lr_fit %>%
  pluck("fit") %>%
  summary()
## 
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
##     Lag5 + Volume, family = stats::binomial, data = data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
tidy(lr_fit)
## # A tibble: 7 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.126      0.241     -0.523   0.601
## 2 Lag1        -0.0731     0.0502    -1.46    0.145
## 3 Lag2        -0.0423     0.0501    -0.845   0.398
## 4 Lag3         0.0111     0.0499     0.222   0.824
## 5 Lag4         0.00936    0.0500     0.187   0.851
## 6 Lag5         0.0103     0.0495     0.208   0.835
## 7 Volume       0.135      0.158      0.855   0.392
predict(lr_fit, new_data = Smarket)
## # A tibble: 1,250 × 1
##    .pred_class
##    <fct>      
##  1 Up         
##  2 Down       
##  3 Down       
##  4 Up         
##  5 Up         
##  6 Up         
##  7 Down       
##  8 Up         
##  9 Up         
## 10 Down       
## # ℹ 1,240 more rows
predict(lr_fit, new_data = Smarket, type = "prob")
## # A tibble: 1,250 × 2
##    .pred_Down .pred_Up
##         <dbl>    <dbl>
##  1      0.493    0.507
##  2      0.519    0.481
##  3      0.519    0.481
##  4      0.485    0.515
##  5      0.489    0.511
##  6      0.493    0.507
##  7      0.507    0.493
##  8      0.491    0.509
##  9      0.482    0.518
## 10      0.511    0.489
## # ℹ 1,240 more rows
augment(lr_fit, new_data = Smarket) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down  145 141
##       Up    457 507
augment(lr_fit, new_data = Smarket) %>%
  conf_mat(truth = Direction, estimate = .pred_class) %>%
  autoplot(type = "heatmap")

augment(lr_fit, new_data = Smarket) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.522
Smarket_train <- Smarket %>%
  filter(Year != 2005)

Smarket_test <- Smarket %>%
  filter(Year == 2005)
lr_fit2 <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket_train
    )
augment(lr_fit2, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class) 
##           Truth
## Prediction Down Up
##       Down   77 97
##       Up     34 44
augment(lr_fit2, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class) 
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.480
lr_fit3 <- lr_spec %>%
  fit(
    Direction ~ Lag1 + Lag2,
    data = Smarket_train
    )

augment(lr_fit3, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class) 
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
augment(lr_fit3, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class) 
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560
Smarket_new <- tibble(
  Lag1 = c(1.2, 1.5), 
  Lag2 = c(1.1, -0.8)
)
predict(
  lr_fit3,
  new_data = Smarket_new, 
  type = "prob"
)
## # A tibble: 2 × 2
##   .pred_Down .pred_Up
##        <dbl>    <dbl>
## 1      0.521    0.479
## 2      0.504    0.496
lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

lda_fit <- lda_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

lda_fit
## parsnip model object
## 
## Call:
## lda(Direction ~ Lag1 + Lag2, data = data)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
predict(lda_fit, new_data = Smarket_test)
## # A tibble: 252 × 1
##    .pred_class
##    <fct>      
##  1 Up         
##  2 Up         
##  3 Up         
##  4 Up         
##  5 Up         
##  6 Up         
##  7 Up         
##  8 Up         
##  9 Up         
## 10 Up         
## # ℹ 242 more rows
predict(lda_fit, new_data = Smarket_test, type = "prob")
## # A tibble: 252 × 2
##    .pred_Down .pred_Up
##         <dbl>    <dbl>
##  1      0.490    0.510
##  2      0.479    0.521
##  3      0.467    0.533
##  4      0.474    0.526
##  5      0.493    0.507
##  6      0.494    0.506
##  7      0.495    0.505
##  8      0.487    0.513
##  9      0.491    0.509
## 10      0.484    0.516
## # ℹ 242 more rows
augment(lda_fit, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class) 
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
augment(lda_fit, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class) 
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560
qda_spec <- discrim_quad() %>%
  set_mode("classification") %>%
  set_engine("MASS")

qda_fit <- qda_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
augment(qda_fit, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class) 
##           Truth
## Prediction Down  Up
##       Down   30  20
##       Up     81 121
augment(qda_fit, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class) 
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.599
library(dplyr)
library(klaR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
## The following object is masked from 'package:dplyr':
## 
##     select
library(parsnip)
nb_spec <- naive_Bayes() %>%
  set_mode("classification") %>%
  set_engine("klaR") %>%
  set_args(usekernel = FALSE)

nb_fit <- nb_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

nb_spec <- naive_Bayes() %>% 
  set_mode("classification") %>% 
  set_engine("klaR") %>% 
  set_args(usekernel = FALSE)  

nb_fit <- nb_spec %>% 
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
library(yardstick)
augment(nb_fit, new_data = Smarket_test) %>% 
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   28  20
##       Up     83 121
augment(nb_fit, new_data = Smarket_test) %>% 
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.591
library(ggplot2)
library(ISLR2)  # loads the Smarket dataset
ggplot(Smarket, aes(Lag1, Lag2)) +
  geom_point(alpha = 0.1, size = 2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "No apparent correlation between Lag1 and Lag2")
## `geom_smooth()` using formula = 'y ~ x'

knn_spec <- nearest_neighbor(neighbors = 3) %>%
  set_mode("classification") %>%
  set_engine("kknn")

knn_fit <- knn_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

knn_fit
## parsnip model object
## 
## 
## Call:
## kknn::train.kknn(formula = Direction ~ Lag1 + Lag2, data = data,     ks = min_rows(3, data, 5))
## 
## Type of response variable: nominal
## Minimal misclassification: 0.492986
## Best kernel: optimal
## Best k: 3
augment(knn_fit, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class) 
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
##           Truth
## Prediction Down Up
##       Down   43 58
##       Up     68 83
augment(knn_fit, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class) 
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary           0.5
Caravan_test <- Caravan[seq_len(1000), ]
Caravan_train <- Caravan[-seq_len(1000), ]
library(recipes)
rec_spec <- recipe(Purchase ~ ., data = Caravan_train) %>%
  step_normalize(all_numeric_predictors())
Caravan_wf <- workflow() %>%
  add_recipe(rec_spec)
knn_spec <- nearest_neighbor() %>%
  set_mode("classification") %>%
  set_engine("kknn")
knn1_wf <- Caravan_wf %>%
  add_model(knn_spec %>% set_args(neighbors = 1))

knn3_wf <- Caravan_wf %>%
  add_model(knn_spec %>% set_args(neighbors = 3))

knn5_wf <- Caravan_wf %>%
  add_model(knn_spec %>% set_args(neighbors = 5))
knn1_fit <- fit(knn1_wf, data = Caravan_train)
knn3_fit <- fit(knn3_wf, data = Caravan_train)
knn5_fit <- fit(knn5_wf, data = Caravan_train)
augment(knn1_fit, new_data = Caravan_test) %>%
  conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  874  50
##        Yes  67   9
augment(knn3_fit, new_data = Caravan_test) %>%
  conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  875  50
##        Yes  66   9
augment(knn5_fit, new_data = Caravan_test) %>%
  conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  874  50
##        Yes  67   9
pois_spec <- poisson_reg() %>% 
  set_mode("regression") %>% 
  set_engine("glm")
pois_rec_spec <- recipe(
  bikers ~ mnth + hr + workingday + temp + weathersit,
  data = Bikeshare
) %>% 
  step_dummy(all_nominal_predictors())
pois_wf <- workflow() %>% 
  add_recipe(pois_rec_spec) %>% 
  add_model(pois_spec)
pois_fit <- pois_wf %>% fit(data = Bikeshare)

augment(pois_fit, new_data = Bikeshare, type.predict = "response") %>% 
  ggplot(aes(bikers, .pred)) +
  geom_point(alpha = 0.1) +
  geom_abline(slope = 1, size = 1, color = "grey40") +
  labs(title = "Predicting the number of bikers per hour using Poission Regression",
       x = "Actual", y = "Predicted")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

pois_fit_coef_mnths <- 
  tidy(pois_fit) %>% 
  filter(grepl("^mnth", term)) %>% 
  mutate(
    term = stringr::str_replace(term, "mnth_", ""),
    term = forcats::fct_inorder(term)
  ) 

pois_fit_coef_mnths %>% 
  ggplot(aes(term, estimate)) +
  geom_line(group = 1) +
  geom_point(shape = 21, size = 3, stroke = 1.5, 
             fill = "black", color = "white") +
  labs(title = "Coefficient value from Poission Regression",
       x = "Month", y = "Coefficient")

pois_fit_coef_hr <- 
  tidy(pois_fit) %>% 
  filter(grepl("^hr", term)) %>% 
  mutate(
    term = stringr::str_replace(term, "hr_X", ""),
    term = forcats::fct_inorder(term)
  )

pois_fit_coef_hr %>% 
  ggplot(aes(term, estimate)) +
  geom_line(group = 1) +
  geom_point(shape = 21, size = 3, stroke = 1.5, 
             fill = "black", color = "white") +
  labs(title = "Coefficient value from Poission Regression",
       x = "hours", y = "Coefficient")

models <- list("logistic regression" = lr_fit3,
               "LDA" = lda_fit,
               "QDA" = qda_fit,
               "KNN" = knn_fit)
# Make sure to load all necessary packages explicitly
library(dplyr)      # For select() and pipe operators
library(purrr)      # For imap_dfr()
library(tidymodels) # If using tidymodels for augment()
# Or library(broom) if using broom directly for augment()

# Now try with explicit namespace references
preds <- purrr::imap_dfr(models, augment, 
                  new_data = Smarket_test, .id = "model")
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
# Use explicit dplyr namespace for select
preds %>%
  dplyr::select(model, Direction, .pred_class, .pred_Down, .pred_Up)
## # A tibble: 1,008 × 5
##    model               Direction .pred_class .pred_Down .pred_Up
##    <chr>               <fct>     <fct>            <dbl>    <dbl>
##  1 logistic regression Down      Up               0.490    0.510
##  2 logistic regression Down      Up               0.479    0.521
##  3 logistic regression Down      Up               0.467    0.533
##  4 logistic regression Up        Up               0.474    0.526
##  5 logistic regression Down      Up               0.493    0.507
##  6 logistic regression Up        Up               0.494    0.506
##  7 logistic regression Down      Up               0.495    0.505
##  8 logistic regression Up        Up               0.487    0.513
##  9 logistic regression Down      Up               0.491    0.509
## 10 logistic regression Up        Up               0.484    0.516
## # ℹ 998 more rows
multi_metric <- metric_set(accuracy, sensitivity, specificity)
preds %>%
  group_by(model) %>%
  multi_metric(truth = Direction, estimate = .pred_class)
## # A tibble: 12 × 4
##    model               .metric     .estimator .estimate
##    <chr>               <chr>       <chr>          <dbl>
##  1 KNN                 accuracy    binary         0.5  
##  2 LDA                 accuracy    binary         0.560
##  3 QDA                 accuracy    binary         0.599
##  4 logistic regression accuracy    binary         0.560
##  5 KNN                 sensitivity binary         0.387
##  6 LDA                 sensitivity binary         0.315
##  7 QDA                 sensitivity binary         0.270
##  8 logistic regression sensitivity binary         0.315
##  9 KNN                 specificity binary         0.589
## 10 LDA                 specificity binary         0.752
## 11 QDA                 specificity binary         0.858
## 12 logistic regression specificity binary         0.752
preds %>%
  group_by(model) %>%
  roc_curve(Direction, .pred_Down) %>%
  autoplot()