# Load libraries
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ recipes      1.3.0
## ✔ dials        1.4.0     ✔ rsample      1.3.0
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.2     ✔ tidyr        1.3.1
## ✔ infer        1.0.8     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(ISLR)
library(kknn)
library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
## 
##     fixed
library(forcats)
library(ggplot2)
library(purrr)
library(discrim)
## 
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
## 
##     smoothness
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.4     ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(parsnip)
library(recipes)
library(workflows)
# 4.1 Logistic Regression
install.packages("ISLR")   # If not already installed
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR)
library(dplyr)

Smarket_train <- Smarket %>% filter(Year < 2005)
# Split the data
Smarket_train <- Smarket %>% filter(Year < 2005)
Smarket_test <- Smarket %>% filter(Year == 2005)

# Specify logistic regression model
lr_spec <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")
# Fit the model
lr_fit <- lr_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

# Model summary
summary(lr_fit$fit)
## 
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2, family = stats::binomial, 
##     data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.03222    0.06338   0.508    0.611
## Lag1        -0.05562    0.05171  -1.076    0.282
## Lag2        -0.04449    0.05166  -0.861    0.389
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.4  on 995  degrees of freedom
## AIC: 1387.4
## 
## Number of Fisher Scoring iterations: 3
# Predictions
augment(lr_fit, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
# Accuracy
augment(lr_fit, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560
# 4.2 Logistic Regression with more variables

# Logistic regression with all lags
lr_spec2 <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

lr_fit2 <- lr_spec2 %>%
  fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket_train)

# Predictions
augment(lr_fit2, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down Up
##       Down   77 97
##       Up     34 44
# Accuracy
augment(lr_fit2, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.480
# 4.3 Logistic Regression with Lag1 and Lag2 only

lr_spec3 <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

lr_fit3 <- lr_spec3 %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

# Predictions
augment(lr_fit3, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
# Accuracy
augment(lr_fit3, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560
# Install the discrim package if not already installed
install.packages("discrim")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
# Load the discrim package (which provides support for LDA/QDA)
library(discrim)
# 4.4 Linear Discriminant Analysis

# Specify LDA model
lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

# Fit the model
lda_fit <- lda_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

# Predictions
augment(lda_fit, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   35  35
##       Up     76 106
# Accuracy
augment(lda_fit, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.560
# 4.5 Quadratic Discriminant Analysis

# Specify QDA model
qda_spec <- discrim_quad() %>%
  set_mode("classification") %>%
  set_engine("MASS")

# Fit the model
qda_fit <- qda_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

# Predictions
augment(qda_fit, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
##           Truth
## Prediction Down  Up
##       Down   30  20
##       Up     81 121
# Accuracy
augment(qda_fit, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.599
# 4.6 K-Nearest Neighbors

# Specify KNN model with 3 neighbors
knn_spec <- nearest_neighbor(neighbors = 3) %>%
  set_mode("classification") %>%
  set_engine("kknn")

# Fit the model
knn_fit <- knn_spec %>%
  fit(Direction ~ Lag1 + Lag2, data = Smarket_train)

# Predictions
augment(knn_fit, new_data = Smarket_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
##           Truth
## Prediction Down Up
##       Down   43 58
##       Up     68 83
# Accuracy
augment(knn_fit, new_data = Smarket_test) %>%
  accuracy(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary           0.5
# Caravan insurance data - preparation
install.packages("ISLR")  # Run only once if not installed
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR)
data(Caravan)  # Optional, forces loading
head(Caravan)
##   MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE
## 1      33        1       3        2        8      0      5      1      3      7
## 2      37        1       2        2        8      1      4      1      4      6
## 3      37        1       2        2        8      0      4      2      4      3
## 4       9        1       3        3        3      2      3      2      4      5
## 5      40        1       4        2       10      1      4      1      4      7
## 6      23        1       2        1        5      0      5      0      5      0
##   MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG
## 1      0      2        1        2        6        1        2        7        1
## 2      2      2        0        4        5        0        5        4        0
## 3      2      4        4        4        2        0        5        4        0
## 4      2      2        2        3        4        3        4        2        4
## 5      1      2        2        4        4        5        4        0        0
## 6      6      3        3        5        2        0        5        4        2
##   MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD
## 1        0        1        2        5        2    1     1     2    6    1
## 2        0        0        5        0        4    0     2     3    5    0
## 3        0        0        7        0        2    0     5     0    4    0
## 4        0        0        3        1        2    3     2     1    4    0
## 5        5        4        0        0        0    9     0     0    0    0
## 6        0        0        4        2        2    2     2     2    4    2
##   MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575
## 1      1      8     8     0     1       8      1       0        4        5
## 2      2      7     7     1     2       6      3       2        0        5
## 3      7      2     7     0     2       9      0       4        5        0
## 4      5      4     9     0     0       7      2       1        5        3
## 5      4      5     6     2     1       5      4       0        0        9
## 6      9      0     5     3     3       9      0       5        2        3
##   MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT
## 1        0        0       4        3       0       0       0        6       0
## 2        2        0       5        4       2       0       0        0       0
## 3        0        0       3        4       2       0       0        6       0
## 4        0        0       4        4       0       0       0        6       0
## 5        0        0       6        3       0       0       0        0       0
## 6        0        0       3        3       0       0       0        6       0
##   PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG
## 1       0       0        0        0      0     0      0        0       0
## 2       0       0        0        0      0     0      0        0       0
## 3       0       0        0        0      0     0      0        0       0
## 4       0       0        0        0      0     0      0        0       0
## 5       0       0        0        0      0     0      0        0       0
## 6       0       0        0        0      0     0      0        0       0
##   PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR
## 1       0      5       0        0      0       0        0       0       0
## 2       0      2       0        0      0       0        0       2       0
## 3       0      2       0        0      0       0        0       1       0
## 4       0      2       0        0      0       0        0       0       0
## 5       0      6       0        0      0       0        0       0       0
## 6       0      0       0        0      0       0        0       0       0
##   AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM
## 1       0        1       0       0       0        0        0      0     0
## 2       0        0       0       0       0        0        0      0     0
## 3       0        1       0       0       0        0        0      0     0
## 4       0        1       0       0       0        0        0      0     0
## 5       0        0       0       0       0        0        0      0     0
## 6       0        1       0       0       0        0        0      0     0
##   ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED
## 1      0        0       0       0      1       0        0      0       0
## 2      0        0       0       0      1       0        0      0       0
## 3      0        0       0       0      1       0        0      0       0
## 4      0        0       0       0      1       0        0      0       0
## 5      0        0       0       0      1       0        0      0       0
## 6      0        0       0       0      0       0        0      0       0
##   ABYSTAND Purchase
## 1        0       No
## 2        0       No
## 3        0       No
## 4        0       No
## 5        0       No
## 6        0       No
Caravan_test <- Caravan[seq_len(1000), ]
Caravan_train <- Caravan[-seq_len(1000), ]

# Normalize predictors
install.packages("tidymodels")  # Only if not installed
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidymodels)
rec_spec <- recipe(Purchase ~ ., data = Caravan_train) %>%
  step_normalize(all_numeric_predictors())

# Create workflow
Caravan_wf <- workflow() %>%
  add_recipe(rec_spec)

# Base KNN spec
knn_spec <- nearest_neighbor() %>%
  set_mode("classification") %>%
  set_engine("kknn")
# Workflow for K = 1, 3, 5
knn1_wf <- Caravan_wf %>%
  add_model(knn_spec %>% set_args(neighbors = 1))

knn3_wf <- Caravan_wf %>%
  add_model(knn_spec %>% set_args(neighbors = 3))

knn5_wf <- Caravan_wf %>%
  add_model(knn_spec %>% set_args(neighbors = 5))
# Fit models
knn1_fit <- fit(knn1_wf, data = Caravan_train)
knn3_fit <- fit(knn3_wf, data = Caravan_train)
knn5_fit <- fit(knn5_wf, data = Caravan_train)

library(tidymodels)

augment(knn1_fit, new_data = Caravan_test) %>%
  conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  874  50
##        Yes  67   9
augment(knn3_fit, new_data = Caravan_test) %>%
  conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  875  50
##        Yes  66   9
augment(knn5_fit, new_data = Caravan_test) %>%
  conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
##           Truth
## Prediction  No Yes
##        No  874  50
##        Yes  67   9
# 4.7 Poisson Regression

# Bikeshare data (you may need to load it or simulate if missing)
# Load the necessary libraries
library(tidyverse)
library(parsnip)
library(recipes)
library(workflows)
library(poissonreg)
library(broom)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
# Model spec
pois_spec <- poisson_reg() %>%
  set_mode("regression") %>%
  set_engine("glm")

# Recipe with dummy variables
pois_rec_spec <- recipe(
  bikers ~ mnth + hr + workingday + temp + weathersit,
  data = Bikeshare
) %>%
  step_dummy(all_nominal_predictors())
# Workflow
pois_wf <- workflow() %>%
  add_recipe(pois_rec_spec) %>%
  add_model(pois_spec)

# Fit model
pois_fit <- pois_wf %>% fit(data = Bikeshare)
# Scatter plot of predictions
augment(pois_fit, new_data = Bikeshare, type.predict = "response") %>%
  ggplot(aes(bikers, .pred)) +
  geom_point(alpha = 0.1) +
  geom_abline(slope = 1, color = "grey40", linewidth = 1) +
  labs(title = "Predicting the number of bikers per hour using Poisson Regression",
       x = "Actual", y = "Predicted")

# Coefficient plots for months
pois_fit_coef_mnths <- tidy(pois_fit) %>%
  filter(str_detect(term, "^mnth")) %>%
  mutate(
    term = str_replace(term, "mnth_", ""),
    term = fct_inorder(term)
  )

pois_fit_coef_mnths %>%
  ggplot(aes(term, estimate)) +
  geom_line(group = 1) +
  geom_point(shape = 21, size = 3, stroke = 1.5,
             fill = "black", color = "white") +
  labs(title = "Coefficient value from Poisson Regression",
       x = "Month", y = "Coefficient")

# Coefficient plots for hours
pois_fit_coef_hr <- tidy(pois_fit) %>%
  filter(str_detect(term, "^hr")) %>%
  mutate(
    term = str_replace(term, "hr_", ""),
    term = fct_inorder(term)
  )
pois_fit_coef_hr %>%
  ggplot(aes(term, estimate)) +
  geom_line(group = 1) +
  geom_point(shape = 21, size = 3, stroke = 1.5,
             fill = "black", color = "white") +
  labs(title = "Coefficient value from Poisson Regression",
       x = "Hour", y = "Coefficient")

library(tidymodels)
## # A tibble: 12 × 4
##    model               .metric     .estimator .estimate
##    <chr>               <chr>       <chr>          <dbl>
##  1 KNN                 accuracy    binary         0.5  
##  2 LDA                 accuracy    binary         0.560
##  3 QDA                 accuracy    binary         0.599
##  4 logistic regression accuracy    binary         0.560
##  5 KNN                 sensitivity binary         0.387
##  6 LDA                 sensitivity binary         0.315
##  7 QDA                 sensitivity binary         0.270
##  8 logistic regression sensitivity binary         0.315
##  9 KNN                 specificity binary         0.589
## 10 LDA                 specificity binary         0.752
## 11 QDA                 specificity binary         0.858
## 12 logistic regression specificity binary         0.752
# 1. Load necessary libraries
library(tidymodels)
library(ISLR)
library(ggplot2)

# 2. Split the Smarket dataset into training (Year < 2005) and test (Year == 2005)
data("Smarket")
train_data <- Smarket %>% filter(Year < 2005)
test_data  <- Smarket %>% filter(Year == 2005)

# 3. Define the model specifications
knn_spec <- nearest_neighbor(neighbors = 5) %>% 
  set_mode("classification") %>% 
  set_engine("kknn")

lda_spec <- discrim_linear() %>% 
  set_mode("classification") %>% 
  set_engine("MASS")

qda_spec <- discrim_quad() %>% 
  set_mode("classification") %>% 
  set_engine("MASS")

log_spec <- logistic_reg() %>% 
  set_mode("classification") %>% 
  set_engine("glm")

# 4. Fit each model (using all predictors in Smarket except Year)
knn_fit      <- fit(knn_spec,    Direction ~ . - Year, data = train_data)
lda_fit      <- fit(lda_spec,    Direction ~ . - Year, data = train_data)
qda_fit      <- fit(qda_spec,    Direction ~ . - Year, data = train_data)
logistic_fit <- fit(log_spec,    Direction ~ . - Year, data = train_data)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# 5. Generate predicted probabilities on the test set for each model
preds_knn_prob <- predict(knn_fit, new_data = test_data, type = "prob") %>%
  bind_cols(test_data %>% select(Direction)) %>%
  mutate(model = "KNN")
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
preds_lda_prob <- predict(lda_fit, new_data = test_data, type = "prob") %>%
  bind_cols(test_data %>% select(Direction)) %>%
  mutate(model = "LDA")

preds_qda_prob <- predict(qda_fit, new_data = test_data, type = "prob") %>%
  bind_cols(test_data %>% select(Direction)) %>%
  mutate(model = "QDA")

preds_log_prob <- predict(logistic_fit, new_data = test_data, type = "prob") %>%
  bind_cols(test_data %>% select(Direction)) %>%
  mutate(model = "Logistic Regression")

# 6. Combine all probability predictions into one data frame
preds_prob <- bind_rows(preds_knn_prob, preds_lda_prob, preds_qda_prob, preds_log_prob)

# 7. Generate the ROC curve
preds_prob %>%
  group_by(model) %>%
  roc_curve(Direction, .pred_Down) %>%
  autoplot()