# Load libraries
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ recipes 1.3.0
## ✔ dials 1.4.0 ✔ rsample 1.3.0
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.8 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(ISLR)
library(kknn)
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
##
## fixed
library(forcats)
library(ggplot2)
library(purrr)
library(discrim)
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.4 ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(parsnip)
library(recipes)
library(workflows)
# 4.1 Logistic Regression
install.packages("ISLR") # If not already installed
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR)
library(dplyr)
Smarket_train <- Smarket %>% filter(Year < 2005)
# Split the data
Smarket_train <- Smarket %>% filter(Year < 2005)
Smarket_test <- Smarket %>% filter(Year == 2005)
# Specify logistic regression model
lr_spec <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
# Fit the model
lr_fit <- lr_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
# Model summary
summary(lr_fit$fit)
##
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2, family = stats::binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03222 0.06338 0.508 0.611
## Lag1 -0.05562 0.05171 -1.076 0.282
## Lag2 -0.04449 0.05166 -0.861 0.389
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1383.3 on 997 degrees of freedom
## Residual deviance: 1381.4 on 995 degrees of freedom
## AIC: 1387.4
##
## Number of Fisher Scoring iterations: 3
# Predictions
augment(lr_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 35 35
## Up 76 106
# Accuracy
augment(lr_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.560
# 4.2 Logistic Regression with more variables
# Logistic regression with all lags
lr_spec2 <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
lr_fit2 <- lr_spec2 %>%
fit(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket_train)
# Predictions
augment(lr_fit2, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 77 97
## Up 34 44
# Accuracy
augment(lr_fit2, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.480
# 4.3 Logistic Regression with Lag1 and Lag2 only
lr_spec3 <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
lr_fit3 <- lr_spec3 %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
# Predictions
augment(lr_fit3, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 35 35
## Up 76 106
# Accuracy
augment(lr_fit3, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.560
# Install the discrim package if not already installed
install.packages("discrim")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
# Load the discrim package (which provides support for LDA/QDA)
library(discrim)
# 4.4 Linear Discriminant Analysis
# Specify LDA model
lda_spec <- discrim_linear() %>%
set_mode("classification") %>%
set_engine("MASS")
# Fit the model
lda_fit <- lda_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
# Predictions
augment(lda_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 35 35
## Up 76 106
# Accuracy
augment(lda_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.560
# 4.5 Quadratic Discriminant Analysis
# Specify QDA model
qda_spec <- discrim_quad() %>%
set_mode("classification") %>%
set_engine("MASS")
# Fit the model
qda_fit <- qda_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
# Predictions
augment(qda_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 30 20
## Up 81 121
# Accuracy
augment(qda_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.599
# 4.6 K-Nearest Neighbors
# Specify KNN model with 3 neighbors
knn_spec <- nearest_neighbor(neighbors = 3) %>%
set_mode("classification") %>%
set_engine("kknn")
# Fit the model
knn_fit <- knn_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
# Predictions
augment(knn_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Truth
## Prediction Down Up
## Down 43 58
## Up 68 83
# Accuracy
augment(knn_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.5
# Caravan insurance data - preparation
install.packages("ISLR") # Run only once if not installed
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR)
data(Caravan) # Optional, forces loading
head(Caravan)
## MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE
## 1 33 1 3 2 8 0 5 1 3 7
## 2 37 1 2 2 8 1 4 1 4 6
## 3 37 1 2 2 8 0 4 2 4 3
## 4 9 1 3 3 3 2 3 2 4 5
## 5 40 1 4 2 10 1 4 1 4 7
## 6 23 1 2 1 5 0 5 0 5 0
## MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG
## 1 0 2 1 2 6 1 2 7 1
## 2 2 2 0 4 5 0 5 4 0
## 3 2 4 4 4 2 0 5 4 0
## 4 2 2 2 3 4 3 4 2 4
## 5 1 2 2 4 4 5 4 0 0
## 6 6 3 3 5 2 0 5 4 2
## MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD
## 1 0 1 2 5 2 1 1 2 6 1
## 2 0 0 5 0 4 0 2 3 5 0
## 3 0 0 7 0 2 0 5 0 4 0
## 4 0 0 3 1 2 3 2 1 4 0
## 5 5 4 0 0 0 9 0 0 0 0
## 6 0 0 4 2 2 2 2 2 4 2
## MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575
## 1 1 8 8 0 1 8 1 0 4 5
## 2 2 7 7 1 2 6 3 2 0 5
## 3 7 2 7 0 2 9 0 4 5 0
## 4 5 4 9 0 0 7 2 1 5 3
## 5 4 5 6 2 1 5 4 0 0 9
## 6 9 0 5 3 3 9 0 5 2 3
## MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT
## 1 0 0 4 3 0 0 0 6 0
## 2 2 0 5 4 2 0 0 0 0
## 3 0 0 3 4 2 0 0 6 0
## 4 0 0 4 4 0 0 0 6 0
## 5 0 0 6 3 0 0 0 0 0
## 6 0 0 3 3 0 0 0 6 0
## PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR
## 1 0 5 0 0 0 0 0 0 0
## 2 0 2 0 0 0 0 0 2 0
## 3 0 2 0 0 0 0 0 1 0
## 4 0 2 0 0 0 0 0 0 0
## 5 0 6 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM
## 1 0 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 1 0 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 1 0 0 0 0 0 0 0
## ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED
## 1 0 0 0 0 1 0 0 0 0
## 2 0 0 0 0 1 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0
## 4 0 0 0 0 1 0 0 0 0
## 5 0 0 0 0 1 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## ABYSTAND Purchase
## 1 0 No
## 2 0 No
## 3 0 No
## 4 0 No
## 5 0 No
## 6 0 No
Caravan_test <- Caravan[seq_len(1000), ]
Caravan_train <- Caravan[-seq_len(1000), ]
# Normalize predictors
install.packages("tidymodels") # Only if not installed
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidymodels)
rec_spec <- recipe(Purchase ~ ., data = Caravan_train) %>%
step_normalize(all_numeric_predictors())
# Create workflow
Caravan_wf <- workflow() %>%
add_recipe(rec_spec)
# Base KNN spec
knn_spec <- nearest_neighbor() %>%
set_mode("classification") %>%
set_engine("kknn")
# Workflow for K = 1, 3, 5
knn1_wf <- Caravan_wf %>%
add_model(knn_spec %>% set_args(neighbors = 1))
knn3_wf <- Caravan_wf %>%
add_model(knn_spec %>% set_args(neighbors = 3))
knn5_wf <- Caravan_wf %>%
add_model(knn_spec %>% set_args(neighbors = 5))
# Fit models
knn1_fit <- fit(knn1_wf, data = Caravan_train)
knn3_fit <- fit(knn3_wf, data = Caravan_train)
knn5_fit <- fit(knn5_wf, data = Caravan_train)
library(tidymodels)
augment(knn1_fit, new_data = Caravan_test) %>%
conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Truth
## Prediction No Yes
## No 874 50
## Yes 67 9
augment(knn3_fit, new_data = Caravan_test) %>%
conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Truth
## Prediction No Yes
## No 875 50
## Yes 66 9
augment(knn5_fit, new_data = Caravan_test) %>%
conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Truth
## Prediction No Yes
## No 874 50
## Yes 67 9
# 4.7 Poisson Regression
# Bikeshare data (you may need to load it or simulate if missing)
# Load the necessary libraries
library(tidyverse)
library(parsnip)
library(recipes)
library(workflows)
library(poissonreg)
library(broom)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
# Model spec
pois_spec <- poisson_reg() %>%
set_mode("regression") %>%
set_engine("glm")
# Recipe with dummy variables
pois_rec_spec <- recipe(
bikers ~ mnth + hr + workingday + temp + weathersit,
data = Bikeshare
) %>%
step_dummy(all_nominal_predictors())
# Workflow
pois_wf <- workflow() %>%
add_recipe(pois_rec_spec) %>%
add_model(pois_spec)
# Fit model
pois_fit <- pois_wf %>% fit(data = Bikeshare)
# Scatter plot of predictions
augment(pois_fit, new_data = Bikeshare, type.predict = "response") %>%
ggplot(aes(bikers, .pred)) +
geom_point(alpha = 0.1) +
geom_abline(slope = 1, color = "grey40", linewidth = 1) +
labs(title = "Predicting the number of bikers per hour using Poisson Regression",
x = "Actual", y = "Predicted")

# Coefficient plots for months
pois_fit_coef_mnths <- tidy(pois_fit) %>%
filter(str_detect(term, "^mnth")) %>%
mutate(
term = str_replace(term, "mnth_", ""),
term = fct_inorder(term)
)
pois_fit_coef_mnths %>%
ggplot(aes(term, estimate)) +
geom_line(group = 1) +
geom_point(shape = 21, size = 3, stroke = 1.5,
fill = "black", color = "white") +
labs(title = "Coefficient value from Poisson Regression",
x = "Month", y = "Coefficient")

# Coefficient plots for hours
pois_fit_coef_hr <- tidy(pois_fit) %>%
filter(str_detect(term, "^hr")) %>%
mutate(
term = str_replace(term, "hr_", ""),
term = fct_inorder(term)
)
pois_fit_coef_hr %>%
ggplot(aes(term, estimate)) +
geom_line(group = 1) +
geom_point(shape = 21, size = 3, stroke = 1.5,
fill = "black", color = "white") +
labs(title = "Coefficient value from Poisson Regression",
x = "Hour", y = "Coefficient")

library(tidymodels)
## # A tibble: 12 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 KNN accuracy binary 0.5
## 2 LDA accuracy binary 0.560
## 3 QDA accuracy binary 0.599
## 4 logistic regression accuracy binary 0.560
## 5 KNN sensitivity binary 0.387
## 6 LDA sensitivity binary 0.315
## 7 QDA sensitivity binary 0.270
## 8 logistic regression sensitivity binary 0.315
## 9 KNN specificity binary 0.589
## 10 LDA specificity binary 0.752
## 11 QDA specificity binary 0.858
## 12 logistic regression specificity binary 0.752
# 1. Load necessary libraries
library(tidymodels)
library(ISLR)
library(ggplot2)
# 2. Split the Smarket dataset into training (Year < 2005) and test (Year == 2005)
data("Smarket")
train_data <- Smarket %>% filter(Year < 2005)
test_data <- Smarket %>% filter(Year == 2005)
# 3. Define the model specifications
knn_spec <- nearest_neighbor(neighbors = 5) %>%
set_mode("classification") %>%
set_engine("kknn")
lda_spec <- discrim_linear() %>%
set_mode("classification") %>%
set_engine("MASS")
qda_spec <- discrim_quad() %>%
set_mode("classification") %>%
set_engine("MASS")
log_spec <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
# 4. Fit each model (using all predictors in Smarket except Year)
knn_fit <- fit(knn_spec, Direction ~ . - Year, data = train_data)
lda_fit <- fit(lda_spec, Direction ~ . - Year, data = train_data)
qda_fit <- fit(qda_spec, Direction ~ . - Year, data = train_data)
logistic_fit <- fit(log_spec, Direction ~ . - Year, data = train_data)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# 5. Generate predicted probabilities on the test set for each model
preds_knn_prob <- predict(knn_fit, new_data = test_data, type = "prob") %>%
bind_cols(test_data %>% select(Direction)) %>%
mutate(model = "KNN")
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
preds_lda_prob <- predict(lda_fit, new_data = test_data, type = "prob") %>%
bind_cols(test_data %>% select(Direction)) %>%
mutate(model = "LDA")
preds_qda_prob <- predict(qda_fit, new_data = test_data, type = "prob") %>%
bind_cols(test_data %>% select(Direction)) %>%
mutate(model = "QDA")
preds_log_prob <- predict(logistic_fit, new_data = test_data, type = "prob") %>%
bind_cols(test_data %>% select(Direction)) %>%
mutate(model = "Logistic Regression")
# 6. Combine all probability predictions into one data frame
preds_prob <- bind_rows(preds_knn_prob, preds_lda_prob, preds_qda_prob, preds_log_prob)
# 7. Generate the ROC curve
preds_prob %>%
group_by(model) %>%
roc_curve(Direction, .pred_Down) %>%
autoplot()
