library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ recipes 1.3.0
## ✔ dials 1.4.0 ✔ rsample 1.3.0
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.8 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(ISLR) # For the Smarket data set
library(ISLR2) # For the Bikeshare data set
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(discrim)
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
library(poissonreg)
library(corrr)
cor_Smarket <- Smarket %>%
dplyr::select(-Direction) %>%
correlate()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
rplot(cor_Smarket, colours = c("indianred2", "black", "skyblue1"))

library(paletteer)
cor_Smarket %>%
stretch() %>%
ggplot(aes(x, y, fill = r)) +
geom_tile() +
geom_text(aes(label = as.character(fashion(r)))) +
scale_fill_paletteer_c("scico::roma", limits = c(-1, 1), direction = -1)

ggplot(Smarket, aes(Year, Volume)) +
geom_jitter(height = 0)

lr_spec <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
lr_fit <- lr_spec %>%
fit(
Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Smarket
)
lr_fit
## parsnip model object
##
##
## Call: stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 +
## Lag5 + Volume, family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## -0.126000 -0.073074 -0.042301 0.011085 0.009359 0.010313
## Volume
## 0.135441
##
## Degrees of Freedom: 1249 Total (i.e. Null); 1243 Residual
## Null Deviance: 1731
## Residual Deviance: 1728 AIC: 1742
lr_fit %>%
pluck("fit") %>%
summary()
##
## Call:
## stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 +
## Lag5 + Volume, family = stats::binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
tidy(lr_fit)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.126 0.241 -0.523 0.601
## 2 Lag1 -0.0731 0.0502 -1.46 0.145
## 3 Lag2 -0.0423 0.0501 -0.845 0.398
## 4 Lag3 0.0111 0.0499 0.222 0.824
## 5 Lag4 0.00936 0.0500 0.187 0.851
## 6 Lag5 0.0103 0.0495 0.208 0.835
## 7 Volume 0.135 0.158 0.855 0.392
predict(lr_fit, new_data = Smarket)
## # A tibble: 1,250 × 1
## .pred_class
## <fct>
## 1 Up
## 2 Down
## 3 Down
## 4 Up
## 5 Up
## 6 Up
## 7 Down
## 8 Up
## 9 Up
## 10 Down
## # ℹ 1,240 more rows
predict(lr_fit, new_data = Smarket, type = "prob")
## # A tibble: 1,250 × 2
## .pred_Down .pred_Up
## <dbl> <dbl>
## 1 0.493 0.507
## 2 0.519 0.481
## 3 0.519 0.481
## 4 0.485 0.515
## 5 0.489 0.511
## 6 0.493 0.507
## 7 0.507 0.493
## 8 0.491 0.509
## 9 0.482 0.518
## 10 0.511 0.489
## # ℹ 1,240 more rows
augment(lr_fit, new_data = Smarket) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 145 141
## Up 457 507
augment(lr_fit, new_data = Smarket) %>%
conf_mat(truth = Direction, estimate = .pred_class) %>%
autoplot(type = "heatmap")

augment(lr_fit, new_data = Smarket) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.522
Smarket_train <- Smarket %>%
filter(Year != 2005)
Smarket_test <- Smarket %>%
filter(Year == 2005)
lr_fit2 <- lr_spec %>%
fit(
Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Smarket_train
)
augment(lr_fit2, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 77 97
## Up 34 44
augment(lr_fit2, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.480
lr_fit3 <- lr_spec %>%
fit(
Direction ~ Lag1 + Lag2,
data = Smarket_train
)
augment(lr_fit3, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 35 35
## Up 76 106
augment(lr_fit3, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.560
Smarket_new <- tibble(
Lag1 = c(1.2, 1.5),
Lag2 = c(1.1, -0.8)
)
predict(
lr_fit3,
new_data = Smarket_new,
type = "prob"
)
## # A tibble: 2 × 2
## .pred_Down .pred_Up
## <dbl> <dbl>
## 1 0.521 0.479
## 2 0.504 0.496
lda_spec <- discrim_linear() %>%
set_mode("classification") %>%
set_engine("MASS")
lda_fit <- lda_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
lda_fit
## parsnip model object
##
## Call:
## lda(Direction ~ Lag1 + Lag2, data = data)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
predict(lda_fit, new_data = Smarket_test)
## # A tibble: 252 × 1
## .pred_class
## <fct>
## 1 Up
## 2 Up
## 3 Up
## 4 Up
## 5 Up
## 6 Up
## 7 Up
## 8 Up
## 9 Up
## 10 Up
## # ℹ 242 more rows
predict(lda_fit, new_data = Smarket_test, type = "prob")
## # A tibble: 252 × 2
## .pred_Down .pred_Up
## <dbl> <dbl>
## 1 0.490 0.510
## 2 0.479 0.521
## 3 0.467 0.533
## 4 0.474 0.526
## 5 0.493 0.507
## 6 0.494 0.506
## 7 0.495 0.505
## 8 0.487 0.513
## 9 0.491 0.509
## 10 0.484 0.516
## # ℹ 242 more rows
augment(lda_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 35 35
## Up 76 106
augment(lda_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.560
qda_spec <- discrim_quad() %>%
set_mode("classification") %>%
set_engine("MASS")
qda_fit <- qda_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
augment(qda_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 30 20
## Up 81 121
augment(qda_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.599
library(dplyr)
library(klaR)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
## The following object is masked from 'package:dplyr':
##
## select
library(parsnip)
nb_spec <- naive_Bayes() %>%
set_mode("classification") %>%
set_engine("klaR") %>%
set_args(usekernel = FALSE)
nb_fit <- nb_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
nb_spec <- naive_Bayes() %>%
set_mode("classification") %>%
set_engine("klaR") %>%
set_args(usekernel = FALSE)
nb_fit <- nb_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
library(yardstick)
augment(nb_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Truth
## Prediction Down Up
## Down 28 20
## Up 83 121
augment(nb_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.591
library(ggplot2)
library(ISLR2) # loads the Smarket dataset
ggplot(Smarket, aes(Lag1, Lag2)) +
geom_point(alpha = 0.1, size = 2) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "No apparent correlation between Lag1 and Lag2")
## `geom_smooth()` using formula = 'y ~ x'

knn_spec <- nearest_neighbor(neighbors = 3) %>%
set_mode("classification") %>%
set_engine("kknn")
knn_fit <- knn_spec %>%
fit(Direction ~ Lag1 + Lag2, data = Smarket_train)
knn_fit
## parsnip model object
##
##
## Call:
## kknn::train.kknn(formula = Direction ~ Lag1 + Lag2, data = data, ks = min_rows(3, data, 5))
##
## Type of response variable: nominal
## Minimal misclassification: 0.492986
## Best kernel: optimal
## Best k: 3
augment(knn_fit, new_data = Smarket_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Truth
## Prediction Down Up
## Down 43 58
## Up 68 83
augment(knn_fit, new_data = Smarket_test) %>%
accuracy(truth = Direction, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.5
Caravan_test <- Caravan[seq_len(1000), ]
Caravan_train <- Caravan[-seq_len(1000), ]
library(recipes)
rec_spec <- recipe(Purchase ~ ., data = Caravan_train) %>%
step_normalize(all_numeric_predictors())
Caravan_wf <- workflow() %>%
add_recipe(rec_spec)
knn_spec <- nearest_neighbor() %>%
set_mode("classification") %>%
set_engine("kknn")
knn1_wf <- Caravan_wf %>%
add_model(knn_spec %>% set_args(neighbors = 1))
knn3_wf <- Caravan_wf %>%
add_model(knn_spec %>% set_args(neighbors = 3))
knn5_wf <- Caravan_wf %>%
add_model(knn_spec %>% set_args(neighbors = 5))
knn1_fit <- fit(knn1_wf, data = Caravan_train)
knn3_fit <- fit(knn3_wf, data = Caravan_train)
knn5_fit <- fit(knn5_wf, data = Caravan_train)
augment(knn1_fit, new_data = Caravan_test) %>%
conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Truth
## Prediction No Yes
## No 874 50
## Yes 67 9
augment(knn3_fit, new_data = Caravan_test) %>%
conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Truth
## Prediction No Yes
## No 875 50
## Yes 66 9
augment(knn5_fit, new_data = Caravan_test) %>%
conf_mat(truth = Purchase, estimate = .pred_class)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
## Truth
## Prediction No Yes
## No 874 50
## Yes 67 9
pois_spec <- poisson_reg() %>%
set_mode("regression") %>%
set_engine("glm")
pois_rec_spec <- recipe(
bikers ~ mnth + hr + workingday + temp + weathersit,
data = Bikeshare
) %>%
step_dummy(all_nominal_predictors())
pois_wf <- workflow() %>%
add_recipe(pois_rec_spec) %>%
add_model(pois_spec)
pois_fit <- pois_wf %>% fit(data = Bikeshare)
augment(pois_fit, new_data = Bikeshare, type.predict = "response") %>%
ggplot(aes(bikers, .pred)) +
geom_point(alpha = 0.1) +
geom_abline(slope = 1, size = 1, color = "grey40") +
labs(title = "Predicting the number of bikers per hour using Poission Regression",
x = "Actual", y = "Predicted")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

pois_fit_coef_mnths <-
tidy(pois_fit) %>%
filter(grepl("^mnth", term)) %>%
mutate(
term = stringr::str_replace(term, "mnth_", ""),
term = forcats::fct_inorder(term)
)
pois_fit_coef_mnths %>%
ggplot(aes(term, estimate)) +
geom_line(group = 1) +
geom_point(shape = 21, size = 3, stroke = 1.5,
fill = "black", color = "white") +
labs(title = "Coefficient value from Poission Regression",
x = "Month", y = "Coefficient")

pois_fit_coef_hr <-
tidy(pois_fit) %>%
filter(grepl("^hr", term)) %>%
mutate(
term = stringr::str_replace(term, "hr_X", ""),
term = forcats::fct_inorder(term)
)
pois_fit_coef_hr %>%
ggplot(aes(term, estimate)) +
geom_line(group = 1) +
geom_point(shape = 21, size = 3, stroke = 1.5,
fill = "black", color = "white") +
labs(title = "Coefficient value from Poission Regression",
x = "hours", y = "Coefficient")

models <- list("logistic regression" = lr_fit3,
"LDA" = lda_fit,
"QDA" = qda_fit,
"KNN" = knn_fit)
# Make sure to load all necessary packages explicitly
library(dplyr) # For select() and pipe operators
library(purrr) # For imap_dfr()
library(tidymodels) # If using tidymodels for augment()
# Or library(broom) if using broom directly for augment()
# Now try with explicit namespace references
preds <- purrr::imap_dfr(models, augment,
new_data = Smarket_test, .id = "model")
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'Direction' is absent, its contrast will be ignored
# Use explicit dplyr namespace for select
preds %>%
dplyr::select(model, Direction, .pred_class, .pred_Down, .pred_Up)
## # A tibble: 1,008 × 5
## model Direction .pred_class .pred_Down .pred_Up
## <chr> <fct> <fct> <dbl> <dbl>
## 1 logistic regression Down Up 0.490 0.510
## 2 logistic regression Down Up 0.479 0.521
## 3 logistic regression Down Up 0.467 0.533
## 4 logistic regression Up Up 0.474 0.526
## 5 logistic regression Down Up 0.493 0.507
## 6 logistic regression Up Up 0.494 0.506
## 7 logistic regression Down Up 0.495 0.505
## 8 logistic regression Up Up 0.487 0.513
## 9 logistic regression Down Up 0.491 0.509
## 10 logistic regression Up Up 0.484 0.516
## # ℹ 998 more rows
multi_metric <- metric_set(accuracy, sensitivity, specificity)
preds %>%
group_by(model) %>%
multi_metric(truth = Direction, estimate = .pred_class)
## # A tibble: 12 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 KNN accuracy binary 0.5
## 2 LDA accuracy binary 0.560
## 3 QDA accuracy binary 0.599
## 4 logistic regression accuracy binary 0.560
## 5 KNN sensitivity binary 0.387
## 6 LDA sensitivity binary 0.315
## 7 QDA sensitivity binary 0.270
## 8 logistic regression sensitivity binary 0.315
## 9 KNN specificity binary 0.589
## 10 LDA specificity binary 0.752
## 11 QDA specificity binary 0.858
## 12 logistic regression specificity binary 0.752
preds %>%
group_by(model) %>%
roc_curve(Direction, .pred_Down) %>%
autoplot()
