p
1
mu1 <- -1.25
mu2 <- 1.25
sigma1 <- 1
sigma2 <- 1
bayes_boundary <- (mu1 + mu2) / 2
p1 <- ggplot(data = tibble(x = seq(-4, 4, 0.1)), aes(x)) +
stat_function(fun = dnorm, args = list(mean = mu1, sd = sigma1),
geom = "line", size = 1.5, color = "forestgreen") +
stat_function(fun = dnorm, args = list(mean = mu2, sd = sigma2),
geom = "line", size = 1.5, color = "orchid") +
geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
set.seed(42)
d <- tribble(
~class, ~x,
1, rnorm(20, mean = mu1, sd = sigma1),
2, rnorm(20, mean = mu2, sd = sigma2)
) %>%
unnest(x)
lda_boundary <-
(mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2
p2 <- d %>%
ggplot(aes(x, fill = factor(class), color = factor(class))) +
geom_histogram(bins = 13, alpha = 0.5, position = "identity") +
geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5) +
geom_vline(xintercept = lda_boundary, lty = 1, size = 1.5) +
scale_fill_manual(values = c("#50C878", "#B784A7")) + # emerald & mauve
scale_color_manual(values = c("#50C878", "#B784A7")) +
theme(legend.position = "none")
p1 | p2

set.seed(2021)
d <- tribble(
~class, ~x,
1, rnorm(1e3, mean = mu1, sd = sigma1),
2, rnorm(1e3, mean = mu2, sd = sigma2)
) %>%
unnest(x)
# The LDA boundary must be recomputed with the new data
lda_boundary <-
(mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2
d %>%
mutate(
bayes_class = ifelse(x > bayes_boundary, 1, 2),
lda_class = ifelse(x > lda_boundary, 1, 2)
) %>%
summarise(
`Bayes error rate` = mean(class == bayes_class),
`LDA error rate` = mean(class == lda_class)
)
## # A tibble: 1 × 2
## `Bayes error rate` `LDA error rate`
## <dbl> <dbl>
## 1 0.104 0.107
4.4.2 Linear Discriminant Analysis for
p > 1
d <- crossing(x1 = seq(-2, 2, 0.1), x2 = seq(-2, 2, 0.1))
d1 <- d %>%
bind_cols(
prob = mvtnorm::dmvnorm(
x = as.matrix(d),
mean = c(0, 0), sigma = matrix(c(1, 0, 0, 1), nrow = 2)
)
)
d2 <- d %>%
bind_cols(
prob = mvtnorm::dmvnorm(
x = as.matrix(d),
mean = c(0, 0), sigma = matrix(c(1, 0.7, 0.7, 1), nrow = 2)
)
)
p1 <- d1 %>%
ggplot(aes(x = x1, y = x2)) +
geom_tile(aes(fill = prob)) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
theme(legend.position = "none")
p2 <- d2 %>%
ggplot(aes(x = x1, y = x2)) +
geom_tile(aes(fill = prob)) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
theme(legend.position = "none")
p1 | p2

library(ISLR) # or ISLR2 depending on your version
data(Default)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
lda_default_balance_student <- MASS::lda(default ~ balance + student, data = Default)
lda_default_balance_student
## Call:
## lda(default ~ balance + student, data = Default)
##
## Prior probabilities of groups:
## No Yes
## 0.9667 0.0333
##
## Group means:
## balance studentYes
## No 803.9438 0.2914037
## Yes 1747.8217 0.3813814
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002244397
## studentYes -0.249059498
library(ISLR)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
data(Default)
lda_default_balance_student <- lda(default ~ balance + student, data = Default)
lda_default_balance_student
## Call:
## lda(default ~ balance + student, data = Default)
##
## Prior probabilities of groups:
## No Yes
## 0.9667 0.0333
##
## Group means:
## balance studentYes
## No 803.9438 0.2914037
## Yes 1747.8217 0.3813814
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002244397
## studentYes -0.249059498
# The MASS package has a `select` function that overwrite `dplyr`,
# fix that before it becomes a problem
select <- dplyr::select
mean(
predict(lda_default_balance_student,
newdata = Default)$class != Default$default
)
## [1] 0.0275
lda_pred <- bind_cols(
pred_default = predict(lda_default_balance_student,
newdata = Default)$class,
Default
)
lda_pred %>%
count(pred_default, default) %>%
pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
mutate(Total = No + Yes) %>%
gt(rowname_col = "pred_default") %>%
tab_spanner(label = "True default status", columns = everything()) %>%
tab_stubhead("Predicted") %>%
grand_summary_rows(
columns = c(No, Yes, Total),
fns = list(Total = ~round(sum(.), 0))
)
Predicted |
True default status
|
No |
Yes |
Total |
No |
9644 |
252 |
9896 |
Yes |
23 |
81 |
104 |
Total |
9667 |
333 |
10000 |
lda_posterior <- predict(lda_default_balance_student, newdata = Default)$posterior
head(lda_posterior)
## No Yes
## 1 0.9968680 0.003131975
## 2 0.9971925 0.002807531
## 3 0.9843970 0.015603046
## 4 0.9987769 0.001223133
## 5 0.9959254 0.004074582
## 6 0.9954627 0.004537289
lda_pred_20 <- bind_cols(
Default, # Make sure to use `Default` (uppercase D)
posterior_prob_default = lda_posterior[, 2]
) %>%
mutate(
pred_default = ifelse(posterior_prob_default > 0.2, "Yes", "No")
)
lda_pred_20 %>%
count(pred_default, default) %>%
pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
mutate(Total = No + Yes) %>%
gt(rowname_col = "pred_default") %>%
tab_spanner(label = "True default status", columns = everything()) %>%
tab_stubhead("Predicted") %>%
grand_summary_rows(
columns = c(No, Yes, Total),
fns = list(Total = ~round(sum(.), 0))
)
Predicted |
True default status
|
No |
Yes |
Total |
No |
9432 |
138 |
9570 |
Yes |
235 |
195 |
430 |
Total |
9667 |
333 |
10000 |
lda_roc <-
yardstick::roc_curve(
lda_pred_20,
# Specify the class probability and the truth variables
posterior_prob_default, truth = default,
# This argument specifies which level of truth (default) is considered
# "positive", so it will flip the ROC curve vertically
event_level = "second"
)
autoplot(lda_roc)

yardstick::roc_auc(
lda_pred_20,
posterior_prob_default, truth = default,
event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.950
glm_default_balance_student <- glm(default ~ balance + student,
data = Default,
family = binomial)
summary(glm_default_balance_student)
##
## Call:
## glm(formula = default ~ balance + student, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.075e+01 3.692e-01 -29.116 < 2e-16 ***
## balance 5.738e-03 2.318e-04 24.750 < 2e-16 ***
## studentYes -7.149e-01 1.475e-01 -4.846 1.26e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.7 on 9997 degrees of freedom
## AIC: 1577.7
##
## Number of Fisher Scoring iterations: 8
glm_pred <- bind_cols(
Default, # Use the correct dataset name
glm_prob_default = predict(
glm_default_balance_student,
newdata = Default, type = "response"
)
)
yardstick::roc_auc(
glm_pred,
glm_prob_default, truth = default,
event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.950
4.4.4 Naive Bayes
# Make sure klaR is loaded
library(klaR)
# Fit Naive Bayes model
nb_default <- NaiveBayes(default ~ balance + student, data = Default)
nb_pred <- bind_cols(
Default,
nb_prob_default = predict(nb_default, newdata = Default)$posterior[,2]
)
nb_pred <- nb_pred %>%
mutate(
pred_default_0.5 = ifelse(nb_prob_default > 0.5, "Yes", "No"),
pred_default_0.2 = ifelse(nb_prob_default > 0.2, "Yes", "No")
)
nb_pred %>%
count(pred_default_0.5, default) %>%
pivot_wider(names_from = default, values_from = n) %>%
mutate(Total = No + Yes) %>%
gt(rowname_col = "pred_default_0.5") %>%
tab_spanner(label = "True default status", columns = everything()) %>%
tab_stubhead("Predicted") %>%
grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
Predicted |
True default status
|
No |
Yes |
Total |
No |
9621 |
244 |
9865 |
Yes |
46 |
89 |
135 |
Total |
9667 |
333 |
10000 |
nb_pred %>%
count(pred_default_0.2, default) %>%
pivot_wider(names_from = default, values_from = n) %>%
mutate(Total = No + Yes) %>%
gt(rowname_col = "pred_default_0.2") %>%
tab_spanner(label = "True default status", columns = everything()) %>%
tab_stubhead("Predicted") %>%
grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
Predicted |
True default status
|
No |
Yes |
Total |
No |
9339 |
130 |
9469 |
Yes |
328 |
203 |
531 |
Total |
9667 |
333 |
10000 |
nb_pred %>%
select(default, pred_default_0.2, pred_default_0.5) %>%
pivot_longer(c(pred_default_0.5, pred_default_0.2),
names_to = "threshold", values_to = "pred_default") %>%
mutate(threshold = as.numeric(str_remove(threshold, "pred_default_"))) %>%
group_by(threshold) %>%
summarise(
overall_error = mean(default != pred_default),
sensitivity = sum(default == "Yes" & pred_default == "Yes") /
sum(default == "Yes"),
specificity = sum(default == "No" & pred_default == "No") /
sum(default == "No"),
.groups = "drop"
) %>%
mutate(across(everything(), scales::percent)) %>%
gt()
threshold |
overall_error |
sensitivity |
specificity |
20% |
4.6% |
61% |
96.6% |
50% |
2.9% |
27% |
99.5% |
4.5 A Comparison of Classification Methods
4.5.2 An Empirical Comparison
make_blobs <- function(
n_samples = 40, n_features = 2,
# By default, class 1 is centered at (0, 0) and class 2 at (1, 1)
cluster_centers = matrix(c(0, 0, 1, 1), nrow = 2, byrow = TRUE),
# By default, the two features are uncorrelated with variance = 1
cluster_covar = matrix(c(1, 0, 0, 1), nrow = 2),
dist = c("norm", "t"), t_df = 5
) {
if (ncol(cluster_centers) != n_features) {
stop("Dimensionality of centers must equal number of features")
}
if ((nrow(cluster_covar) != n_features) |
(ncol(cluster_covar) != n_features)) {
stop("Dimensionality of covariance matrix must match number of features")
}
dist <- match.arg(dist)
# Equally divides each of `n_samples` into the different categories according
# to the number of provided classes
categories <- rep(1:nrow(cluster_centers), length.out = n_samples)
if (dist == "norm") {
points <- MASS::mvrnorm(n = n_samples, mu = c(0, 0), Sigma = cluster_covar)
} else if (dist == "t") {
points <- mvtnorm::rmvt(n = n_samples, delta = c(0, 0), df = t_df,
sigma = cluster_covar)
}
points <- points + cluster_centers[categories, ]
colnames(points) <- c("x", "y")
as_tibble(points) %>%
bind_cols(category = factor(categories))
}
install.packages("dunnr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## Warning: package 'dunnr' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
set.seed(22)
library(ggplot2)
library(tidyr)
# Create your example data
tribble(
~scenario, ~data,
"Scenario 1", make_blobs(n_samples = 40),
"Scenario 2", make_blobs(n_samples = 40,
cluster_covar = matrix(c(1, -0.5, -0.5, 1),
nrow = 2)),
"Scenario 3", make_blobs(n_samples = 100,
cluster_covar = matrix(c(1, -0.5, -0.5, 1),
nrow = 2),
dist = "t")
) %>%
unnest(data) %>%
ggplot(aes(x, y, color = category, shape = category)) +
geom_point(size = 3) +
facet_wrap(~scenario) +
theme(
strip.background = element_rect(color = "black", fill = "lightgray", size = 1, linetype = "solid"),
strip.text = element_text(face = "bold")
)
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

sim_linear_train <- tribble(
~ scenario, ~ n_samples, ~ corr, ~ dist,
"Scenario 1", 40, 0.0, "norm",
"Scenario 2", 40, -0.5, "norm",
"Scenario 3", 40, -0.5, "t"
) %>%
crossing(sim = 1:100) %>%
rowwise() %>%
mutate(
train_data = list(make_blobs(
n_samples = n_samples,
cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
dist = dist
))
) %>%
ungroup()
sim_linear_test <- sim_linear_train %>%
distinct(scenario, corr, dist) %>%
rowwise() %>%
mutate(
test_data = list(make_blobs(
n_samples = 1000,
cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
dist = dist
))
) %>%
ungroup()
library(tidymodels)
library(discrim) # this needs to be loaded separately for `discrim_*()`
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
models <- tribble(
~ model_label, ~ model,
"KNN-1", nearest_neighbor(mode = "classification", neighbors = 1),
"KNN-CV", nearest_neighbor(mode = "classification", neighbors = tune()),
"LDA", discrim_linear(),
"Logistic", logistic_reg(),
"NBayes", naive_Bayes(engine = "klaR") %>%
# The klaR engine has an argument usekernel that is always TRUE
# We have to set it to FALSE to not use KDE, and instead use Gaussian
# distributions, as in the text
set_args(usekernel = FALSE),
"QDA", discrim_quad()
)
# A helper function for fitting on a training set and getting accuracy from
# a testing set
calc_test_accuracy <- function(model_label, train_data, test_data, model) {
wf <- workflow() %>%
add_recipe(recipe(category ~ x + y, data = train_data)) %>%
add_model(model)
if (model_label == "KNN-CV") {
# 5 fold cross-validation
train_data_folds <- vfold_cv(train_data, v = 5)
tune_res <- wf %>%
tune_grid(
resamples = train_data_folds,
# Try 1 to 10 neighbors
grid = tibble(neighbors = 1:10)
)
# Overwrite the workflow with the best `neighbors` value by CV accuracy
wf <- finalize_workflow(wf, select_best(tune_res, "accuracy"))
}
wf %>%
fit(data = train_data) %>%
augment(test_data) %>%
accuracy(truth = category, estimate = .pred_class) %>%
pull(.estimate)
}
# Install and load required packages
install.packages("tictoc") # For timing code execution
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tictoc)
library(dplyr) # Data manipulation
library(purrr) # Functional programming (pmap_dbl)
library(tidyr) # Data tidying (crossing)
library(ggplot2) # Visualization (if needed)
library(kknn) # K-Nearest Neighbors model (if used)
# Evaluate model performance
install.packages("parsnip")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("recipes") # If you are working with recipes for model workflows
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(parsnip)
library(recipes)
# Example data
sim_linear_train <- data.frame(
scenario = rep(1:3, each = 5),
train_data = rep(list(data.frame(x = rnorm(5), y = rnorm(5))), 3)
)
sim_linear_test <- data.frame(
scenario = rep(1:3, each = 5),
test_data = rep(list(data.frame(x = rnorm(5), y = rnorm(5))), 3)
)
models <- data.frame(model_label = c("KNN", "SVM"))
# Function to calculate test accuracy (as an example)
calc_test_accuracy <- function(model_label, train_data, test_data, model) {
# Placeholder logic for calculating test accuracy
return(runif(1)) # Example: return a random accuracy for now
}
# Create the result
colnames(sim_linear_test)
## [1] "scenario" "test_data.x" "test_data.y" "test_data.x.1"
## [5] "test_data.y.1" "test_data.x.2" "test_data.y.2"
str(sim_linear_test$test_data)
## NULL
sim_linear_test_clean <- sim_linear_test %>%
rename(test_data = test_data.x) %>%
select(scenario, test_data)
# View the result
sim_linear_test_clean <- sim_linear_test %>%
rename(test_data = test_data.x) %>%
select(scenario, test_data)
4.8 Exercises
13. Predict returns with Weekly
weekly <- ISLR2::Weekly
skimr::skim(weekly)
Data summary
Name |
weekly |
Number of rows |
1089 |
Number of columns |
9 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
numeric |
8 |
________________________ |
|
Group variables |
None |
Variable type: factor
Direction |
0 |
1 |
FALSE |
2 |
Up: 605, Dow: 484 |
Variable type: numeric
Year |
0 |
1 |
2000.05 |
6.03 |
1990.00 |
1995.00 |
2000.00 |
2005.00 |
2010.00 |
▇▆▆▆▆ |
Lag1 |
0 |
1 |
0.15 |
2.36 |
-18.20 |
-1.15 |
0.24 |
1.41 |
12.03 |
▁▁▆▇▁ |
Lag2 |
0 |
1 |
0.15 |
2.36 |
-18.20 |
-1.15 |
0.24 |
1.41 |
12.03 |
▁▁▆▇▁ |
Lag3 |
0 |
1 |
0.15 |
2.36 |
-18.20 |
-1.16 |
0.24 |
1.41 |
12.03 |
▁▁▆▇▁ |
Lag4 |
0 |
1 |
0.15 |
2.36 |
-18.20 |
-1.16 |
0.24 |
1.41 |
12.03 |
▁▁▆▇▁ |
Lag5 |
0 |
1 |
0.14 |
2.36 |
-18.20 |
-1.17 |
0.23 |
1.41 |
12.03 |
▁▁▆▇▁ |
Volume |
0 |
1 |
1.57 |
1.69 |
0.09 |
0.33 |
1.00 |
2.05 |
9.33 |
▇▂▁▁▁ |
Today |
0 |
1 |
0.15 |
2.36 |
-18.20 |
-1.15 |
0.24 |
1.41 |
12.03 |
▁▁▆▇▁ |
weekly %>%
ggplot(aes(x = factor(Year), y = Volume)) +
geom_line() +
theme_minimal()

install.packages("forcats")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(forcats)
weekly <- weekly %>% mutate(Direction = fct_rev(Direction))
library(gt)
library(tidymodels)
lr_weekly_fit <- logistic_reg() %>%
set_engine("glm") %>%
fit(Direction ~ Year + Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = weekly)
# Load the required libraries
library(broom)
library(gt)
# Tidy the model and display it with gt
lr_weekly_tidy <- tidy(lr_weekly_fit)
lr_weekly_tidy %>%
gt()
term |
estimate |
std.error |
statistic |
p.value |
(Intercept) |
-17.225822230 |
37.89052190 |
-0.45462088 |
0.6493820 |
Year |
0.008499918 |
0.01899083 |
0.44758011 |
0.6544563 |
Lag1 |
0.040687571 |
0.02644652 |
1.53848459 |
0.1239302 |
Lag2 |
-0.059448637 |
0.02697031 |
-2.20422531 |
0.0275085 |
Lag3 |
0.015477987 |
0.02670309 |
0.57963289 |
0.5621622 |
Lag4 |
0.027316278 |
0.02648478 |
1.03139539 |
0.3023554 |
Lag5 |
0.014022185 |
0.02640947 |
0.53095285 |
0.5954515 |
Volume |
-0.003256253 |
0.06883640 |
-0.04730423 |
0.9622708 |
lr_weekly_fit_conf_mat <- augment(lr_weekly_fit, weekly) %>%
conf_mat(truth = Direction, estimate = .pred_class)
lr_weekly_fit_conf_mat
## Truth
## Prediction Up Down
## Up 558 428
## Down 47 56
weekly_train <- weekly %>% filter(Year <= 2008)
weekly_test <- weekly %>% filter(Year > 2008)
lr_weekly_fit_lag2 <-
logistic_reg() %>%
fit(Direction ~ Lag2, data = weekly_train)
lr_weekly_fit_lag2_conf_mat <-
augment(lr_weekly_fit_lag2, weekly_test) %>%
conf_mat(truth = Direction, estimate = .pred_class)
lr_weekly_fit_lag2_conf_mat
## Truth
## Prediction Up Down
## Up 56 34
## Down 5 9
install.packages("discrim")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(discrim)
model_fits <- list(
"logistic" = lr_weekly_fit_lag2,
"LDA" = discrim_linear() %>% fit(Direction ~ Lag2, data = weekly_train),
"QDA" = discrim_quad() %>% fit(Direction ~ Lag2, data = weekly_train),
"KNN1" = nearest_neighbor(mode = "classification", neighbors = 1) %>%
fit(Direction ~ Lag2, data = weekly_train),
"NB" = naive_Bayes() %>% fit(Direction ~ Lag2, data = weekly_train)
)
weekly_metrics <- metric_set(accuracy, sens, spec, ppv)
imap_dfr(
model_fits,
~augment(.x, new_data = weekly_test) %>%
weekly_metrics(truth = Direction, estimate = .pred_class),
.id = "model"
) %>%
select(model, .metric, .estimate) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
gt(rowname_col = "model") %>%
fmt_percent(columns = -model)
|
accuracy |
sens |
spec |
ppv |
logistic |
62.50% |
91.80% |
20.93% |
62.22% |
LDA |
62.50% |
91.80% |
20.93% |
62.22% |
QDA |
58.65% |
100.00% |
0.00% |
58.65% |
KNN1 |
50.00% |
49.18% |
51.16% |
58.82% |
NB |
60.58% |
91.80% |
16.28% |
60.87% |
14. Predict gas mileage with Auto
auto <- ISLR2::Auto %>%
mutate(mpg01 = ifelse(mpg > median(mpg), 1, 0),
mpg01 = factor(mpg01))
glimpse(auto)
## Rows: 392
## Columns: 10
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
## $ mpg01 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
auto <- auto %>%
mutate(origin = factor(origin, levels = 1:3,
labels = c("American", "European", "Japanese")))
auto %>%
select(-name, -origin, -mpg) %>%
pivot_longer(-mpg01, names_to = "var", values_to = "val") %>%
ggplot(aes(y = mpg01, x = val)) +
geom_boxplot(aes(fill = factor(mpg01))) +
facet_wrap(~var, scales = "free_x") +
theme(
legend.position = "none",
strip.background = element_rect(color = "black", size = 1), # Adds border around facet labels
strip.text = element_text(size = 12) # Optional: Adjusts the text size of facet labels
)

auto %>%
count(origin, mpg01) %>%
ggplot(aes(y = origin, x = mpg01)) +
geom_tile(aes(fill = n)) +
geom_text(aes(label = n), color = "white") +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
theme(legend.position = "none")

install.packages("RColorBrewer")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(RColorBrewer)
auto %>% select(-name, -mpg01, -origin) %>%
corrr::correlate(method = "pearson", quiet = TRUE) %>%
gt(rowname_col = "term") %>%
gt::fmt_missing(columns = everything(), missing_text = "") %>%
gt::data_color(
columns = everything(),
colors = scales::col_numeric(
palette = brewer.pal(5, "RdBu"), # Using a diverging palette (e.g., "RdBu")
domain = c(-1, 1)
)
) %>%
gt::fmt_number(columns = everything(), decimals = 2)
## Warning: Since gt v0.6.0 `fmt_missing()` is deprecated and will soon be removed.
## ℹ Use `sub_missing()` instead.
## This warning is displayed once every 8 hours.
## Warning: Since gt v0.9.0, the `colors` argument has been deprecated.
## • Please use the `fn` argument instead.
## This warning is displayed once every 8 hours.
|
mpg |
cylinders |
displacement |
horsepower |
weight |
acceleration |
year |
mpg |
|
−0.78 |
−0.81 |
−0.78 |
−0.83 |
0.42 |
0.58 |
cylinders |
−0.78 |
|
0.95 |
0.84 |
0.90 |
−0.50 |
−0.35 |
displacement |
−0.81 |
0.95 |
|
0.90 |
0.93 |
−0.54 |
−0.37 |
horsepower |
−0.78 |
0.84 |
0.90 |
|
0.86 |
−0.69 |
−0.42 |
weight |
−0.83 |
0.90 |
0.93 |
0.86 |
|
−0.42 |
−0.31 |
acceleration |
0.42 |
−0.50 |
−0.54 |
−0.69 |
−0.42 |
|
0.29 |
year |
0.58 |
−0.35 |
−0.37 |
−0.42 |
−0.31 |
0.29 |
|
set.seed(49)
auto_split <- initial_split(auto, prop = 3 / 4)
auto_train <- training(auto_split)
auto_test <- testing(auto_split)
auto_train %>% count(mpg01)
## mpg01 n
## 1 0 148
## 2 1 146
auto_test %>% count(mpg01)
## mpg01 n
## 1 0 48
## 2 1 50
auto_recipe <- recipe(
mpg01 ~ cylinders + displacement + horsepower + weight + acceleration +
year + origin,
data = auto_train
) %>%
# Normalize numerical predictors to work with KNN
step_normalize(all_numeric_predictors()) %>%
step_dummy(origin)
auto_workflow <- workflow() %>%
add_recipe(auto_recipe)
model_fits <-
list(
"LDA" = auto_workflow %>%
add_model(discrim_linear()) %>%
fit(data = auto_train),
"QDA" = auto_workflow %>%
add_model(discrim_quad()) %>%
fit(data = auto_train),
"logistic" = auto_workflow %>%
add_model(logistic_reg()) %>%
fit(data = auto_train),
"NB" = auto_workflow %>%
add_model(naive_Bayes()) %>%
fit(data = auto_train),
"KNN1" = auto_workflow %>%
add_model(nearest_neighbor(mode = "classification", neighbors = 1)) %>%
fit(data = auto_train),
"KNN3" = auto_workflow %>%
add_model(nearest_neighbor(mode = "classification", neighbors = 3)) %>%
fit(data = auto_train),
"KNN5" = auto_workflow %>%
add_model(nearest_neighbor(mode = "classification", neighbors = 5)) %>%
fit(data = auto_train),
"KNN7" = auto_workflow %>%
add_model(nearest_neighbor(mode = "classification", neighbors = 7)) %>%
fit(data = auto_train)
)
auto_metrics <- metric_set(accuracy, sens, spec, ppv)
imap_dfr(
model_fits,
~augment(.x, new_data = auto_test) %>%
auto_metrics(truth = mpg01, estimate = .pred_class),
.id = "model"
) %>%
select(model, .metric, .estimate) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
arrange(desc(accuracy)) %>%
gt(rowname_col = "model") %>%
fmt_percent(columns = -model, decimals = 1)
|
accuracy |
sens |
spec |
ppv |
LDA |
90.8% |
87.5% |
94.0% |
93.3% |
logistic |
90.8% |
91.7% |
90.0% |
89.8% |
QDA |
89.8% |
85.4% |
94.0% |
93.2% |
NB |
89.8% |
87.5% |
92.0% |
91.3% |
KNN1 |
89.8% |
93.8% |
86.0% |
86.5% |
KNN3 |
89.8% |
93.8% |
86.0% |
86.5% |
KNN5 |
89.8% |
91.7% |
88.0% |
88.0% |
KNN7 |
89.8% |
91.7% |
88.0% |
88.0% |
16. Predict crime rate with Boston
boston <- ISLR2::Boston %>%
mutate(
crim01 = ifelse(crim > median(crim), 1, 0),
crim01 = factor(crim01),
# Convert the binary chas variable to TRUE/FALSE
chas = chas == 1
)
glimpse(boston)
## Rows: 506
## Columns: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
## $ crim01 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
boston %>%
select(-chas) %>%
pivot_longer(-crim01, names_to = "var", values_to = "val") %>%
ggplot(aes(y = crim01, x = val)) +
geom_boxplot(aes(fill = factor(crim01))) +
facet_wrap(~var, scales = "free_x") +
theme(
legend.position = "none",
strip.background = element_rect(color = "black", size = 1), # Add borders to facet labels
strip.text = element_text(size = 12) # Adjust text size of facet labels
)

boston %>%
count(chas, crim01) %>%
ggplot(aes(y = chas, x = crim01)) +
geom_tile(aes(fill = n)) +
geom_text(aes(label = n), color = "white") +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
theme(legend.position = "none")

set.seed(98)
# By default, splits 3:1
boston_split <- initial_split(boston)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)
boston_models <- list(
"LDA" = discrim_linear(),
"QDA" = discrim_quad(),
"logistic" = logistic_reg(),
"NB" = naive_Bayes(),
"KNN1" = nearest_neighbor(mode = "classification", neighbors = 1),
"KNN3" = nearest_neighbor(mode = "classification", neighbors = 3),
"KNN5" = nearest_neighbor(mode = "classification", neighbors = 5),
"KNN7" = nearest_neighbor(mode = "classification", neighbors = 7)
)
boston_recs <- list(
"rec1" = recipe(
crim01 ~ age + dis + indus + lstat + medv + nox + ptratio + rad + tax + zn,
data = boston_train
) %>%
step_normalize(all_numeric_predictors()),
# Drop medv and lstat
"rec2" = recipe(
crim01 ~ age + dis + indus + nox + ptratio + rad + tax + zn,
data = boston_train
) %>%
step_normalize(all_numeric_predictors()),
# Drop ptratio and tax
"rec3" = recipe(
crim01 ~ age + dis + indus + nox + rad + zn,
data = boston_train
) %>%
step_normalize(all_numeric_predictors())
)
boston_fits <-
map(
boston_models,
function(model) {
map(
boston_recs,
~workflow() %>%
add_model(model) %>%
add_recipe(.x) %>%
fit(data = boston_train)
)
}
)
boston_metrics <- metric_set(accuracy, sens, spec, ppv)
imap_dfr(
boston_fits,
function(fit, y) {
imap_dfr(
fit,
~augment(.x, new_data = boston_test) %>%
boston_metrics(truth = crim01, estimate = .pred_class),
.id = "recipe"
)
},
.id = "model"
) %>%
select(model, recipe, .metric, .estimate) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
arrange(recipe, desc(accuracy)) %>%
group_by(recipe) %>%
gt(rowname_col = "model") %>%
fmt_percent(columns = -model, decimals = 1)
|
accuracy |
sens |
spec |
ppv |
rec1 |
QDA |
95.3% |
94.6% |
95.8% |
94.6% |
KNN1 |
94.5% |
98.2% |
91.5% |
90.2% |
KNN3 |
94.5% |
98.2% |
91.5% |
90.2% |
KNN5 |
94.5% |
98.2% |
91.5% |
90.2% |
KNN7 |
94.5% |
98.2% |
91.5% |
90.2% |
logistic |
92.1% |
98.2% |
87.3% |
85.9% |
NB |
85.0% |
80.4% |
88.7% |
84.9% |
LDA |
83.5% |
91.1% |
77.5% |
76.1% |
rec2 |
KNN5 |
96.1% |
100.0% |
93.0% |
91.8% |
KNN1 |
95.3% |
98.2% |
93.0% |
91.7% |
KNN3 |
95.3% |
98.2% |
93.0% |
91.7% |
KNN7 |
95.3% |
98.2% |
93.0% |
91.7% |
QDA |
94.5% |
94.6% |
94.4% |
93.0% |
logistic |
89.0% |
87.5% |
90.1% |
87.5% |
LDA |
86.6% |
96.4% |
78.9% |
78.3% |
NB |
84.3% |
80.4% |
87.3% |
83.3% |
rec3 |
KNN5 |
95.3% |
98.2% |
93.0% |
91.7% |
KNN1 |
94.5% |
98.2% |
91.5% |
90.2% |
KNN3 |
94.5% |
98.2% |
91.5% |
90.2% |
KNN7 |
94.5% |
96.4% |
93.0% |
91.5% |
logistic |
89.0% |
87.5% |
90.1% |
87.5% |
QDA |
86.6% |
91.1% |
83.1% |
81.0% |
NB |
86.6% |
76.8% |
94.4% |
91.5% |
LDA |
84.3% |
91.1% |
78.9% |
77.3% |
boston_recs$rec2$term_info
## # A tibble: 9 × 4
## variable type role source
## <chr> <list> <chr> <chr>
## 1 age <chr [2]> predictor original
## 2 dis <chr [2]> predictor original
## 3 indus <chr [2]> predictor original
## 4 nox <chr [2]> predictor original
## 5 ptratio <chr [2]> predictor original
## 6 rad <chr [2]> predictor original
## 7 tax <chr [2]> predictor original
## 8 zn <chr [2]> predictor original
## 9 crim01 <chr [3]> outcome original