4.4.1 Linear Discriminant Analysis for p = 1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(tidyr)
library(tibble)
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
td_colors <- list(
  nice = list(
    emerald = "#50C878",     # Emerald green
    opera_mauve = "#B784A7"  # Opera mauve
  )
)
mu1 <- -1.25
mu2 <- 1.25
sigma1 <- 1
sigma2 <- 1
bayes_boundary <- (mu1 + mu2) / 2
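With equal priors and a shared variance, the Bayes decision boundary is the point where the two class densities are equal, which here is the midpoint of the two means. A quick sanity check, using the mu1, mu2, and sigma values defined above:

# The two class densities should be (numerically) equal at the boundary
dnorm(bayes_boundary, mean = mu1, sd = sigma1)
dnorm(bayes_boundary, mean = mu2, sd = sigma2)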
p1 <- ggplot(data = tibble(x = seq(-4, 4, 0.1)), aes(x)) +
  stat_function(fun = dnorm, args = list(mean = mu1, sd = sigma1),
                geom = "line", linewidth = 1.5, color = td_colors$nice$emerald) +
  stat_function(fun = dnorm, args = list(mean = mu2, sd = sigma2),
                geom = "line", linewidth = 1.5, color = td_colors$nice$opera_mauve) +
  geom_vline(xintercept = bayes_boundary, lty = 2, linewidth = 1.5) +
  theme(
    axis.text.y = element_blank(),   # Remove y-axis text
    axis.ticks.y = element_blank(),  # Remove y-axis ticks
    axis.title.y = element_blank()   # Remove y-axis label
  )
set.seed(42)
d <- tribble(
  ~class, ~x,
  1, rnorm(20, mean = mu1, sd = sigma1),
  2, rnorm(20, mean = mu2, sd = sigma2)
) %>%
  unnest(x)
lda_boundary <-
(mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2
p2 <- d %>%
  ggplot(aes(x, fill = factor(class), color = factor(class))) +
  geom_histogram(bins = 13, alpha = 0.5, position = "identity") +
  geom_vline(xintercept = bayes_boundary, lty = 2, linewidth = 1.5) +
  geom_vline(xintercept = lda_boundary, lty = 1, linewidth = 1.5) +
  scale_fill_manual(values = c(td_colors$nice$emerald,
                               td_colors$nice$opera_mauve)) +
  scale_color_manual(values = c(td_colors$nice$emerald,
                                td_colors$nice$opera_mauve)) +
  theme(legend.position = "none")
library(patchwork)
p1 | p2
print(td_colors)
## $nice
## $nice$emerald
## [1] "#50C878"
##
## $nice$opera_mauve
## [1] "#B784A7"
4.4.2 Linear Discriminant Analysis for p > 1
d <- crossing(x1 = seq(-2, 2, 0.1), x2 = seq(-2, 2, 0.1))
d1 <- d %>%
  bind_cols(
    prob = mvtnorm::dmvnorm(
      x = as.matrix(d),
      mean = c(0, 0), sigma = matrix(c(1, 0, 0, 1), nrow = 2)
    )
  )
d2 <- d %>%
  bind_cols(
    prob = mvtnorm::dmvnorm(
      x = as.matrix(d),
      mean = c(0, 0), sigma = matrix(c(1, 0.7, 0.7, 1), nrow = 2)
    )
  )
p1 <- d1 %>%
  ggplot(aes(x = x1, y = x2)) +
  geom_tile(aes(fill = prob)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "none")
p2 <- d2 %>%
  ggplot(aes(x = x1, y = x2)) +
  geom_tile(aes(fill = prob)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "none")
p1 | p2
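For p > 1, LDA assumes every class shares the same covariance matrix, so the discriminant score delta_k(x) = x' Sigma^{-1} mu_k - (1/2) mu_k' Sigma^{-1} mu_k + log(pi_k) is linear in x and the boundary between two classes is a straight line. A minimal sketch of that boundary for two made-up classes; the means, priors, and shared covariance below are illustrative assumptions, not values from the text:

mu_a <- c(-1, -1)
mu_b <- c(1, 1)
sigma_shared <- matrix(c(1, 0.7, 0.7, 1), nrow = 2)
sigma_inv <- solve(sigma_shared)
# Linear discriminant score for a class with mean mu and prior probability `prior`
delta <- function(x, mu, prior) {
  drop(x %*% sigma_inv %*% mu) - drop(t(mu) %*% sigma_inv %*% mu) / 2 + log(prior)
}
crossing(x1 = seq(-2, 2, 0.1), x2 = seq(-2, 2, 0.1)) %>%
  mutate(
    class = ifelse(
      delta(cbind(x1, x2), mu_a, 0.5) > delta(cbind(x1, x2), mu_b, 0.5),
      "a", "b"
    )
  ) %>%
  ggplot(aes(x1, x2, fill = class)) +
  geom_tile(alpha = 0.5) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))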
data("Default")
lda_Default_balance_student <- MASS::lda(default ~ balance + student, data = Default)
lda_Default_balance_student
## Call:
## lda(default ~ balance + student, data = Default)
##
## Prior probabilities of groups:
## No Yes
## 0.9667 0.0333
##
## Group means:
## balance studentYes
## No 803.9438 0.2914037
## Yes 1747.8217 0.3813814
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002244397
## studentYes -0.249059498
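The prior probabilities reported above are simply the class proportions of default in the training data, and the group means are the per-class averages of each predictor. A quick check of the priors:

# Matches the 0.9667 / 0.0333 priors in the lda() output
table(Default$default) / nrow(Default)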
# Ensure dplyr's select() is used (MASS also exports a select() that masks it when attached)
select <- dplyr::select
mean(
  predict(lda_Default_balance_student, newdata = Default)$class != Default$default
)
## [1] 0.0275
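A 2.75% training error rate looks good, but only because defaults are rare: the trivial classifier that always predicts "No" is wrong just 3.33% of the time. For comparison:

# Error rate of the null classifier that always predicts "No"
mean(Default$default != "No")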
lda_pred <-
  bind_cols(
    pred_default = predict(lda_Default_balance_student,
                           newdata = Default)$class,
    Default
  )
library(gt)
lda_pred %>%
  count(pred_default, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default") %>%
  gt::tab_spanner(label = "True default status", columns = everything()) %>%
  gt::tab_stubhead("Predicted") %>%
  # Use grand_summary_rows() instead of summary_rows()
  gt::grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9644 | 252 |  9896
Yes       |   23 |  81 |   104
Total     | 9667 | 333 | 10000
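The confusion matrix shows that the low overall error hides poor sensitivity: only 81 of the 333 true defaulters (about 24%) are flagged, while specificity is near-perfect. Computing the class-specific rates directly from lda_pred:

lda_pred %>%
  summarise(
    sensitivity = sum(default == "Yes" & pred_default == "Yes") / sum(default == "Yes"),
    specificity = sum(default == "No" & pred_default == "No") / sum(default == "No")
  )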
lda_posterior <- predict(lda_Default_balance_student, newdata = Default)$posterior
head(lda_posterior)
## No Yes
## 1 0.9968680 0.003131975
## 2 0.9971925 0.002807531
## 3 0.9843970 0.015603046
## 4 0.9987769 0.001223133
## 5 0.9959254 0.004074582
## 6 0.9954627 0.004537289
lda_pred_20 <- bind_cols(
  Default,
  posterior_prob_default = lda_posterior[, 2]
) %>%
  mutate(
    pred_default = ifelse(posterior_prob_default > 0.2, "Yes", "No")
  )
lda_pred_20 %>%
  count(pred_default, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  # Use grand_summary_rows() instead of summary_rows()
  grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9432 | 138 |  9570
Yes       |  235 | 195 |   430
Total     | 9667 | 333 | 10000
posterior_prob_default <- lda_pred_20$posterior_prob_default
lda_roc <-
  yardstick::roc_curve(
    lda_pred_20,
    # Specify the class probability and the truth variables
    posterior_prob_default, truth = default,
    # Treat the second factor level ("Yes") as the positive class;
    # without this, yardstick treats "No" as the event and the ROC curve is flipped
    event_level = "second"
  )
autoplot(lda_roc)
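yardstick computes the curve exactly, at every unique predicted probability; as a rough illustration of what it is doing, here is a manual version over a coarse grid of thresholds (the 0.01 grid step is an arbitrary choice):

thresholds <- seq(0, 1, by = 0.01)
roc_manual <- tibble(
  threshold = thresholds,
  # True positive rate: fraction of true defaulters with posterior above the threshold
  tpr = sapply(thresholds, function(t) {
    mean(lda_pred_20$posterior_prob_default[lda_pred_20$default == "Yes"] > t)
  }),
  # False positive rate: fraction of non-defaulters with posterior above the threshold
  fpr = sapply(thresholds, function(t) {
    mean(lda_pred_20$posterior_prob_default[lda_pred_20$default == "No"] > t)
  })
)
ggplot(roc_manual, aes(fpr, tpr)) +
  geom_line() +
  geom_abline(lty = 3)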
yardstick::roc_auc(
  lda_pred_20,
  posterior_prob_default, truth = default,
  event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.950
glm_default_balance_student <-
  glm(default ~ balance + student,
      data = Default, family = binomial)
glm_pred <- bind_cols(
  Default,
  glm_prob_default = predict(
    glm_default_balance_student,
    newdata = Default, type = "response"
  )
)
yardstick::roc_auc(
  data = glm_pred,
  truth = default,
  glm_prob_default, # No `estimate =`, just the column name
  event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.950
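The AUC is essentially identical to LDA's, which is expected since both models use the same two predictors. To compare them visually, the two ROC curves can be overlaid (a sketch reusing lda_roc from above):

glm_roc <- yardstick::roc_curve(
  glm_pred, glm_prob_default, truth = default, event_level = "second"
)
bind_rows(
  mutate(lda_roc, model = "LDA"),
  mutate(glm_roc, model = "Logistic regression")
) %>%
  ggplot(aes(1 - specificity, sensitivity, color = model)) +
  geom_path() +
  geom_abline(lty = 3)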
4.4.4 Naive Bayes
nb_default <-
  klaR::NaiveBayes(default ~ balance + student, data = Default)
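Naive Bayes assumes the predictors are independent within each class, so f_k(x) factors into f_k(balance) * f_k(student) and the posterior is proportional to pi_k * f_k(x). A by-hand sketch of that calculation for one hypothetical observation, a student with a balance of 1500; this assumes a Gaussian density for balance within each class (klaR's default for numeric predictors, though kernel estimates are also possible):

prior <- table(Default$default) / nrow(Default)
class_stats <- Default %>%
  group_by(default) %>%
  summarise(
    mean_balance = mean(balance),
    sd_balance = sd(balance),
    p_student = mean(student == "Yes"),
    .groups = "drop"
  )
x_balance <- 1500
x_student <- "Yes"
p_stud <- if (x_student == "Yes") class_stats$p_student else 1 - class_stats$p_student
# Unnormalized posterior: prior * f(balance | class) * P(student | class)
unnorm <- as.numeric(prior) *
  dnorm(x_balance, class_stats$mean_balance, class_stats$sd_balance) *
  p_stud
setNames(unnorm / sum(unnorm), levels(Default$default))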
nb_pred <- bind_cols(
  Default["default"],  # Keep only the true default labels
  nb_prob_default = predict(nb_default, newdata = Default)$posterior[, 2]
)
nb_pred <- nb_pred %>%
  mutate(
    pred_default_0.5 = ifelse(nb_prob_default > 0.5, "Yes", "No"),
    pred_default_0.2 = ifelse(nb_prob_default > 0.2, "Yes", "No")
  )
nb_pred %>%
  count(pred_default_0.5, default) %>%
  pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.5") %>%
  tab_spanner(label = "True default status", columns = c(No, Yes, Total)) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(
    columns = c(No, Yes, Total),
    fns = list(Total = ~round(sum(.), 0))
  )
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9621 | 244 |  9865
Yes       |   46 |  89 |   135
Total     | 9667 | 333 | 10000
nb_pred %>%
  count(pred_default_0.2, default) %>%
  pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.2") %>%
  tab_spanner(label = "True default status", columns = c(No, Yes, Total)) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(
    columns = c(No, Yes, Total),
    fns = list(Total = ~round(sum(.), 0))
  )
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9339 | 130 |  9469
Yes       |  328 | 203 |   531
Total     | 9667 | 333 | 10000
nb_pred %>%
  select(default, pred_default_0.2, pred_default_0.5) %>%
  pivot_longer(c(pred_default_0.5, pred_default_0.2),
               names_to = "threshold", values_to = "pred_default") %>%
  mutate(threshold = as.numeric(str_remove(threshold, "pred_default_"))) %>%
  group_by(threshold) %>%
  summarise(
    overall_error = mean(default != pred_default),
    sensitivity = sum(default == "Yes" & pred_default == "Yes") /
      sum(default == "Yes"),
    specificity = sum(default == "No" & pred_default == "No") /
      sum(default == "No"),
    .groups = "drop"
  ) %>%
  mutate(across(everything(), scales::percent)) %>%
  gt()
threshold | overall_error | sensitivity | specificity
----------|---------------|-------------|------------
20%       | 4.6%          | 61%         | 96.6%
50%       | 2.9%          | 27%         | 99.5%