library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gt)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.8 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ✔ recipes 1.3.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
##
## fixed
library(ggplot2)
library(tibble)
library(tidyr)
nb_pred <- data.frame(nb_prob_default = runif(100), default = sample(c("Yes", "No"), 100, replace = TRUE))
nb_pred <- nb_pred %>%
mutate(
pred_default_0.5 = ifelse(nb_prob_default > 0.5, "Yes", "No"),
pred_default_0.2 = ifelse(nb_prob_default > 0.2, "Yes", "No")
)
table_4_4 <- nb_pred %>%
count(pred_default_0.5, default) %>%
pivot_wider(names_from = default, values_from = n) %>%
mutate(Total = No + Yes) %>%
gt(rowname_col = "pred_default_0.5") %>%
tab_spanner(label = "True default status", columns = everything()) %>%
tab_stubhead("Predicted") %>%
grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
table_4_4
Predicted |
True default status
|
||
---|---|---|---|
No | Yes | Total | |
No | 21 | 23 | 44 |
Yes | 24 | 32 | 56 |
Total | 45 | 55 | 100 |
#Figure 4.4 - Density Plot(additional practice)
mu1 <- -1.25
mu2 <- 1.25
sigma1 <- 1
sigma2 <- 1
bayes_boundary <- (mu1 + mu2) / 2
p1 <- ggplot(data = tibble(x = seq(-4, 4, 0.1)), aes(x)) +
stat_function(fun = dnorm, args = list(mean = mu1, sd = sigma1),
geom = "line", size = 1.5, color = "green") +
stat_function(fun = dnorm, args = list(mean = mu2, sd = sigma2),
geom = "line", size = 1.5, color = "purple") +
geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
set.seed(42)
d <- tribble(
~class, ~x,
1, rnorm(20, mean = mu1, sd = sigma1),
2, rnorm(20, mean = mu2, sd = sigma2)
) %>%
unnest(x)
lda_boundary <- (mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2
p2 <- d %>%
ggplot(aes(x, fill = factor(class), color = factor(class))) +
geom_histogram(bins = 13, alpha = 0.5, position = "identity") +
geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5) +
geom_vline(xintercept = lda_boundary, lty = 1, size = 1.5)
# Install and load the patchwork package
if (!require(patchwork)) install.packages("patchwork")
## Loading required package: patchwork
library(patchwork)
p1 + p2
# Table 4.5 - Confusion Matrix (Threshold 0.2)
table_4_5 <- nb_pred %>%
count(pred_default_0.2, default) %>%
pivot_wider(names_from = default, values_from = n) %>%
mutate(Total = No + Yes) %>%
gt(rowname_col = "pred_default_0.2") %>%
tab_spanner(label = "True default status", columns = everything()) %>%
tab_stubhead("Predicted") %>%
grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
table_4_5
Predicted |
True default status
|
||
---|---|---|---|
No | Yes | Total | |
No | 6 | 8 | 14 |
Yes | 39 | 47 | 86 |
Total | 45 | 55 | 100 |
table_4_8 <- nb_pred %>%
select(default, pred_default_0.2, pred_default_0.5) %>%
pivot_longer(c(pred_default_0.5, pred_default_0.2),
names_to = "threshold", values_to = "pred_default") %>%
mutate(threshold = as.numeric(str_remove(threshold, "pred_default_"))) %>%
group_by(threshold) %>%
summarise(
overall_error = mean(default != pred_default),
sensitivity = sum(default == "Yes" & pred_default == "Yes") / sum(default == "Yes"),
specificity = sum(default == "No" & pred_default == "No") / sum(default == "No")
) %>%
mutate(across(everything(), scales::percent)) %>%
gt()
table_4_8
threshold | overall_error | sensitivity | specificity |
---|---|---|---|
20% | 47% | 85% | 13% |
50% | 47% | 58% | 47% |