library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gt)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.2     ✔ tidyr        1.3.1
## ✔ infer        1.0.8     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ✔ recipes      1.3.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
## 
##     fixed
library(ggplot2)
library(tibble)
library(tidyr)

Table 4.4 - Confusion Matrix

nb_pred <- data.frame(nb_prob_default = runif(100), default = sample(c("Yes", "No"), 100, replace = TRUE))
nb_pred <- nb_pred %>%
  mutate(
    pred_default_0.5 = ifelse(nb_prob_default > 0.5, "Yes", "No"),
    pred_default_0.2 = ifelse(nb_prob_default > 0.2, "Yes", "No")
  )

table_4_4 <- nb_pred %>%
  count(pred_default_0.5, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.5") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
table_4_4
Predicted
True default status
No Yes Total
No 21 23 44
Yes 24 32 56
Total 45 55 100

#Figure 4.4 - Density Plot(additional practice)

mu1 <- -1.25
mu2 <- 1.25
sigma1 <- 1
sigma2 <- 1
bayes_boundary <- (mu1 + mu2) / 2
p1 <- ggplot(data = tibble(x = seq(-4, 4, 0.1)), aes(x)) +
  stat_function(fun = dnorm, args = list(mean = mu1, sd = sigma1),
                geom = "line", size = 1.5, color = "green") +
  stat_function(fun = dnorm, args = list(mean = mu2, sd = sigma2),
                geom = "line", size = 1.5, color = "purple") +
  geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
set.seed(42)
d <- tribble(
  ~class, ~x,
  1, rnorm(20, mean = mu1, sd = sigma1),
  2, rnorm(20, mean = mu2, sd = sigma2)
) %>%
  unnest(x)
lda_boundary <- (mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2

p2 <- d %>%
  ggplot(aes(x, fill = factor(class), color = factor(class))) +
  geom_histogram(bins = 13, alpha = 0.5, position = "identity") +
  geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5) +
  geom_vline(xintercept = lda_boundary, lty = 1, size = 1.5)
# Install and load the patchwork package
if (!require(patchwork)) install.packages("patchwork")
## Loading required package: patchwork
library(patchwork)
p1 + p2

# Table 4.5 - Confusion Matrix (Threshold 0.2)

table_4_5 <- nb_pred %>%
  count(pred_default_0.2, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.2") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
table_4_5
Predicted
True default status
No Yes Total
No 6 8 14
Yes 39 47 86
Total 45 55 100

Table 4.8 - Error Rate, Sensitivity, and Specificity

table_4_8 <- nb_pred %>%
  select(default, pred_default_0.2, pred_default_0.5) %>%
  pivot_longer(c(pred_default_0.5, pred_default_0.2),
              names_to = "threshold", values_to = "pred_default") %>%
  mutate(threshold = as.numeric(str_remove(threshold, "pred_default_"))) %>%
  group_by(threshold) %>%
  summarise(
    overall_error = mean(default != pred_default),
    sensitivity = sum(default == "Yes" & pred_default == "Yes") / sum(default == "Yes"),
    specificity = sum(default == "No" & pred_default == "No") / sum(default == "No")
  ) %>%
  mutate(across(everything(), scales::percent)) %>%
  gt()
table_4_8
threshold overall_error sensitivity specificity
20% 47% 85% 13%
50% 47% 58% 47%