4.4.1 Linear Discriminant Analysis for p = 1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(tidyr)
library(tibble)
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
td_colors <- list(
  nice = list(
    emerald = "#50C878",     # Emerald green
    opera_mauve = "#B784A7"  # Opera mauve
  )
)
mu1 <- -1.25
mu2 <- 1.25
sigma1 <- 1
sigma2 <- 1
bayes_boundary <- (mu1 + mu2) / 2
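With equal priors and a shared variance, the Bayes decision boundary is the point where the two class densities are equal, which here is the midpoint of the two means. A quick sanity check, using the mu1, mu2, and sigma values defined above:

# The two class densities should be (numerically) equal at the boundary
dnorm(bayes_boundary, mean = mu1, sd = sigma1)
dnorm(bayes_boundary, mean = mu2, sd = sigma2)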
p1 <- ggplot(data = tibble(x = seq(-4, 4, 0.1)), aes(x)) +
  stat_function(fun = dnorm, args = list(mean = mu1, sd = sigma1),
                geom = "line", linewidth = 1.5, color = td_colors$nice$emerald) +
  stat_function(fun = dnorm, args = list(mean = mu2, sd = sigma2),
                geom = "line", linewidth = 1.5, color = td_colors$nice$opera_mauve) +
  geom_vline(xintercept = bayes_boundary, lty = 2, linewidth = 1.5) +
  theme(
    axis.text.y = element_blank(),   # Remove y-axis text
    axis.ticks.y = element_blank(),  # Remove y-axis ticks
    axis.title.y = element_blank()   # Remove y-axis label
  )
set.seed(42)
d <- tribble(
  ~class, ~x,
  1, rnorm(20, mean = mu1, sd = sigma1),
  2, rnorm(20, mean = mu2, sd = sigma2)
) %>%
  unnest(x)
lda_boundary <-
(mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2
p2 <- d %>%
  ggplot(aes(x, fill = factor(class), color = factor(class))) +
  geom_histogram(bins = 13, alpha = 0.5, position = "identity") +
  geom_vline(xintercept = bayes_boundary, lty = 2, linewidth = 1.5) +
  geom_vline(xintercept = lda_boundary, lty = 1, linewidth = 1.5) +
  scale_fill_manual(values = c(td_colors$nice$emerald,
                               td_colors$nice$opera_mauve)) +
  scale_color_manual(values = c(td_colors$nice$emerald,
                                td_colors$nice$opera_mauve)) +
  theme(legend.position = "none")
library(patchwork)
p1 | p2
print(td_colors)
## $nice
## $nice$emerald
## [1] "#50C878"
##
## $nice$opera_mauve
## [1] "#B784A7"
4.4.2 Linear Discriminant Analysis for p > 1
d <- crossing(x1 = seq(-2, 2, 0.1), x2 = seq(-2, 2, 0.1))
d1 <- d %>%
  bind_cols(
    prob = mvtnorm::dmvnorm(
      x = as.matrix(d),
      mean = c(0, 0), sigma = matrix(c(1, 0, 0, 1), nrow = 2)
    )
  )
d2 <- d %>%
  bind_cols(
    prob = mvtnorm::dmvnorm(
      x = as.matrix(d),
      mean = c(0, 0), sigma = matrix(c(1, 0.7, 0.7, 1), nrow = 2)
    )
  )
p1 <- d1 %>%
  ggplot(aes(x = x1, y = x2)) +
  geom_tile(aes(fill = prob)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "none")
p2 <- d2 %>%
  ggplot(aes(x = x1, y = x2)) +
  geom_tile(aes(fill = prob)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "none")
p1 | p2
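For p > 1, LDA assumes every class shares the same covariance matrix, so the discriminant score delta_k(x) = x' Sigma^{-1} mu_k - (1/2) mu_k' Sigma^{-1} mu_k + log(pi_k) is linear in x and the boundary between two classes is a straight line. A minimal sketch of that boundary for two made-up classes; the means, priors, and shared covariance below are illustrative assumptions, not values from the text:

mu_a <- c(-1, -1)
mu_b <- c(1, 1)
sigma_shared <- matrix(c(1, 0.7, 0.7, 1), nrow = 2)
sigma_inv <- solve(sigma_shared)
# Linear discriminant score for a class with mean mu and prior probability `prior`
delta <- function(x, mu, prior) {
  drop(x %*% sigma_inv %*% mu) - drop(t(mu) %*% sigma_inv %*% mu) / 2 + log(prior)
}
crossing(x1 = seq(-2, 2, 0.1), x2 = seq(-2, 2, 0.1)) %>%
  mutate(
    class = ifelse(
      delta(cbind(x1, x2), mu_a, 0.5) > delta(cbind(x1, x2), mu_b, 0.5),
      "a", "b"
    )
  ) %>%
  ggplot(aes(x1, x2, fill = class)) +
  geom_tile(alpha = 0.5) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))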
data("Default")
lda_Default_balance_student <- MASS::lda(default ~ balance + student, data = Default)
lda_Default_balance_student
## Call:
## lda(default ~ balance + student, data = Default)
##
## Prior probabilities of groups:
## No Yes
## 0.9667 0.0333
##
## Group means:
## balance studentYes
## No 803.9438 0.2914037
## Yes 1747.8217 0.3813814
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002244397
## studentYes -0.249059498
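The prior probabilities reported above are simply the class proportions of default in the training data, and the group means are the per-class averages of each predictor. A quick check of the priors:

# Matches the 0.9667 / 0.0333 priors in the lda() output
table(Default$default) / nrow(Default)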
# Ensure dplyr's select() is used (MASS also exports a select() that masks it when attached)
select <- dplyr::select
mean(
  predict(lda_Default_balance_student, newdata = Default)$class != Default$default
)
## [1] 0.0275
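A 2.75% training error rate looks good, but only because defaults are rare: the trivial classifier that always predicts "No" is wrong just 3.33% of the time. For comparison:

# Error rate of the null classifier that always predicts "No"
mean(Default$default != "No")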
lda_pred <-
  bind_cols(
    pred_default = predict(lda_Default_balance_student,
                           newdata = Default)$class,
    Default
  )
library(gt)
lda_pred %>%
  count(pred_default, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default") %>%
  gt::tab_spanner(label = "True default status", columns = everything()) %>%
  gt::tab_stubhead("Predicted") %>%
  # Use grand_summary_rows() instead of summary_rows()
  gt::grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9644 | 252 |  9896
Yes       |   23 |  81 |   104
Total     | 9667 | 333 | 10000
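The confusion matrix shows that the low overall error hides poor sensitivity: only 81 of the 333 true defaulters (about 24%) are flagged, while specificity is near-perfect. Computing the class-specific rates directly from lda_pred:

lda_pred %>%
  summarise(
    sensitivity = sum(default == "Yes" & pred_default == "Yes") / sum(default == "Yes"),
    specificity = sum(default == "No" & pred_default == "No") / sum(default == "No")
  )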
lda_posterior <- predict(lda_Default_balance_student, newdata = Default)$posterior
head(lda_posterior)
## No Yes
## 1 0.9968680 0.003131975
## 2 0.9971925 0.002807531
## 3 0.9843970 0.015603046
## 4 0.9987769 0.001223133
## 5 0.9959254 0.004074582
## 6 0.9954627 0.004537289
lda_pred_20 <- bind_cols(
  Default,
  posterior_prob_default = lda_posterior[, 2]
) %>%
  mutate(
    pred_default = ifelse(posterior_prob_default > 0.2, "Yes", "No")
  )
lda_pred_20 %>%
  count(pred_default, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  # Use grand_summary_rows() instead of summary_rows()
  grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9432 | 138 |  9570
Yes       |  235 | 195 |   430
Total     | 9667 | 333 | 10000
posterior_prob_default <- lda_pred_20$posterior_prob_default
lda_roc <-
  yardstick::roc_curve(
    lda_pred_20,
    # Specify the class probability and the truth variables
    posterior_prob_default, truth = default,
    # Treat the second factor level ("Yes") as the positive class;
    # without this, yardstick treats "No" as the event and the ROC curve is flipped
    event_level = "second"
  )
autoplot(lda_roc)
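yardstick computes the curve exactly, at every unique predicted probability; as a rough illustration of what it is doing, here is a manual version over a coarse grid of thresholds (the 0.01 grid step is an arbitrary choice):

thresholds <- seq(0, 1, by = 0.01)
roc_manual <- tibble(
  threshold = thresholds,
  # True positive rate: fraction of true defaulters with posterior above the threshold
  tpr = sapply(thresholds, function(t) {
    mean(lda_pred_20$posterior_prob_default[lda_pred_20$default == "Yes"] > t)
  }),
  # False positive rate: fraction of non-defaulters with posterior above the threshold
  fpr = sapply(thresholds, function(t) {
    mean(lda_pred_20$posterior_prob_default[lda_pred_20$default == "No"] > t)
  })
)
ggplot(roc_manual, aes(fpr, tpr)) +
  geom_line() +
  geom_abline(lty = 3)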
yardstick::roc_auc(
  lda_pred_20,
  posterior_prob_default, truth = default,
  event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.950
glm_default_balance_student <-
  glm(default ~ balance + student,
      data = Default, family = binomial)
glm_pred <- bind_cols(
  Default,
  glm_prob_default = predict(
    glm_default_balance_student,
    newdata = Default, type = "response"
  )
)
yardstick::roc_auc(
  data = glm_pred,
  truth = default,
  glm_prob_default, # No `estimate =`, just the column name
  event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.950
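The AUC is essentially identical to LDA's, which is expected since both models use the same two predictors. To compare them visually, the two ROC curves can be overlaid (a sketch reusing lda_roc from above):

glm_roc <- yardstick::roc_curve(
  glm_pred, glm_prob_default, truth = default, event_level = "second"
)
bind_rows(
  mutate(lda_roc, model = "LDA"),
  mutate(glm_roc, model = "Logistic regression")
) %>%
  ggplot(aes(1 - specificity, sensitivity, color = model)) +
  geom_path() +
  geom_abline(lty = 3)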
4.4.4 Naive Bayes
nb_default <-
  klaR::NaiveBayes(default ~ balance + student, data = Default)
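Naive Bayes assumes the predictors are independent within each class, so f_k(x) factors into f_k(balance) * f_k(student) and the posterior is proportional to pi_k * f_k(x). A by-hand sketch of that calculation for one hypothetical observation, a student with a balance of 1500; this assumes a Gaussian density for balance within each class (klaR's default for numeric predictors, though kernel estimates are also possible):

prior <- table(Default$default) / nrow(Default)
class_stats <- Default %>%
  group_by(default) %>%
  summarise(
    mean_balance = mean(balance),
    sd_balance = sd(balance),
    p_student = mean(student == "Yes"),
    .groups = "drop"
  )
x_balance <- 1500
x_student <- "Yes"
p_stud <- if (x_student == "Yes") class_stats$p_student else 1 - class_stats$p_student
# Unnormalized posterior: prior * f(balance | class) * P(student | class)
unnorm <- as.numeric(prior) *
  dnorm(x_balance, class_stats$mean_balance, class_stats$sd_balance) *
  p_stud
setNames(unnorm / sum(unnorm), levels(Default$default))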
nb_pred <- bind_cols(
  Default["default"],  # Keep only the true default labels
  nb_prob_default = predict(nb_default, newdata = Default)$posterior[, 2]
)
nb_pred <- nb_pred %>%
  mutate(
    pred_default_0.5 = ifelse(nb_prob_default > 0.5, "Yes", "No"),
    pred_default_0.2 = ifelse(nb_prob_default > 0.2, "Yes", "No")
  )
nb_pred %>%
  count(pred_default_0.5, default) %>%
  pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.5") %>%
  tab_spanner(label = "True default status", columns = c(No, Yes, Total)) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(
    columns = c(No, Yes, Total),
    fns = list(Total = ~round(sum(.), 0))
  )
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9621 | 244 |  9865
Yes       |   46 |  89 |   135
Total     | 9667 | 333 | 10000
nb_pred %>%
  count(pred_default_0.2, default) %>%
  pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.2") %>%
  tab_spanner(label = "True default status", columns = c(No, Yes, Total)) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(
    columns = c(No, Yes, Total),
    fns = list(Total = ~round(sum(.), 0))
  )
(Columns give the true default status)

Predicted |   No | Yes | Total
----------|------|-----|------
No        | 9339 | 130 |  9469
Yes       |  328 | 203 |   531
Total     | 9667 | 333 | 10000
nb_pred %>%
  select(default, pred_default_0.2, pred_default_0.5) %>%
  pivot_longer(c(pred_default_0.5, pred_default_0.2),
               names_to = "threshold", values_to = "pred_default") %>%
  mutate(threshold = as.numeric(str_remove(threshold, "pred_default_"))) %>%
  group_by(threshold) %>%
  summarise(
    overall_error = mean(default != pred_default),
    sensitivity = sum(default == "Yes" & pred_default == "Yes") /
      sum(default == "Yes"),
    specificity = sum(default == "No" & pred_default == "No") /
      sum(default == "No"),
    .groups = "drop"
  ) %>%
  mutate(across(everything(), scales::percent)) %>%
  gt()
threshold | overall_error | sensitivity | specificity
----------|---------------|-------------|------------
20%       | 4.6%          | 61%         | 96.6%
50%       | 2.9%          | 27%         | 99.5%