library(tidymodels)
library(tidyverse)
library(yardstick)
Default <- read.csv("Default.csv")
Default$default <- factor(Default$default, levels = c("No", "Yes"))
Default$student <- factor(Default$student, levels = c("No", "Yes"))
glimpse(Default)
## Rows: 10,000
## Columns: 5
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…
log_spec <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
# Table 4.1: balance only
fit1 <- log_spec %>% fit(default ~ balance, data = Default)
# Table 4.2: student only
fit2 <- log_spec %>% fit(default ~ student, data = Default)
# Table 4.3: balance + income + student
fit3 <- log_spec %>% fit(default ~ balance + income + student, data = Default)
# Get predicted probabilities for each model
roc1 <- fit1 %>%
predict(Default, type = "prob") %>%
bind_cols(Default) %>%
roc_curve(truth = default, .pred_Yes, event_level = "second") %>%
mutate(model = "Model 1: balance")
roc2 <- fit2 %>%
predict(Default, type = "prob") %>%
bind_cols(Default) %>%
roc_curve(truth = default, .pred_Yes, event_level = "second") %>%
mutate(model = "Model 2: student")
roc3 <- fit3 %>%
predict(Default, type = "prob") %>%
bind_cols(Default) %>%
roc_curve(truth = default, .pred_Yes, event_level = "second") %>%
mutate(model = "Model 3: balance + income + student")
# Combine and plot
bind_rows(roc1, roc2, roc3) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
geom_line(linewidth = 1) +
geom_abline(linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("steelblue", "indianred2", "seagreen")) +
labs(
title = "ROC Curves for Default Dataset",
subtitle = "Models from Tables 4.1, 4.2, and 4.3",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
color = "Model"
) +
theme_minimal() +
theme(legend.position = "bottom")

# AUC for each model
auc1 <- fit1 %>%
predict(Default, type = "prob") %>%
bind_cols(Default) %>%
roc_auc(truth = default, .pred_Yes, event_level = "second")
auc2 <- fit2 %>%
predict(Default, type = "prob") %>%
bind_cols(Default) %>%
roc_auc(truth = default, .pred_Yes, event_level = "second")
auc3 <- fit3 %>%
predict(Default, type = "prob") %>%
bind_cols(Default) %>%
roc_auc(truth = default, .pred_Yes, event_level = "second")
bind_rows(
auc1 %>% mutate(model = "Model 1: balance"),
auc2 %>% mutate(model = "Model 2: student"),
auc3 %>% mutate(model = "Model 3: balance + income + student")
) %>% select(model, .estimate)
## # A tibble: 3 × 2
## model .estimate
## <chr> <dbl>
## 1 Model 1: balance 0.948
## 2 Model 2: student 0.545
## 3 Model 3: balance + income + student 0.950