Overview

This report fits three logistic regression models on the Default dataset (Tables 4.1, 4.2, and 4.3 from ISLR Chapter 4) and plots their ROC curves.

Model Predictors Reference
Model 1 balance Table 4.1
Model 2 student Table 4.2
Model 3 balance + income + student Table 4.3

Load Packages

library(tidymodels)
library(tidyverse)

Load Data

default_data <- read_csv("Default.csv")

# Ensure 'default' and 'student' are factors
default_data <- default_data %>%
  mutate(
    default = factor(default, levels = c("No", "Yes")),
    student = factor(student, levels = c("No", "Yes"))
  )

glimpse(default_data)
## Rows: 10,000
## Columns: 5
## $ ...1    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income  <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…

Model Specification

All three models use logistic regression with the glm engine.

lr_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

Fit the Three Models

Model 1 — Table 4.1: Predict default using balance

fit_balance <- lr_spec %>%
  fit(default ~ balance, data = default_data)

tidy(fit_balance)
## # A tibble: 2 × 5
##   term         estimate std.error statistic   p.value
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept) -10.7      0.361        -29.5 3.62e-191
## 2 balance       0.00550  0.000220      25.0 1.98e-137

Model 2 — Table 4.2: Predict default using student

fit_student <- lr_spec %>%
  fit(default ~ student, data = default_data)

tidy(fit_student)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -3.50     0.0707    -49.6  0       
## 2 studentYes     0.405    0.115       3.52 0.000431

Model 3 — Table 4.3: Predict default using balance + income + student

fit_all <- lr_spec %>%
  fit(default ~ balance + income + student, data = default_data)

tidy(fit_all)
## # A tibble: 4 × 5
##   term            estimate  std.error statistic   p.value
##   <chr>              <dbl>      <dbl>     <dbl>     <dbl>
## 1 (Intercept) -10.9        0.492        -22.1   4.91e-108
## 2 balance       0.00574    0.000232      24.7   4.22e-135
## 3 income        0.00000303 0.00000820     0.370 7.12e-  1
## 4 studentYes   -0.647      0.236         -2.74  6.19e-  3

ROC Curves

We use augment() to get predicted probabilities for default = "Yes", then compute the ROC curve with roc_curve().

# Get predicted probabilities from each model
roc_balance <- augment(fit_balance, new_data = default_data) %>%
  roc_curve(truth = default, .pred_Yes, event_level = "second") %>%
  mutate(Model = "Table 4.1: balance only")

roc_student <- augment(fit_student, new_data = default_data) %>%
  roc_curve(truth = default, .pred_Yes, event_level = "second") %>%
  mutate(Model = "Table 4.2: student only")

roc_all <- augment(fit_all, new_data = default_data) %>%
  roc_curve(truth = default, .pred_Yes, event_level = "second") %>%
  mutate(Model = "Table 4.3: balance + income + student")

Individual ROC Curves

Table 4.1 — balance only

roc_balance %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = "steelblue", linewidth = 1.2) +
  geom_abline(lty = 2, color = "gray50") +
  coord_equal() +
  labs(
    title    = "ROC Curve — Table 4.1: balance only",
    x        = "False Positive Rate (1 - Specificity)",
    y        = "True Positive Rate (Sensitivity)"
  ) +
  theme_bw()

Table 4.2 — student only

roc_student %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = "darkorange", linewidth = 1.2) +
  geom_abline(lty = 2, color = "gray50") +
  coord_equal() +
  labs(
    title    = "ROC Curve — Table 4.2: student only",
    x        = "False Positive Rate (1 - Specificity)",
    y        = "True Positive Rate (Sensitivity)"
  ) +
  theme_bw()

Table 4.3 — balance + income + student

roc_all %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = "forestgreen", linewidth = 1.2) +
  geom_abline(lty = 2, color = "gray50") +
  coord_equal() +
  labs(
    title    = "ROC Curve — Table 4.3: balance + income + student",
    x        = "False Positive Rate (1 - Specificity)",
    y        = "True Positive Rate (Sensitivity)"
  ) +
  theme_bw()

All Three Models on One Plot

bind_rows(roc_balance, roc_student, roc_all) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = Model)) +
  geom_path(linewidth = 1.2) +
  geom_abline(lty = 2, color = "gray50") +
  coord_equal() +
  scale_color_manual(values = c(
    "Table 4.1: balance only"               = "steelblue",
    "Table 4.2: student only"               = "darkorange",
    "Table 4.3: balance + income + student" = "forestgreen"
  )) +
  labs(
    title    = "ROC Curves — Default Dataset (Tables 4.1, 4.2, 4.3)",
    x        = "False Positive Rate (1 - Specificity)",
    y        = "True Positive Rate (Sensitivity)",
    color    = "Model"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")


AUC (Area Under the Curve)

A higher AUC indicates better discriminatory ability. AUC = 0.5 means the model is no better than random chance.

auc_balance <- augment(fit_balance, new_data = default_data) %>%
  roc_auc(truth = default, .pred_Yes, event_level = "second")

auc_student <- augment(fit_student, new_data = default_data) %>%
  roc_auc(truth = default, .pred_Yes, event_level = "second")

auc_all <- augment(fit_all, new_data = default_data) %>%
  roc_auc(truth = default, .pred_Yes, event_level = "second")

tibble(
  Model = c(
    "Table 4.1: balance only",
    "Table 4.2: student only",
    "Table 4.3: balance + income + student"
  ),
  AUC = c(
    auc_balance$.estimate,
    auc_student$.estimate,
    auc_all$.estimate
  )
) %>%
  arrange(desc(AUC)) %>%
  knitr::kable(digits = 4, caption = "AUC for each logistic regression model")
AUC for each logistic regression model
Model AUC
Table 4.3: balance + income + student 0.9496
Table 4.1: balance only 0.9480
Table 4.2: student only 0.5450

Summary

  • Table 4.1 (balance only) achieves the highest AUC among the single-predictor models, reflecting that credit card balance is a strong predictor of default.
  • Table 4.2 (student only) performs near random chance — student status alone is a weak predictor.
  • Table 4.3 (balance + income + student) achieves the best overall AUC by combining all three predictors, capturing additional nuance that neither balance nor student status alone can provide.