Overview

This report fits the three logistic regression models described in Tables 4.1–4.3 of An Introduction to Statistical Learning (James et al.) on the Default dataset, and compares their predictive performance via ROC curves and AUC values.

Model Predictors Reference
M1 balance Table 4.1
M2 student Table 4.2
M3 balance + income + student Table 4.3

1. Load Data & Packages

needed <- c("pROC", "ggplot2", "dplyr", "scales")
for (pkg in needed) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
}

library(pROC)
library(ggplot2)
library(dplyr)
library(scales)

df <- read.csv("Default.csv", stringsAsFactors = TRUE)

df$y <- as.integer(df$default == "Yes")

dplyr::glimpse(df)
## Rows: 10,000
## Columns: 6
## $ X       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income  <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…
## $ y       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
table(df$default)
## 
##   No  Yes 
## 9667  333

2. Fit the Three Models

fit1 <- glm(y ~ balance, data = df, family = binomial)

fit2 <- glm(y ~ student, data = df, family = binomial)

fit3 <- glm(y ~ balance + income + student, data = df, family = binomial)

df$p1 <- predict(fit1, type = "response")
df$p2 <- predict(fit2, type = "response")
df$p3 <- predict(fit3, type = "response")

Model summaries

summary(fit1)
## 
## Call:
## glm(formula = y ~ balance, family = binomial, data = df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8
summary(fit2)
## 
## Call:
## glm(formula = y ~ student, family = binomial, data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.50413    0.07071  -49.55  < 2e-16 ***
## studentYes   0.40489    0.11502    3.52 0.000431 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2912.7
## 
## Number of Fisher Scoring iterations: 6
summary(fit3)
## 
## Call:
## glm(formula = y ~ balance + income + student, family = binomial, 
##     data = df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

3. Compute ROC Objects

roc1 <- roc(df$y, df$p1, quiet = TRUE)
roc2 <- roc(df$y, df$p2, quiet = TRUE)
roc3 <- roc(df$y, df$p3, quiet = TRUE)

cat(sprintf(
  "AUC  |  M1 (balance): %.4f  |  M2 (student): %.4f  |  M3 (full): %.4f\n",
  auc(roc1), auc(roc2), auc(roc3)
))
## AUC  |  M1 (balance): 0.9480  |  M2 (student): 0.5450  |  M3 (full): 0.9496

4. ROC Curve Plot

make_roc_df <- function(roc_obj, label) {
  data.frame(
    FPR   = 1 - roc_obj$specificities,
    TPR   = roc_obj$sensitivities,
    Model = label
  )
}

roc_df <- bind_rows(
  make_roc_df(roc1, sprintf("M1 – balance  (AUC = %.3f)", auc(roc1))),
  make_roc_df(roc2, sprintf("M2 – student  (AUC = %.3f)", auc(roc2))),
  make_roc_df(roc3, sprintf("M3 – full     (AUC = %.3f)", auc(roc3)))
)

pal <- c("#E69F00", "#56B4E9", "#009E73")

ggplot(roc_df, aes(x = FPR, y = TPR, colour = Model)) +
  geom_line(linewidth = 1.1) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "grey60", linewidth = 0.7) +
  scale_colour_manual(values = pal) +
  scale_x_continuous(labels = percent_format(accuracy = 1),
                     limits = c(0, 1), expand = c(0.01, 0)) +
  scale_y_continuous(labels = percent_format(accuracy = 1),
                     limits = c(0, 1), expand = c(0.01, 0)) +
  labs(
    title    = "ROC Curves – Credit Default Models",
    subtitle = "ISLR Tables 4.1 (M1), 4.2 (M2), 4.3 (M3)",
    x        = "False Positive Rate (1 – Specificity)",
    y        = "True Positive Rate (Sensitivity)",
    colour   = NULL,
    caption  = "Default dataset  |  n = 10,000  |  pROC + ggplot2"
  ) +
  theme_bw(base_size = 13) +
  theme(
    legend.position   = c(0.72, 0.18),
    legend.background = element_rect(fill = alpha("white", 0.85),
                                     colour = "grey80"),
    legend.key.width  = unit(1.5, "cm"),
    plot.title        = element_text(face = "bold"),
    plot.subtitle     = element_text(colour = "grey40"),
    plot.caption      = element_text(colour = "grey50", size = 9)
  )

ROC curves for three logistic regression models on the Default dataset


5. AUC Comparison Table

auc_tbl <- data.frame(
  Model     = c("M1 – balance only",
                "M2 – student only",
                "M3 – balance + income + student"),
  Reference = c("Table 4.1", "Table 4.2", "Table 4.3"),
  AUC       = round(c(auc(roc1), auc(roc2), auc(roc3)), 4)
)

knitr::kable(auc_tbl,
             caption = "Area Under the ROC Curve by model",
             align   = c("l", "c", "c"))
Area Under the ROC Curve by model
Model Reference AUC
M1 – balance only Table 4.1 0.9480
M2 – student only Table 4.2 0.5450
M3 – balance + income + student Table 4.3 0.9496

6. DeLong Test for AUC Differences

cat("M1 vs M2:\n"); print(roc.test(roc1, roc2))
## M1 vs M2:
## 
##  DeLong's test for two correlated ROC curves
## 
## data:  roc1 and roc2
## Z = 29.77, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
##  0.3764578 0.4295216
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.9479785   0.5449888
cat("\nM1 vs M3:\n"); print(roc.test(roc1, roc3))
## 
## M1 vs M3:
## 
##  DeLong's test for two correlated ROC curves
## 
## data:  roc1 and roc3
## Z = -1.8196, p-value = 0.06882
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
##  -0.0032811284  0.0001218711
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.9479785   0.9495581
cat("\nM2 vs M3:\n"); print(roc.test(roc2, roc3))
## 
## M2 vs M3:
## 
##  DeLong's test for two correlated ROC curves
## 
## data:  roc2 and roc3
## Z = -28.811, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
##  -0.4320911 -0.3770475
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.5449888   0.9495581

7. Interpretation

  • M1 (balance only) achieves the highest AUC among the single-predictor models, confirming that outstanding balance is the dominant signal for default risk.

  • M2 (student only) performs near random chance (AUC ≈ 0.50–0.60), because raw student status is a very weak predictor once balance and income are not controlled for.

  • M3 (full model) yields the best discrimination. The small incremental gain over M1 suggests that most of the predictive power is already captured by balance; nonetheless the DeLong test indicates whether the improvement is statistically significant.


Rendered with R 4.5.1 — source data: Default.csv