This report fits the three logistic regression models described in
Tables 4.1–4.3 of An Introduction to Statistical
Learning (James et al.) on the Default dataset, and
compares their predictive performance via ROC curves and AUC values.
| Model | Predictors | Reference |
|---|---|---|
| M1 | balance |
Table 4.1 |
| M2 | student |
Table 4.2 |
| M3 | balance + income +
student |
Table 4.3 |
needed <- c("pROC", "ggplot2", "dplyr", "scales")
for (pkg in needed) {
if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
}
library(pROC)
library(ggplot2)
library(dplyr)
library(scales)
df <- read.csv("Default.csv", stringsAsFactors = TRUE)
df$y <- as.integer(df$default == "Yes")
dplyr::glimpse(df)## Rows: 10,000
## Columns: 6
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…
## $ y <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
##
## No Yes
## 9667 333
fit1 <- glm(y ~ balance, data = df, family = binomial)
fit2 <- glm(y ~ student, data = df, family = binomial)
fit3 <- glm(y ~ balance + income + student, data = df, family = binomial)
df$p1 <- predict(fit1, type = "response")
df$p2 <- predict(fit2, type = "response")
df$p3 <- predict(fit3, type = "response")##
## Call:
## glm(formula = y ~ balance, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.065e+01 3.612e-01 -29.49 <2e-16 ***
## balance 5.499e-03 2.204e-04 24.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1596.5 on 9998 degrees of freedom
## AIC: 1600.5
##
## Number of Fisher Scoring iterations: 8
##
## Call:
## glm(formula = y ~ student, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.50413 0.07071 -49.55 < 2e-16 ***
## studentYes 0.40489 0.11502 3.52 0.000431 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 2908.7 on 9998 degrees of freedom
## AIC: 2912.7
##
## Number of Fisher Scoring iterations: 6
##
## Call:
## glm(formula = y ~ balance + income + student, family = binomial,
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## income 3.033e-06 8.203e-06 0.370 0.71152
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
roc1 <- roc(df$y, df$p1, quiet = TRUE)
roc2 <- roc(df$y, df$p2, quiet = TRUE)
roc3 <- roc(df$y, df$p3, quiet = TRUE)
cat(sprintf(
"AUC | M1 (balance): %.4f | M2 (student): %.4f | M3 (full): %.4f\n",
auc(roc1), auc(roc2), auc(roc3)
))## AUC | M1 (balance): 0.9480 | M2 (student): 0.5450 | M3 (full): 0.9496
make_roc_df <- function(roc_obj, label) {
data.frame(
FPR = 1 - roc_obj$specificities,
TPR = roc_obj$sensitivities,
Model = label
)
}
roc_df <- bind_rows(
make_roc_df(roc1, sprintf("M1 – balance (AUC = %.3f)", auc(roc1))),
make_roc_df(roc2, sprintf("M2 – student (AUC = %.3f)", auc(roc2))),
make_roc_df(roc3, sprintf("M3 – full (AUC = %.3f)", auc(roc3)))
)
pal <- c("#E69F00", "#56B4E9", "#009E73")
ggplot(roc_df, aes(x = FPR, y = TPR, colour = Model)) +
geom_line(linewidth = 1.1) +
geom_abline(slope = 1, intercept = 0,
linetype = "dashed", colour = "grey60", linewidth = 0.7) +
scale_colour_manual(values = pal) +
scale_x_continuous(labels = percent_format(accuracy = 1),
limits = c(0, 1), expand = c(0.01, 0)) +
scale_y_continuous(labels = percent_format(accuracy = 1),
limits = c(0, 1), expand = c(0.01, 0)) +
labs(
title = "ROC Curves – Credit Default Models",
subtitle = "ISLR Tables 4.1 (M1), 4.2 (M2), 4.3 (M3)",
x = "False Positive Rate (1 – Specificity)",
y = "True Positive Rate (Sensitivity)",
colour = NULL,
caption = "Default dataset | n = 10,000 | pROC + ggplot2"
) +
theme_bw(base_size = 13) +
theme(
legend.position = c(0.72, 0.18),
legend.background = element_rect(fill = alpha("white", 0.85),
colour = "grey80"),
legend.key.width = unit(1.5, "cm"),
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(colour = "grey40"),
plot.caption = element_text(colour = "grey50", size = 9)
)auc_tbl <- data.frame(
Model = c("M1 – balance only",
"M2 – student only",
"M3 – balance + income + student"),
Reference = c("Table 4.1", "Table 4.2", "Table 4.3"),
AUC = round(c(auc(roc1), auc(roc2), auc(roc3)), 4)
)
knitr::kable(auc_tbl,
caption = "Area Under the ROC Curve by model",
align = c("l", "c", "c"))| Model | Reference | AUC |
|---|---|---|
| M1 – balance only | Table 4.1 | 0.9480 |
| M2 – student only | Table 4.2 | 0.5450 |
| M3 – balance + income + student | Table 4.3 | 0.9496 |
## M1 vs M2:
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc2
## Z = 29.77, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
## 0.3764578 0.4295216
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.9479785 0.5449888
##
## M1 vs M3:
##
## DeLong's test for two correlated ROC curves
##
## data: roc1 and roc3
## Z = -1.8196, p-value = 0.06882
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
## -0.0032811284 0.0001218711
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.9479785 0.9495581
##
## M2 vs M3:
##
## DeLong's test for two correlated ROC curves
##
## data: roc2 and roc3
## Z = -28.811, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
## -0.4320911 -0.3770475
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.5449888 0.9495581
M1 (balance only) achieves the highest AUC among the single-predictor models, confirming that outstanding balance is the dominant signal for default risk.
M2 (student only) performs near random chance (AUC ≈ 0.50–0.60), because raw student status is a very weak predictor once balance and income are not controlled for.
M3 (full model) yields the best discrimination.
The small incremental gain over M1 suggests that most of the predictive
power is already captured by balance; nonetheless the
DeLong test indicates whether the improvement is statistically
significant.
Rendered with R 4.5.1 — source data:
Default.csv