This document fits the three logistic regression models from
ISLR Tables 4.1–4.3 on the Default dataset
and plots their ROC curves for comparison.
| Table | Model | Predictors |
|---|---|---|
| 4.1 | Simple logistic regression | balance only |
| 4.2 | Simple logistic regression | student only |
| 4.3 | Multiple logistic regression | balance + income + student |
# Install pROC if not available
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library(ggplot2)
library(pROC)
# Load data
Default <- read.csv("Default.csv")
# Convert binary variables to factors
Default$default <- factor(Default$default, levels = c("No", "Yes"))
Default$student <- factor(Default$student, levels = c("No", "Yes"))
cat("Dataset dimensions:", nrow(Default), "rows x", ncol(Default), "columns\n")
## Dataset dimensions: 10000 rows x 5 columns
cat("Default distribution:\n")
## Default distribution:
print(table(Default$default))
##
## No Yes
## 9667 333
# Table 4.1 — balance only
model_4_1 <- glm(default ~ balance, data = Default, family = binomial)
# Table 4.2 — student only
model_4_2 <- glm(default ~ student, data = Default, family = binomial)
# Table 4.3 — balance + income + student (multiple logistic regression)
model_4_3 <- glm(default ~ balance + income + student,
data = Default, family = binomial)
# Summary of Table 4.3 (full model)
summary(model_4_3)
##
## Call:
## glm(formula = default ~ balance + income + student, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## income 3.033e-06 8.203e-06 0.370 0.71152
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
prob_4_1 <- predict(model_4_1, type = "response")
prob_4_2 <- predict(model_4_2, type = "response")
prob_4_3 <- predict(model_4_3, type = "response")
roc_4_1 <- roc(Default$default, prob_4_1, levels = c("No", "Yes"), direction = "<")
roc_4_2 <- roc(Default$default, prob_4_2, levels = c("No", "Yes"), direction = "<")
roc_4_3 <- roc(Default$default, prob_4_3, levels = c("No", "Yes"), direction = "<")
cat(sprintf("AUC — Table 4.1 (balance only): %.4f\n", auc(roc_4_1)))
## AUC — Table 4.1 (balance only): 0.9480
cat(sprintf("AUC — Table 4.2 (student only): %.4f\n", auc(roc_4_2)))
## AUC — Table 4.2 (student only): 0.5450
cat(sprintf("AUC — Table 4.3 (balance+income+student): %.4f\n", auc(roc_4_3)))
## AUC — Table 4.3 (balance+income+student): 0.9496
plot(roc_4_1,
col = "#2196F3",
lwd = 2.5,
main = "Table 4.1 — ROC Curve: balance only",
legacy.axes = TRUE,
xlab = "False Positive Rate (1 - Specificity)",
ylab = "True Positive Rate (Sensitivity)")
abline(a = 0, b = 1, lty = 2, col = "grey50")
legend("bottomright",
legend = sprintf("AUC = %.4f", auc(roc_4_1)),
col = "#2196F3", lwd = 2.5, bty = "n")
plot(roc_4_2,
col = "#FF5722",
lwd = 2.5,
main = "Table 4.2 — ROC Curve: student only",
legacy.axes = TRUE,
xlab = "False Positive Rate (1 - Specificity)",
ylab = "True Positive Rate (Sensitivity)")
abline(a = 0, b = 1, lty = 2, col = "grey50")
legend("bottomright",
legend = sprintf("AUC = %.4f", auc(roc_4_2)),
col = "#FF5722", lwd = 2.5, bty = "n")
plot(roc_4_3,
col = "#4CAF50",
lwd = 2.5,
main = "Table 4.3 — ROC Curve: balance + income + student",
legacy.axes = TRUE,
xlab = "False Positive Rate (1 - Specificity)",
ylab = "True Positive Rate (Sensitivity)")
abline(a = 0, b = 1, lty = 2, col = "grey50")
legend("bottomright",
legend = sprintf("AUC = %.4f", auc(roc_4_3)),
col = "#4CAF50", lwd = 2.5, bty = "n")
plot(roc_4_1,
col = "#2196F3",
lwd = 2.5,
legacy.axes = TRUE,
main = "ROC Curves — Default Dataset\nTables 4.1, 4.2 & 4.3 (ISLR)",
xlab = "False Positive Rate (1 - Specificity)",
ylab = "True Positive Rate (Sensitivity)")
plot(roc_4_2, col = "#FF5722", lwd = 2.5, add = TRUE)
plot(roc_4_3, col = "#4CAF50", lwd = 2.5, add = TRUE)
abline(a = 0, b = 1, lty = 2, col = "grey50", lwd = 1.5)
legend("bottomright",
legend = c(
sprintf("Table 4.1 — balance only (AUC = %.4f)", auc(roc_4_1)),
sprintf("Table 4.2 — student only (AUC = %.4f)", auc(roc_4_2)),
sprintf("Table 4.3 — balance+income+student (AUC = %.4f)", auc(roc_4_3)),
"Random Classifier"
),
col = c("#2196F3", "#FF5722", "#4CAF50", "grey50"),
lwd = c(2.5, 2.5, 2.5, 1.5),
lty = c(1, 1, 1, 2),
bty = "n",
cex = 0.85)
# Build a tidy data frame from each ROC object
roc_to_df <- function(roc_obj, label) {
data.frame(
FPR = 1 - roc_obj$specificities,
TPR = roc_obj$sensitivities,
Model = label
)
}
df_roc <- rbind(
roc_to_df(roc_4_1, sprintf("Table 4.1 — balance only (AUC=%.3f)", auc(roc_4_1))),
roc_to_df(roc_4_2, sprintf("Table 4.2 — student only (AUC=%.3f)", auc(roc_4_2))),
roc_to_df(roc_4_3, sprintf("Table 4.3 — balance+income+student (AUC=%.3f)", auc(roc_4_3)))
)
ggplot(df_roc, aes(x = FPR, y = TPR, colour = Model)) +
geom_line(linewidth = 1.2) +
geom_abline(slope = 1, intercept = 0,
linetype = "dashed", colour = "grey50", linewidth = 0.8) +
scale_colour_manual(values = c("#2196F3", "#FF5722", "#4CAF50")) +
labs(
title = "ROC Curves — Default Dataset",
subtitle = "Logistic Regression Models from ISLR Tables 4.1, 4.2 & 4.3",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
colour = NULL
) +
theme_bw(base_size = 13) +
theme(
legend.position = c(0.62, 0.18),
legend.background = element_rect(fill = "white", colour = "grey80"),
legend.key.width = unit(1.5, "cm"),
plot.title = element_text(face = "bold", size = 15),
plot.subtitle = element_text(colour = "grey40")
) +
annotate("text", x = 0.65, y = 0.07,
label = "Dashed line = random classifier",
colour = "grey50", size = 3.5)
auc_table <- data.frame(
Table = c("4.1", "4.2", "4.3"),
Model = c("balance only",
"student only",
"balance + income + student"),
AUC = round(c(auc(roc_4_1), auc(roc_4_2), auc(roc_4_3)), 4),
Interpretation = c("Excellent discrimination",
"Near random — student alone is weak",
"Best model — highest AUC")
)
knitr::kable(auc_table,
col.names = c("Table", "Predictors", "AUC", "Interpretation"),
align = c("c", "l", "c", "l"),
caption = "AUC Summary for the Three Logistic Regression Models")
| Table | Predictors | AUC | Interpretation |
|---|---|---|---|
| 4.1 | balance only | 0.9480 | Excellent discrimination |
| 4.2 | student only | 0.5450 | Near random — student alone is weak |
| 4.3 | balance + income + student | 0.9496 | Best model — highest AUC |
balance only): The
balance predictor alone is a very strong discriminator for
credit default. Its high AUC reflects that outstanding balance is the
dominant signal.student only): Student
status alone performs near random. This is consistent with the ISLR
discussion — the bivariate relationship is confounded by balance.balance + income + student): The full model achieves the
highest AUC. Adding income and student to
balance provides marginal improvement over using
balance alone, but the gain is modest because
balance captures most of the variation.Key insight from ISLR Ch.4: The student variable reverses sign (confounding) when
balanceis included in the model — a classic example of why multiple logistic regression is preferable to simple comparisons.