Overview

This document fits the three logistic regression models from ISLR Tables 4.1–4.3 on the Default dataset and plots their ROC curves for comparison.

Table Model Predictors
4.1 Simple logistic regression balance only
4.2 Simple logistic regression student only
4.3 Multiple logistic regression balance + income + student

1. Load Libraries & Data

# Install pROC if not available
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")

library(ggplot2)
library(pROC)

# Load data
Default <- read.csv("Default.csv")

# Convert binary variables to factors
Default$default  <- factor(Default$default,  levels = c("No", "Yes"))
Default$student  <- factor(Default$student,  levels = c("No", "Yes"))

cat("Dataset dimensions:", nrow(Default), "rows x", ncol(Default), "columns\n")
## Dataset dimensions: 10000 rows x 5 columns
cat("Default distribution:\n")
## Default distribution:
print(table(Default$default))
## 
##   No  Yes 
## 9667  333

2. Fit Logistic Regression Models

# Table 4.1 — balance only
model_4_1 <- glm(default ~ balance, data = Default, family = binomial)

# Table 4.2 — student only
model_4_2 <- glm(default ~ student, data = Default, family = binomial)

# Table 4.3 — balance + income + student (multiple logistic regression)
model_4_3 <- glm(default ~ balance + income + student,
                 data = Default, family = binomial)

# Summary of Table 4.3 (full model)
summary(model_4_3)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

3. Generate Predicted Probabilities

prob_4_1 <- predict(model_4_1, type = "response")
prob_4_2 <- predict(model_4_2, type = "response")
prob_4_3 <- predict(model_4_3, type = "response")

4. Compute ROC Objects

roc_4_1 <- roc(Default$default, prob_4_1, levels = c("No", "Yes"), direction = "<")
roc_4_2 <- roc(Default$default, prob_4_2, levels = c("No", "Yes"), direction = "<")
roc_4_3 <- roc(Default$default, prob_4_3, levels = c("No", "Yes"), direction = "<")

cat(sprintf("AUC — Table 4.1 (balance only):             %.4f\n", auc(roc_4_1)))
## AUC — Table 4.1 (balance only):             0.9480
cat(sprintf("AUC — Table 4.2 (student only):             %.4f\n", auc(roc_4_2)))
## AUC — Table 4.2 (student only):             0.5450
cat(sprintf("AUC — Table 4.3 (balance+income+student):   %.4f\n", auc(roc_4_3)))
## AUC — Table 4.3 (balance+income+student):   0.9496

5. ROC Curves — Individual Plots

Table 4.1: Balance Only

plot(roc_4_1,
     col        = "#2196F3",
     lwd        = 2.5,
     main       = "Table 4.1 — ROC Curve: balance only",
     legacy.axes = TRUE,
     xlab       = "False Positive Rate (1 - Specificity)",
     ylab       = "True Positive Rate (Sensitivity)")
abline(a = 0, b = 1, lty = 2, col = "grey50")
legend("bottomright",
       legend = sprintf("AUC = %.4f", auc(roc_4_1)),
       col = "#2196F3", lwd = 2.5, bty = "n")

Table 4.2: Student Only

plot(roc_4_2,
     col        = "#FF5722",
     lwd        = 2.5,
     main       = "Table 4.2 — ROC Curve: student only",
     legacy.axes = TRUE,
     xlab       = "False Positive Rate (1 - Specificity)",
     ylab       = "True Positive Rate (Sensitivity)")
abline(a = 0, b = 1, lty = 2, col = "grey50")
legend("bottomright",
       legend = sprintf("AUC = %.4f", auc(roc_4_2)),
       col = "#FF5722", lwd = 2.5, bty = "n")

Table 4.3: Multiple Logistic Regression

plot(roc_4_3,
     col        = "#4CAF50",
     lwd        = 2.5,
     main       = "Table 4.3 — ROC Curve: balance + income + student",
     legacy.axes = TRUE,
     xlab       = "False Positive Rate (1 - Specificity)",
     ylab       = "True Positive Rate (Sensitivity)")
abline(a = 0, b = 1, lty = 2, col = "grey50")
legend("bottomright",
       legend = sprintf("AUC = %.4f", auc(roc_4_3)),
       col = "#4CAF50", lwd = 2.5, bty = "n")


6. Combined ROC Curve — All Three Models

plot(roc_4_1,
     col        = "#2196F3",
     lwd        = 2.5,
     legacy.axes = TRUE,
     main       = "ROC Curves — Default Dataset\nTables 4.1, 4.2 & 4.3 (ISLR)",
     xlab       = "False Positive Rate (1 - Specificity)",
     ylab       = "True Positive Rate (Sensitivity)")

plot(roc_4_2, col = "#FF5722", lwd = 2.5, add = TRUE)
plot(roc_4_3, col = "#4CAF50", lwd = 2.5, add = TRUE)

abline(a = 0, b = 1, lty = 2, col = "grey50", lwd = 1.5)

legend("bottomright",
       legend = c(
         sprintf("Table 4.1 — balance only         (AUC = %.4f)", auc(roc_4_1)),
         sprintf("Table 4.2 — student only         (AUC = %.4f)", auc(roc_4_2)),
         sprintf("Table 4.3 — balance+income+student (AUC = %.4f)", auc(roc_4_3)),
         "Random Classifier"
       ),
       col  = c("#2196F3", "#FF5722", "#4CAF50", "grey50"),
       lwd  = c(2.5, 2.5, 2.5, 1.5),
       lty  = c(1, 1, 1, 2),
       bty  = "n",
       cex  = 0.85)


7. ggplot2 Version — Combined ROC Curve

# Build a tidy data frame from each ROC object
roc_to_df <- function(roc_obj, label) {
  data.frame(
    FPR   = 1 - roc_obj$specificities,
    TPR   = roc_obj$sensitivities,
    Model = label
  )
}

df_roc <- rbind(
  roc_to_df(roc_4_1, sprintf("Table 4.1 — balance only (AUC=%.3f)",         auc(roc_4_1))),
  roc_to_df(roc_4_2, sprintf("Table 4.2 — student only (AUC=%.3f)",         auc(roc_4_2))),
  roc_to_df(roc_4_3, sprintf("Table 4.3 — balance+income+student (AUC=%.3f)", auc(roc_4_3)))
)

ggplot(df_roc, aes(x = FPR, y = TPR, colour = Model)) +
  geom_line(linewidth = 1.2) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "grey50", linewidth = 0.8) +
  scale_colour_manual(values = c("#2196F3", "#FF5722", "#4CAF50")) +
  labs(
    title    = "ROC Curves — Default Dataset",
    subtitle = "Logistic Regression Models from ISLR Tables 4.1, 4.2 & 4.3",
    x        = "False Positive Rate (1 - Specificity)",
    y        = "True Positive Rate (Sensitivity)",
    colour   = NULL
  ) +
  theme_bw(base_size = 13) +
  theme(
    legend.position  = c(0.62, 0.18),
    legend.background = element_rect(fill = "white", colour = "grey80"),
    legend.key.width  = unit(1.5, "cm"),
    plot.title       = element_text(face = "bold", size = 15),
    plot.subtitle    = element_text(colour = "grey40")
  ) +
  annotate("text", x = 0.65, y = 0.07,
           label = "Dashed line = random classifier",
           colour = "grey50", size = 3.5)


8. AUC Summary Table

auc_table <- data.frame(
  Table     = c("4.1", "4.2", "4.3"),
  Model     = c("balance only",
                "student only",
                "balance + income + student"),
  AUC       = round(c(auc(roc_4_1), auc(roc_4_2), auc(roc_4_3)), 4),
  Interpretation = c("Excellent discrimination",
                     "Near random — student alone is weak",
                     "Best model — highest AUC")
)

knitr::kable(auc_table,
             col.names = c("Table", "Predictors", "AUC", "Interpretation"),
             align     = c("c", "l", "c", "l"),
             caption   = "AUC Summary for the Three Logistic Regression Models")
AUC Summary for the Three Logistic Regression Models
Table Predictors AUC Interpretation
4.1 balance only 0.9480 Excellent discrimination
4.2 student only 0.5450 Near random — student alone is weak
4.3 balance + income + student 0.9496 Best model — highest AUC

9. Interpretation

  • Table 4.1 (balance only): The balance predictor alone is a very strong discriminator for credit default. Its high AUC reflects that outstanding balance is the dominant signal.
  • Table 4.2 (student only): Student status alone performs near random. This is consistent with the ISLR discussion — the bivariate relationship is confounded by balance.
  • Table 4.3 (balance + income + student): The full model achieves the highest AUC. Adding income and student to balance provides marginal improvement over using balance alone, but the gain is modest because balance captures most of the variation.

Key insight from ISLR Ch.4: The student variable reverses sign (confounding) when balance is included in the model — a classic example of why multiple logistic regression is preferable to simple comparisons.