ROC Curves for Default Dataset – Tables 4.1, 4.2 & 4.3

Overview

This document fits the three logistic regression models described in Tables 4.1, 4.2, and 4.3 of Introduction to Statistical Learning (ISLR) using the Default dataset, then plots their ROC curves and reports AUC values.

Table	Model	Predictors
4.1	Model 1	`balance` only
4.2	Model 2	`income` only
4.3	Model 3	`balance + income + student` (multiple logistic)

1. Load Packages and Data

# Install packages if needed:
# install.packages(c("pROC", "ggplot2", "dplyr"))

library(pROC)
library(ggplot2)
library(dplyr)

# Load data
default <- read.csv("Default.csv", row.names = 1)

# Convert response to factor
default$default <- factor(default$default, levels = c("No", "Yes"))
default$student <- factor(default$student, levels = c("No", "Yes"))

cat("Dimensions:", nrow(default), "x", ncol(default), "\n")

## Dimensions: 10000 x 4

cat("Default rate:", round(mean(default$default == "Yes") * 100, 2), "%\n")

## Default rate: 3.33 %

head(default)

##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559

2. Fit Logistic Regression Models

Table 4.1 – Model 1: Balance Only

model1 <- glm(default ~ balance, data = default, family = binomial)
summary(model1)

## 
## Call:
## glm(formula = default ~ balance, family = binomial, data = default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8

Table 4.2 – Model 2: Income Only

model2 <- glm(default ~ income, data = default, family = binomial)
summary(model2)

## 
## Call:
## glm(formula = default ~ income, family = binomial, data = default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.094e+00  1.463e-01 -21.156   <2e-16 ***
## income      -8.353e-06  4.207e-06  -1.985   0.0471 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2916.7  on 9998  degrees of freedom
## AIC: 2920.7
## 
## Number of Fisher Scoring iterations: 6

Table 4.3 – Model 3: Balance + Income + Student

model3 <- glm(default ~ balance + income + student, data = default, family = binomial)
summary(model3)

## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

3. Generate Predicted Probabilities

prob1 <- predict(model1, type = "response")
prob2 <- predict(model2, type = "response")
prob3 <- predict(model3, type = "response")

4. Compute ROC Objects

roc1 <- roc(default$default, prob1, levels = c("No", "Yes"), direction = "<")
roc2 <- roc(default$default, prob2, levels = c("No", "Yes"), direction = "<")
roc3 <- roc(default$default, prob3, levels = c("No", "Yes"), direction = "<")

cat(sprintf("AUC – Model 1 (balance):                    %.4f\n", auc(roc1)))

## AUC – Model 1 (balance):                    0.9480

cat(sprintf("AUC – Model 2 (income):                     %.4f\n", auc(roc2)))

## AUC – Model 2 (income):                     0.5327

cat(sprintf("AUC – Model 3 (balance+income+student):     %.4f\n", auc(roc3)))

## AUC – Model 3 (balance+income+student):     0.9496

5. ROC Curve Plot

# Convert each ROC object to a data frame
roc_to_df <- function(roc_obj, label) {
  data.frame(
    FPR   = 1 - roc_obj$specificities,
    TPR   = roc_obj$sensitivities,
    Model = label
  )
}

df_roc <- bind_rows(
  roc_to_df(roc1, sprintf("Model 1: balance          (AUC = %.4f)", auc(roc1))),
  roc_to_df(roc2, sprintf("Model 2: income           (AUC = %.4f)", auc(roc2))),
  roc_to_df(roc3, sprintf("Model 3: balance+income+student (AUC = %.4f)", auc(roc3)))
)

ggplot(df_roc, aes(x = FPR, y = TPR, colour = Model)) +
  geom_line(linewidth = 1.1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed",
              colour = "grey50", linewidth = 0.7) +
  scale_colour_manual(
    values = c("#E63946", "#457B9D", "#2A9D8F")
  ) +
  labs(
    title    = "ROC Curves – Default Dataset (Tables 4.1, 4.2, 4.3)",
    subtitle = "Logistic Regression Models",
    x        = "False Positive Rate (1 – Specificity)",
    y        = "True Positive Rate (Sensitivity)",
    colour   = "Model"
  ) +
  theme_bw(base_size = 13) +
  theme(
    plot.title      = element_text(face = "bold"),
    legend.position = c(0.62, 0.22),
    legend.background = element_rect(fill = "white", colour = "grey80"),
    legend.text = element_text(family = "mono", size = 9)
  ) +
  coord_equal()

6. AUC Comparison Table

auc_table <- data.frame(
  Table    = c("4.1", "4.2", "4.3"),
  Model    = c("balance only", "income only", "balance + income + student"),
  AUC      = round(c(auc(roc1), auc(roc2), auc(roc3)), 4)
)

knitr::kable(auc_table, align = "llr",
             caption = "AUC Summary for Each Logistic Regression Model")

AUC Summary for Each Logistic Regression Model
Table	Model	AUC
4.1	balance only	0.9480
4.2	income only	0.5327
4.3	balance + income + student	0.9496

7. Interpretation

Model 1 (balance) achieves the highest individual-predictor AUC, confirming that credit card balance is the dominant predictor of default.
Model 2 (income) yields near-random performance, consistent with the very small coefficient estimate seen in Table 4.2.
Model 3 (balance + income + student) combines all predictors and produces the best overall discrimination, illustrating that adding student status and income alongside balance provides incremental lift despite the confounding discussed in ISLR Section 4.3.

Note: These models are fit on the full dataset (no train/test split) to replicate the in-sample results reported in the textbook tables. ```