Overview

This document fits the three logistic regression models described in ISLR Tables 4.1, 4.2, and 4.3 on the Default dataset and plots their ROC curves together for comparison.

Table Model Predictors
4.1 Model 1 balance only
4.2 Model 2 income only
4.3 Model 3 balance + income + student

1. Load Libraries and Data

library(tidyverse)
library(pROC)

# Load data
default_df <- read.csv("Default.csv")

# Convert outcome to binary factor
default_df$default <- factor(default_df$default, levels = c("No", "Yes"))
default_df$student <- factor(default_df$student, levels = c("No", "Yes"))

glimpse(default_df)
## Rows: 10,000
## Columns: 5
## $ X       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income  <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…

2. Fit the Three Logistic Regression Models

# Table 4.1: balance only
model_41 <- glm(default ~ balance, data = default_df, family = binomial)

# Table 4.2: income only
model_42 <- glm(default ~ income, data = default_df, family = binomial)

# Table 4.3: balance + income + student
model_43 <- glm(default ~ balance + income + student, data = default_df, family = binomial)

summary(model_41)
## 
## Call:
## glm(formula = default ~ balance, family = binomial, data = default_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8
summary(model_42)
## 
## Call:
## glm(formula = default ~ income, family = binomial, data = default_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.094e+00  1.463e-01 -21.156   <2e-16 ***
## income      -8.353e-06  4.207e-06  -1.985   0.0471 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2916.7  on 9998  degrees of freedom
## AIC: 2920.7
## 
## Number of Fisher Scoring iterations: 6
summary(model_43)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = default_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

3. Generate Predicted Probabilities

pred_41 <- predict(model_41, type = "response")
pred_42 <- predict(model_42, type = "response")
pred_43 <- predict(model_43, type = "response")

4. Compute ROC Objects

roc_41 <- roc(default_df$default, pred_41, levels = c("No", "Yes"), direction = "<")
roc_42 <- roc(default_df$default, pred_42, levels = c("No", "Yes"), direction = "<")
roc_43 <- roc(default_df$default, pred_43, levels = c("No", "Yes"), direction = "<")

cat("AUC – Model 4.1 (balance):                  ", auc(roc_41), "\n")
## AUC – Model 4.1 (balance):                   0.9479785
cat("AUC – Model 4.2 (income):                   ", auc(roc_42), "\n")
## AUC – Model 4.2 (income):                    0.5326533
cat("AUC – Model 4.3 (balance+income+student):   ", auc(roc_43), "\n")
## AUC – Model 4.3 (balance+income+student):    0.9495581

5. Plot ROC Curves

# Build a combined data frame for ggplot
roc_to_df <- function(roc_obj, label) {
  data.frame(
    FPR   = 1 - roc_obj$specificities,
    TPR   = roc_obj$sensitivities,
    Model = label
  )
}

df_roc <- bind_rows(
  roc_to_df(roc_41, paste0("Table 4.1: balance  (AUC = ", round(auc(roc_41), 3), ")")),
  roc_to_df(roc_42, paste0("Table 4.2: income   (AUC = ", round(auc(roc_42), 3), ")")),
  roc_to_df(roc_43, paste0("Table 4.3: balance+income+student  (AUC = ", round(auc(roc_43), 3), ")"))
)

ggplot(df_roc, aes(x = FPR, y = TPR, color = Model)) +
  geom_line(linewidth = 1.1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
  scale_color_manual(values = c("#E74C3C", "#3498DB", "#2ECC71")) +
  labs(
    title    = "ROC Curves – Default Dataset",
    subtitle = "Logistic Regression Models from ISLR Tables 4.1, 4.2 & 4.3",
    x        = "False Positive Rate (1 – Specificity)",
    y        = "True Positive Rate (Sensitivity)",
    color    = "Model"
  ) +
  theme_bw(base_size = 13) +
  theme(
    legend.position  = c(0.62, 0.18),
    legend.background = element_rect(fill = "white", color = "grey80"),
    legend.text       = element_text(size = 10),
    plot.title        = element_text(face = "bold", size = 15),
    plot.subtitle     = element_text(size = 11, color = "grey40")
  ) +
  coord_equal()


6. AUC Summary Table

auc_table <- data.frame(
  Table   = c("Table 4.1", "Table 4.2", "Table 4.3"),
  Model   = c("balance only", "income only", "balance + income + student"),
  AUC     = round(c(auc(roc_41), auc(roc_42), auc(roc_43)), 4)
)

knitr::kable(auc_table, caption = "AUC for Each Model")
AUC for Each Model
Table Model AUC
Table 4.1 balance only 0.9480
Table 4.2 income only 0.5327
Table 4.3 balance + income + student 0.9496

7. Interpretation

The ROC plot visually confirms these findings: the curves for Tables 4.1 and 4.3 hug the upper-left corner, while Table 4.2 lies close to the diagonal (random classifier).