This document fits the three logistic regression models described in
ISLR Tables 4.1, 4.2, and 4.3 on the
Default dataset and plots their ROC curves
together for comparison.
| Table | Model | Predictors |
|---|---|---|
| 4.1 | Model 1 | balance only |
| 4.2 | Model 2 | income only |
| 4.3 | Model 3 | balance + income + student |
library(tidyverse)
library(pROC)
# Load data
default_df <- read.csv("Default.csv")
# Convert outcome to binary factor
default_df$default <- factor(default_df$default, levels = c("No", "Yes"))
default_df$student <- factor(default_df$student, levels = c("No", "Yes"))
glimpse(default_df)
## Rows: 10,000
## Columns: 5
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…
# Table 4.1: balance only
model_41 <- glm(default ~ balance, data = default_df, family = binomial)
# Table 4.2: income only
model_42 <- glm(default ~ income, data = default_df, family = binomial)
# Table 4.3: balance + income + student
model_43 <- glm(default ~ balance + income + student, data = default_df, family = binomial)
summary(model_41)
##
## Call:
## glm(formula = default ~ balance, family = binomial, data = default_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.065e+01 3.612e-01 -29.49 <2e-16 ***
## balance 5.499e-03 2.204e-04 24.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1596.5 on 9998 degrees of freedom
## AIC: 1600.5
##
## Number of Fisher Scoring iterations: 8
summary(model_42)
##
## Call:
## glm(formula = default ~ income, family = binomial, data = default_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.094e+00 1.463e-01 -21.156 <2e-16 ***
## income -8.353e-06 4.207e-06 -1.985 0.0471 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 2916.7 on 9998 degrees of freedom
## AIC: 2920.7
##
## Number of Fisher Scoring iterations: 6
summary(model_43)
##
## Call:
## glm(formula = default ~ balance + income + student, family = binomial,
## data = default_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## income 3.033e-06 8.203e-06 0.370 0.71152
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
pred_41 <- predict(model_41, type = "response")
pred_42 <- predict(model_42, type = "response")
pred_43 <- predict(model_43, type = "response")
roc_41 <- roc(default_df$default, pred_41, levels = c("No", "Yes"), direction = "<")
roc_42 <- roc(default_df$default, pred_42, levels = c("No", "Yes"), direction = "<")
roc_43 <- roc(default_df$default, pred_43, levels = c("No", "Yes"), direction = "<")
cat("AUC – Model 4.1 (balance): ", auc(roc_41), "\n")
## AUC – Model 4.1 (balance): 0.9479785
cat("AUC – Model 4.2 (income): ", auc(roc_42), "\n")
## AUC – Model 4.2 (income): 0.5326533
cat("AUC – Model 4.3 (balance+income+student): ", auc(roc_43), "\n")
## AUC – Model 4.3 (balance+income+student): 0.9495581
# Build a combined data frame for ggplot
roc_to_df <- function(roc_obj, label) {
data.frame(
FPR = 1 - roc_obj$specificities,
TPR = roc_obj$sensitivities,
Model = label
)
}
df_roc <- bind_rows(
roc_to_df(roc_41, paste0("Table 4.1: balance (AUC = ", round(auc(roc_41), 3), ")")),
roc_to_df(roc_42, paste0("Table 4.2: income (AUC = ", round(auc(roc_42), 3), ")")),
roc_to_df(roc_43, paste0("Table 4.3: balance+income+student (AUC = ", round(auc(roc_43), 3), ")"))
)
ggplot(df_roc, aes(x = FPR, y = TPR, color = Model)) +
geom_line(linewidth = 1.1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
scale_color_manual(values = c("#E74C3C", "#3498DB", "#2ECC71")) +
labs(
title = "ROC Curves – Default Dataset",
subtitle = "Logistic Regression Models from ISLR Tables 4.1, 4.2 & 4.3",
x = "False Positive Rate (1 – Specificity)",
y = "True Positive Rate (Sensitivity)",
color = "Model"
) +
theme_bw(base_size = 13) +
theme(
legend.position = c(0.62, 0.18),
legend.background = element_rect(fill = "white", color = "grey80"),
legend.text = element_text(size = 10),
plot.title = element_text(face = "bold", size = 15),
plot.subtitle = element_text(size = 11, color = "grey40")
) +
coord_equal()
auc_table <- data.frame(
Table = c("Table 4.1", "Table 4.2", "Table 4.3"),
Model = c("balance only", "income only", "balance + income + student"),
AUC = round(c(auc(roc_41), auc(roc_42), auc(roc_43)), 4)
)
knitr::kable(auc_table, caption = "AUC for Each Model")
| Table | Model | AUC |
|---|---|---|
| Table 4.1 | balance only | 0.9480 |
| Table 4.2 | income only | 0.5327 |
| Table 4.3 | balance + income + student | 0.9496 |
balance is
by far the strongest predictor of default.balance alone.The ROC plot visually confirms these findings: the curves for Tables 4.1 and 4.3 hug the upper-left corner, while Table 4.2 lies close to the diagonal (random classifier).