This report fits three logistic regression models based on
Tables 4.1, 4.2, and 4.3 from An Introduction to
Statistical Learning (ISLR) using the Default dataset,
and plots their ROC curves.
| Table | Predictors |
|---|---|
| 4.1 | balance only |
| 4.2 | student only |
| 4.3 | balance + income + student |
library(ISLR2) # Default dataset
library(pROC) # ROC curves
library(ggplot2) # Plotting
data(Default)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
# Table 4.1: balance only
model_4_1 <- glm(default ~ balance, data = Default, family = binomial)
# Table 4.2: student only
model_4_2 <- glm(default ~ student, data = Default, family = binomial)
# Table 4.3: balance + income + student
model_4_3 <- glm(default ~ balance + income + student, data = Default, family = binomial)
summary(model_4_3)
##
## Call:
## glm(formula = default ~ balance + income + student, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## income 3.033e-06 8.203e-06 0.370 0.71152
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
# Predicted probabilities
prob_4_1 <- predict(model_4_1, type = "response")
prob_4_2 <- predict(model_4_2, type = "response")
prob_4_3 <- predict(model_4_3, type = "response")
# ROC objects
roc_4_1 <- roc(Default$default, prob_4_1, quiet = TRUE)
roc_4_2 <- roc(Default$default, prob_4_2, quiet = TRUE)
roc_4_3 <- roc(Default$default, prob_4_3, quiet = TRUE)
# AUC values
cat("AUC - Table 4.1 (balance): ", round(auc(roc_4_1), 4), "\n")
## AUC - Table 4.1 (balance): 0.948
cat("AUC - Table 4.2 (student): ", round(auc(roc_4_2), 4), "\n")
## AUC - Table 4.2 (student): 0.545
cat("AUC - Table 4.3 (balance+income+student):", round(auc(roc_4_3), 4), "\n")
## AUC - Table 4.3 (balance+income+student): 0.9496
# Build data frames for ggplot
make_roc_df <- function(roc_obj, label) {
data.frame(
FPR = 1 - roc_obj$specificities,
TPR = roc_obj$sensitivities,
Model = label
)
}
df_roc <- rbind(
make_roc_df(roc_4_1, paste0("Table 4.1 – balance only (AUC = ", round(auc(roc_4_1), 3), ")")),
make_roc_df(roc_4_2, paste0("Table 4.2 – student only (AUC = ", round(auc(roc_4_2), 3), ")")),
make_roc_df(roc_4_3, paste0("Table 4.3 – balance + income + student (AUC = ", round(auc(roc_4_3), 3), ")"))
)
ggplot(df_roc, aes(x = FPR, y = TPR, color = Model, linetype = Model)) +
geom_line(linewidth = 1.1) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "gray50") +
scale_color_manual(values = c("#2196F3", "#FF9800", "#4CAF50")) +
scale_linetype_manual(values = c("solid", "dashed", "dotdash")) +
labs(
title = "ROC Curves – ISLR Default Dataset",
subtitle = "Logistic Regression Models from Tables 4.1, 4.2, and 4.3",
x = "False Positive Rate (1 – Specificity)",
y = "True Positive Rate (Sensitivity)",
color = "Model",
linetype = "Model"
) +
theme_bw(base_size = 13) +
theme(
legend.position = "bottom",
legend.direction = "vertical",
plot.title = element_text(face = "bold")
)