We fit logistic regression models based on Tables 4.1, 4.2, and 4.3 from ISLR and plot ROC curves.
balance
onlybalance + income + studentstudent only# Load required libraries
library(ISLR2)
library(pROC)
library(ggplot2)
# Use built-in Default dataset (same as Default.csv)
data(Default)
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# Model 1 (Table 4.1): balance only
model1 <- glm(default ~ balance, data = Default, family = binomial)
# Model 2 (Table 4.2): balance + income + student
model2 <- glm(default ~ balance + income + student, data = Default, family = binomial)
# Model 3 (Table 4.3): student only
model3 <- glm(default ~ student, data = Default, family = binomial)
# Predicted probabilities
prob1 <- predict(model1, type = "response")
prob2 <- predict(model2, type = "response")
prob3 <- predict(model3, type = "response")
# Compute ROC objects
roc1 <- roc(Default$default, prob1, levels = c("No", "Yes"), direction = "<")
roc2 <- roc(Default$default, prob2, levels = c("No", "Yes"), direction = "<")
roc3 <- roc(Default$default, prob3, levels = c("No", "Yes"), direction = "<")
# AUC values
auc1 <- round(auc(roc1), 4)
auc2 <- round(auc(roc2), 4)
auc3 <- round(auc(roc3), 4)
# Build data frames for ggplot
roc_df <- function(roc_obj, model_name, auc_val) {
data.frame(
FPR = 1 - roc_obj$specificities,
TPR = roc_obj$sensitivities,
Model = paste0(model_name, " (AUC = ", auc_val, ")")
)
}
df1 <- roc_df(roc1, "Model 1: balance", auc1)
df2 <- roc_df(roc2, "Model 2: balance+income+student", auc2)
df3 <- roc_df(roc3, "Model 3: student", auc3)
roc_data <- rbind(df1, df2, df3)
# Plot
ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) +
geom_line(size = 1.1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A")) +
labs(
title = "ROC Curves for Default Dataset (Tables 4.1, 4.2, 4.3)",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
color = "Model"
) +
theme_bw(base_size = 13) +
theme(legend.position = "bottom",
legend.text = element_text(size = 10))
cat("AUC Summary:\n")
## AUC Summary:
cat(sprintf(" Model 1 (balance only): %.4f\n", auc1))
## Model 1 (balance only): 0.9480
cat(sprintf(" Model 2 (balance + income + student): %.4f\n", auc2))
## Model 2 (balance + income + student): 0.9496
cat(sprintf(" Model 3 (student only): %.4f\n", auc3))
## Model 3 (student only): 0.5450
balance is the strongest predictor of default.