Load Libraries and Data

library(ggplot2)
library(pROC)

Default <- read.csv("Default.csv")
Default$default <- ifelse(Default$default == "Yes", 1, 0)
Default$student <- ifelse(Default$student == "Yes", 1, 0)

Fit Logistic Regression Models

Model 1 – Table 4.1: default ~ balance

model1 <- glm(default ~ balance, data = Default, family = binomial)
summary(model1)
## 
## Call:
## glm(formula = default ~ balance, family = binomial, data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8

Model 2 – Table 4.2: default ~ student

model2 <- glm(default ~ student, data = Default, family = binomial)
summary(model2)
## 
## Call:
## glm(formula = default ~ student, family = binomial, data = Default)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.50413    0.07071  -49.55  < 2e-16 ***
## student      0.40489    0.11502    3.52 0.000431 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2912.7
## 
## Number of Fisher Scoring iterations: 6

Model 3 – Table 4.3: default ~ balance + income + student

model3 <- glm(default ~ balance + income + student, data = Default, family = binomial)
summary(model3)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## student     -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

Predicted Probabilities

prob1 <- predict(model1, type = "response")
prob2 <- predict(model2, type = "response")
prob3 <- predict(model3, type = "response")

ROC Curves

roc1 <- roc(Default$default, prob1)
roc2 <- roc(Default$default, prob2)
roc3 <- roc(Default$default, prob3)

# Base plot
plot(roc1, col = "red", lwd = 2,
     main = "ROC Curves for Default Dataset",
     xlab = "False Positive Rate (1 - Specificity)",
     ylab = "True Positive Rate (Sensitivity)",
     legacy.axes = TRUE)
plot(roc2, col = "blue", lwd = 2, add = TRUE)
plot(roc3, col = "darkgreen", lwd = 2, add = TRUE)
abline(a = 0, b = 1, lty = 2, col = "gray")

legend("bottomright",
       legend = c(
         paste0("Model 1: balance (AUC = ", round(auc(roc1), 4), ")"),
         paste0("Model 2: student (AUC = ", round(auc(roc2), 4), ")"),
         paste0("Model 3: balance + income + student (AUC = ", round(auc(roc3), 4), ")")
       ),
       col = c("red", "blue", "darkgreen"),
       lwd = 2)

AUC Summary

auc_table <- data.frame(
  Model = c("Model 1: balance",
            "Model 2: student",
            "Model 3: balance + income + student"),
  AUC = c(round(auc(roc1), 4),
          round(auc(roc2), 4),
          round(auc(roc3), 4))
)
knitr::kable(auc_table, caption = "AUC Values for Each Model")
AUC Values for Each Model
Model AUC
Model 1: balance 0.9480
Model 2: student 0.5450
Model 3: balance + income + student 0.9496

ggplot Version of ROC Curves

roc_df <- rbind(
  data.frame(FPR = 1 - roc1$specificities, TPR = roc1$sensitivities,
             Model = paste0("balance (AUC=", round(auc(roc1), 4), ")")),
  data.frame(FPR = 1 - roc2$specificities, TPR = roc2$sensitivities,
             Model = paste0("student (AUC=", round(auc(roc2), 4), ")")),
  data.frame(FPR = 1 - roc3$specificities, TPR = roc3$sensitivities,
             Model = paste0("balance+income+student (AUC=", round(auc(roc3), 4), ")"))
)

ggplot(roc_df, aes(x = FPR, y = TPR, color = Model)) +
  geom_line(linewidth = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +
  labs(title = "ROC Curves for Default Dataset",
       x = "False Positive Rate (1 - Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.position = "bottom")