HW 06

Overview

This document fits the three logistic regression models described in Tables 4.1, 4.2, and 4.3 of An Introduction to Statistical Learning (ISLR) using the Default dataset, then plots their ROC curves for comparison.

Table	Model Description	Predictors
4.1	Simple logistic regression	`balance` only
4.2	Simple logistic regression	`student` only
4.3	Multiple logistic regression	`balance + income + student`

1. Load Packages & Data

# Install packages if needed:
# install.packages(c("tidyverse", "pROC", "ISLR"))

library(tidyverse)
library(pROC)

# Option A: load from ISLR package
# library(ISLR)
# data(Default)

# Option B: load from CSV (used here)
Default <- read.csv("Default.csv", row.names = 1)

# Convert to factors
Default$default <- factor(Default$default, levels = c("No", "Yes"))
Default$student <- factor(Default$student, levels = c("No", "Yes"))

glimpse(Default)

## Rows: 10,000
## Columns: 4
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income  <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…

2. Fit the Three Models

Table 4.1 — Balance Only

# Table 4.1: Simple logistic regression with balance
model_4_1 <- glm(default ~ balance,
                 data   = Default,
                 family = binomial)

summary(model_4_1)

## 
## Call:
## glm(formula = default ~ balance, family = binomial, data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8

Table 4.2 — Student Status Only

# Table 4.2: Simple logistic regression with student
model_4_2 <- glm(default ~ student,
                 data   = Default,
                 family = binomial)

summary(model_4_2)

## 
## Call:
## glm(formula = default ~ student, family = binomial, data = Default)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.50413    0.07071  -49.55  < 2e-16 ***
## studentYes   0.40489    0.11502    3.52 0.000431 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2912.7
## 
## Number of Fisher Scoring iterations: 6

Table 4.3 — Balance + Income + Student

# Table 4.3: Multiple logistic regression
model_4_3 <- glm(default ~ balance + income + student,
                 data   = Default,
                 family = binomial)

summary(model_4_3)

## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

3. Compute Predicted Probabilities

Default$prob_4_1 <- predict(model_4_1, type = "response")
Default$prob_4_2 <- predict(model_4_2, type = "response")
Default$prob_4_3 <- predict(model_4_3, type = "response")

4. Build ROC Objects

roc_4_1 <- roc(Default$default, Default$prob_4_1, levels = c("No", "Yes"))
roc_4_2 <- roc(Default$default, Default$prob_4_2, levels = c("No", "Yes"))
roc_4_3 <- roc(Default$default, Default$prob_4_3, levels = c("No", "Yes"))

# Print AUC values
cat(sprintf("AUC — Table 4.1 (balance):                   %.4f\n", auc(roc_4_1)))

## AUC — Table 4.1 (balance):                   0.9480

cat(sprintf("AUC — Table 4.2 (student):                   %.4f\n", auc(roc_4_2)))

## AUC — Table 4.2 (student):                   0.5450

cat(sprintf("AUC — Table 4.3 (balance + income + student): %.4f\n", auc(roc_4_3)))

## AUC — Table 4.3 (balance + income + student): 0.9496

5. Plot ROC Curves

5a. Base R Plot

plot(roc_4_1,
     col  = "steelblue",
     lwd  = 2,
     main = "ROC Curves — Default Dataset (ISLR Tables 4.1–4.3)",
     xlab = "False Positive Rate (1 – Specificity)",
     ylab = "True Positive Rate (Sensitivity)")

plot(roc_4_2, col = "tomato",    lwd = 2, add = TRUE)
plot(roc_4_3, col = "darkgreen", lwd = 2, add = TRUE)

abline(a = 0, b = 1, lty = 2, col = "grey50")   # random-classifier line

legend("bottomright",
       legend = c(
         sprintf("Table 4.1: balance only        (AUC = %.3f)", auc(roc_4_1)),
         sprintf("Table 4.2: student only         (AUC = %.3f)", auc(roc_4_2)),
         sprintf("Table 4.3: balance+income+student (AUC = %.3f)", auc(roc_4_3))
       ),
       col = c("steelblue", "tomato", "darkgreen"),
       lwd = 2,
       bty = "n",
       cex = 0.85)

5b. ggplot2 Plot

# Helper to convert a pROC roc object to a tidy data frame
roc_to_df <- function(roc_obj, label) {
  data.frame(
    FPR   = 1 - roc_obj$specificities,
    TPR   = roc_obj$sensitivities,
    Model = label
  )
}

roc_df <- bind_rows(
  roc_to_df(roc_4_1,
            sprintf("Table 4.1: balance only (AUC = %.3f)",        auc(roc_4_1))),
  roc_to_df(roc_4_2,
            sprintf("Table 4.2: student only (AUC = %.3f)",        auc(roc_4_2))),
  roc_to_df(roc_4_3,
            sprintf("Table 4.3: balance+income+student (AUC = %.3f)", auc(roc_4_3)))
)

ggplot(roc_df, aes(x = FPR, y = TPR, colour = Model)) +
  geom_line(linewidth = 1) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "grey50") +
  scale_colour_manual(values = c("steelblue", "tomato", "darkgreen")) +
  labs(
    title    = "ROC Curves — Default Dataset (ISLR Tables 4.1–4.3)",
    x        = "False Positive Rate (1 – Specificity)",
    y        = "True Positive Rate (Sensitivity)",
    colour   = NULL
  ) +
  theme_bw(base_size = 13) +
  theme(
    legend.position  = "bottom",
    legend.direction = "vertical",
    plot.title       = element_text(face = "bold", hjust = 0.5)
  )

6. Summary Table

summary_tbl <- data.frame(
  Table    = c("4.1", "4.2", "4.3"),
  Model    = c("balance only",
               "student only",
               "balance + income + student"),
  AUC      = round(c(auc(roc_4_1), auc(roc_4_2), auc(roc_4_3)), 4)
)

knitr::kable(summary_tbl,
             caption = "AUC Comparison Across the Three ISLR Models",
             align   = c("c", "l", "c"))

AUC Comparison Across the Three ISLR Models
Table	Model	AUC
4.1	balance only	0.9480
4.2	student only	0.5450
4.3	balance + income + student	0.9496

7. Interpretation

Table 4.1 (balance) achieves the highest single-predictor AUC. Balance is a strong continuous predictor of default.
Table 4.2 (student) has a much lower AUC — the binary student indicator alone provides little discriminative power.
Table 4.3 (all three predictors) matches or slightly exceeds Table 4.1, confirming that once balance is included, income and student status add only marginal improvement in discrimination (though they remain statistically and practically meaningful).