Overview

This report fits three logistic regression models on the Default dataset from the ISLR package, corresponding to Tables 4.1, 4.2, and 4.3 in An Introduction to Statistical Learning. ROC curves are plotted to compare the discriminative performance of each model.


Install & Load Packages

# Install if not already installed
if (!require(ISLR))  install.packages("ISLR")
if (!require(pROC))  install.packages("pROC")

library(ISLR)
library(pROC)

Load Data

data(Default)
head(Default)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559
dim(Default)
## [1] 10000     4

Fit Logistic Regression Models

Table 4.1 — Simple Logistic Regression: balance

m1 <- glm(default ~ balance, data = Default, family = binomial)
summary(m1)
## 
## Call:
## glm(formula = default ~ balance, family = binomial, data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8

Table 4.2 — Simple Logistic Regression: student

m2 <- glm(default ~ student, data = Default, family = binomial)
summary(m2)
## 
## Call:
## glm(formula = default ~ student, family = binomial, data = Default)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.50413    0.07071  -49.55  < 2e-16 ***
## studentYes   0.40489    0.11502    3.52 0.000431 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2912.7
## 
## Number of Fisher Scoring iterations: 6

Table 4.3 — Multiple Logistic Regression: balance + income + student

m3 <- glm(default ~ balance + income + student, data = Default, family = binomial)
summary(m3)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

ROC Curves

# Compute ROC objects
roc1 <- roc(Default$default, fitted(m1), quiet = TRUE)
roc2 <- roc(Default$default, fitted(m2), quiet = TRUE)
roc3 <- roc(Default$default, fitted(m3), quiet = TRUE)

# Plot
plot(roc1,
     col  = "#2196F3",
     lwd  = 2,
     main = "ROC Curves — Default Dataset\n(ISLR Tables 4.1, 4.2, 4.3)",
     xlab = "False Positive Rate (1 - Specificity)",
     ylab = "True Positive Rate (Sensitivity)")

plot(roc2, col = "#FF9800", lwd = 2, lty = 2, add = TRUE)
plot(roc3, col = "#4CAF50", lwd = 2,          add = TRUE)

abline(a = 0, b = 1, lty = 3, col = "grey50")

legend("bottomright",
       legend = c(
         paste0("Table 4.1: Balance only  (AUC = ", round(auc(roc1), 3), ")"),
         paste0("Table 4.2: Student only  (AUC = ", round(auc(roc2), 3), ")"),
         paste0("Table 4.3: Balance + Income + Student  (AUC = ", round(auc(roc3), 3), ")"),
         "Random Classifier"
       ),
       col    = c("#2196F3", "#FF9800", "#4CAF50", "grey50"),
       lty    = c(1, 2, 1, 3),
       lwd    = c(2, 2, 2, 1),
       cex    = 0.85,
       bty    = "n")


AUC Summary

auc_df <- data.frame(
  Model      = c("Table 4.1: balance",
                 "Table 4.2: student",
                 "Table 4.3: balance + income + student"),
  Predictors = c("balance", "student", "balance + income + student"),
  AUC        = round(c(auc(roc1), auc(roc2), auc(roc3)), 4)
)

knitr::kable(auc_df, caption = "AUC for each logistic regression model")
AUC for each logistic regression model
Model Predictors AUC
Table 4.1: balance balance 0.9480
Table 4.2: student student 0.5450
Table 4.3: balance + income + student balance + income + student 0.9496

Interpretation


Data source: Default dataset, ISLR package. Reference: James, Witten, Hastie & Tibshirani, “An Introduction to Statistical Learning”, Tables 4.1–4.3.