Load Data

# Read the Default data set (relative path) into a tibble.
data <- readr::read_csv(file = "Default.csv")

## New names:
## Rows: 10000 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): default, student dbl (3): ...1, balance, income
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Preview the first six rows to check how the columns were parsed.
head(x = data, n = 6L)

## # A tibble: 6 × 5
##    ...1 default student balance income
##   <dbl> <chr>   <chr>     <dbl>  <dbl>
## 1     1 No      No         730. 44362.
## 2     2 No      Yes        817. 12106.
## 3     3 No      No        1074. 31767.
## 4     4 No      No         529. 35704.
## 5     5 No      No         786. 38463.
## 6     6 No      Yes        920.  7492.

Make sure the variables have the correct types:

# Convert the two character columns to factors so that glm() and roc()
# treat them as categorical variables.
data <- data %>%
  mutate(across(c(default, student), as.factor))

Models (Tables 4.1, 4.2, and 4.3)

Model 1: balance only

# Model 1: logistic regression of default status on card balance alone.
model1 <- glm(
  formula = default ~ balance,
  family = binomial(link = "logit"),
  data = data
)

Model 2: income only

# Model 2: logistic regression of default status on income alone.
# NOTE(review): the section heading cites Tables 4.1-4.3; if these are the
# ISLR tables, Table 4.2 uses student status rather than income -- confirm
# that income is the intended predictor here.
model2 <- glm(
  formula = default ~ income,
  family = binomial(link = "logit"),
  data = data
)

Model 3: balance + income + student

# Model 3 (full model): logistic regression of default status on balance,
# income, and student status together.
model3 <- glm(
  formula = default ~ balance + income + student,
  family = binomial(link = "logit"),
  data = data
)

Predicted Probabilities

# In-sample predicted probabilities (response scale) for each fitted model;
# no new data is supplied, so predictions are for the training rows.
prob1 <- stats::predict(object = model1, type = "response")
prob2 <- stats::predict(object = model2, type = "response")
prob3 <- stats::predict(object = model3, type = "response")

ROC Curves

# ROC curve for Model 1 (balance only).
# The control/case levels and the comparison direction are stated explicitly
# instead of being auto-detected, so the curve is always oriented as
# "higher predicted probability => default = Yes" and no setup messages
# are emitted.
roc1 <- roc(
  response = data$default,
  predictor = prob1,
  levels = c("No", "Yes"),
  direction = "<"
)

# ROC curve for Model 2 (income only).
# Levels and direction are fixed explicitly: with a weak predictor, automatic
# direction selection could flip the comparison and inflate the AUC, so the
# orientation "higher predicted probability => default = Yes" is enforced.
roc2 <- roc(
  response = data$default,
  predictor = prob2,
  levels = c("No", "Yes"),
  direction = "<"
)

# ROC curve for Model 3 (balance + income + student).
# Levels and direction are given explicitly rather than auto-detected, so the
# curve is oriented as "higher predicted probability => default = Yes" and
# no setup messages are emitted.
roc3 <- roc(
  response = data$default,
  predictor = prob3,
  levels = c("No", "Yes"),
  direction = "<"
)

Plot ROC Curves Together

# Overlay the three ROC curves on one set of axes. A single named colour
# vector drives both the curves and the legend so they cannot drift apart.
model_cols <- c("Balance" = "blue", "Income" = "red", "Full Model" = "green")

plot(roc1, col = model_cols[["Balance"]],
     main = "ROC Curves for Default Models")
plot(roc2, col = model_cols[["Income"]], add = TRUE)
plot(roc3, col = model_cols[["Full Model"]], add = TRUE)

legend(
  "bottomright",
  legend = names(model_cols),
  col = unname(model_cols),
  lwd = 2
)

AUC Comparison

# Area under the ROC curve for Model 1 (balance only).
pROC::auc(roc1)

## Area under the curve: 0.948

# Area under the ROC curve for Model 2 (income only).
pROC::auc(roc2)

## Area under the curve: 0.5327

# Area under the ROC curve for Model 3 (balance + income + student).
pROC::auc(roc3)

## Area under the curve: 0.9496

Interpretation

- The model using balance alone performs very well (AUC = 0.948).
- The model using income alone performs poorly (AUC = 0.533, barely better than random guessing at 0.5).
- The full model improves prediction only slightly (AUC = 0.950) and is driven mainly by balance.

These results indicate that credit card balance is the strongest predictor of default among the variables considered here.

Conclusion

The ROC curves show that not all predictors contribute equally to classification performance. Among the single-predictor models, balance provides nearly all of the predictive power (AUC = 0.948), while income adds little explanatory value (AUC = 0.533). The full model performs best overall (AUC = 0.950), but only marginally better than using balance alone.

ROC Curve Analysis - Default Dataset

Bat-Erdene Zorigtbaatar

2026-03-30