Introduction

This report explores logistic regression and linear discriminant analysis (LDA) to predict credit card default using the Default dataset from ISLR2.

We will:


Load Libraries

library(ISLR2)
library(tidyverse)
library(MASS)
library(pROC)  # For ROC curves and AUC


Load Dataset

default <- ISLR2::Default %>% as_tibble()


Quick Overview of Data

glimpse(default)
## Rows: 10,000
## Columns: 4
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income  <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…
summary(default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554


Balance vs Income (Hexbin Plot)

This plot helps visualize the density of data points across balance and income.

default %>%
  ggplot(aes(x = balance, y = income, fill = default)) + 
  geom_hex(alpha = 2/3)


Balance Distribution (Boxplot)

This boxplot shows how balance varies across defaulters and non-defaulters.

default %>%
  ggplot(aes(y = balance, fill = default)) +
  geom_boxplot()


Convert Default to Binary

The categorical ‘default’ variable is converted to a numeric binary format (1 for “Yes”, 0 for “No”).

default <- default %>%
  mutate(default_bool = if_else(default == "Yes", 1, 0))


Linear Regression

A simple linear regression model is fit to see how balance affects default probability.

lm.fit1 <- lm(default_bool ~ balance, data = default)

x <- default$balance
y <- default$default_bool
y_pred <- predict(lm.fit1)


Plot the relationship

plot(x, y, main = "Linear Regression Fit")
abline(lm.fit1, col = "blue")


Logistic Regression

We now fit a logistic regression model to predict default probability based on balance.

glm.fit1 <- glm(default_bool ~ balance, data = default, family = "binomial")


Predictions

y_pred <- predict(glm.fit1, type = "response")


Plot the logistic regression fit

plot(x, y, main = "Logistic Regression Fit")
points(x, y_pred, col = 'red')


Linear Discriminant Analysis (LDA)

LDA is a classification technique that projects the data onto a space that maximizes class separation.


LDA with Balance Only

lda.fit1 <- lda(default_bool ~ balance, data = default)
print(lda.fit1)
## Call:
## lda(default_bool ~ balance, data = default)
## 
## Prior probabilities of groups:
##      0      1 
## 0.9667 0.0333 
## 
## Group means:
##     balance
## 0  803.9438
## 1 1747.8217
## 
## Coefficients of linear discriminants:
##                 LD1
## balance 0.002206916
plot(lda.fit1)


LDA with Balance & Income

lda.fit2 <- lda(default_bool ~ balance + income, data = default)
print(lda.fit2)
## Call:
## lda(default_bool ~ balance + income, data = default)
## 
## Prior probabilities of groups:
##      0      1 
## 0.9667 0.0333 
## 
## Group means:
##     balance   income
## 0  803.9438 33566.17
## 1 1747.8217 32089.15
## 
## Coefficients of linear discriminants:
##                  LD1
## balance 2.230835e-03
## income  7.793355e-06
plot(lda.fit2)


Predictions & Confusion Matrix

We predict class labels using the LDA model and compute accuracy.

lda.pred2 <- predict(lda.fit2)
lda.class <- lda.pred2$class


Confusion Matrix

table(lda.class, default$default_bool)
##          
## lda.class    0    1
##         0 9647  256
##         1   20   77


Model Accuracy

mean(lda.class == default$default_bool)
## [1] 0.9724


Model Performance: ROC Curve & AUC

To evaluate model performance, we compute and visualize ROC curves and AUC values.

ROC Curve and AUC for Logistic Regression

roc_glm <- roc(default$default_bool, predict(glm.fit1, type = "response"))


Plot ROC Curve

plot(roc_glm, col = "blue", main = "ROC Curve - Logistic Regression")


Compute AUC

auc_glm <- auc(roc_glm)
print(paste("AUC (Logistic Regression):", auc_glm))
## [1] "AUC (Logistic Regression): 0.947978494683832"


ROC Curve and AUC for LDA

roc_lda <- roc(default$default_bool, lda.pred2$posterior[,2])


Add LDA ROC curve to the same plot and compare

plot(roc_glm, col = "blue", main = "ROC Curve - Logistic Regression")
plot(roc_lda, col = "red", add = TRUE)
legend("bottomright", legend = c("Logistic Regression", "LDA"),
       col = c("blue", "red"), lwd = 2)


Compute AUC

auc_lda <- auc(roc_lda)
print(paste("AUC (LDA):", auc_lda))
## [1] "AUC (LDA): 0.949071653633586"

Example of classification with more than two classes - Iris data set


Load the iris dataset

iris_data <- as_tibble(iris)


Quick overview of the dataset

glimpse(iris_data)
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.…
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.…
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.…
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.…
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s…
summary(iris_data)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 


Check class distribution

table(iris_data$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50


Visualizing the distribution of Sepal Length for each Species

ggplot(iris_data, aes(x = Sepal.Length, fill = Species)) +
  geom_density(alpha = 0.5) +
  labs(title = "Sepal Length Distribution by Species") + 
  theme(plot.title = element_text(hjust = 0.5))


Performing LDA on the iris dataset

lda_iris <- lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris_data)


Plot LDA

plot(lda_iris)


Predicting the class labels

lda_iris_pred <- predict(lda_iris)
lda_iris_class <- lda_iris_pred$class


Confusion matrix

confusion_matrix_iris <- table(lda_iris_class, iris_data$Species)
print(confusion_matrix_iris)
##               
## lda_iris_class setosa versicolor virginica
##     setosa         50          0         0
##     versicolor      0         48         1
##     virginica       0          2        49


Compute accuracy

accuracy_iris <- mean(lda_iris_class == iris_data$Species)
print(paste("LDA Classification Accuracy on Iris Dataset:", round(accuracy_iris, 4)))
## [1] "LDA Classification Accuracy on Iris Dataset: 0.98"


Visualizing LDA decision regions using the first two discriminants

iris_lda_plot <- data.frame(lda_iris_pred$x, Species = iris_data$Species)

ggplot(iris_lda_plot, aes(x = LD1, y = LD2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "LDA Projection of Iris Dataset", x = "First Linear Discriminant", y = "Second Linear Discriminant") +
  theme_minimal() + 
  theme(plot.title = element_text(hjust = 0.5))