This report explores logistic regression and
linear discriminant analysis (LDA) to predict credit
card default using the Default
dataset from
ISLR2.
We will:
library(ISLR2)
library(tidyverse)
library(MASS)
library(pROC) # For ROC curves and AUC
default <- ISLR2::Default %>% as_tibble()
glimpse(default)
## Rows: 10,000
## Columns: 4
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No…
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N…
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8…
## $ income <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55…
summary(default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
This plot helps visualize the density of data points across balance and income.
default %>%
ggplot(aes(x = balance, y = income, fill = default)) +
geom_hex(alpha = 2/3)
This boxplot shows how balance varies across defaulters and non-defaulters.
default %>%
ggplot(aes(y = balance, fill = default)) +
geom_boxplot()
The categorical ‘default’ variable is converted to a numeric binary format (1 for “Yes”, 0 for “No”).
default <- default %>%
mutate(default_bool = if_else(default == "Yes", 1, 0))
A simple linear regression model is fit to see how balance affects default probability.
lm.fit1 <- lm(default_bool ~ balance, data = default)
x <- default$balance
y <- default$default_bool
y_pred <- predict(lm.fit1)
plot(x, y, main = "Linear Regression Fit")
abline(lm.fit1, col = "blue")
We now fit a logistic regression model to predict default probability based on balance.
glm.fit1 <- glm(default_bool ~ balance, data = default, family = "binomial")
y_pred <- predict(glm.fit1, type = "response")
plot(x, y, main = "Logistic Regression Fit")
points(x, y_pred, col = 'red')
LDA is a classification technique that projects the data onto a space that maximizes class separation.
lda.fit1 <- lda(default_bool ~ balance, data = default)
print(lda.fit1)
## Call:
## lda(default_bool ~ balance, data = default)
##
## Prior probabilities of groups:
## 0 1
## 0.9667 0.0333
##
## Group means:
## balance
## 0 803.9438
## 1 1747.8217
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002206916
plot(lda.fit1)
lda.fit2 <- lda(default_bool ~ balance + income, data = default)
print(lda.fit2)
## Call:
## lda(default_bool ~ balance + income, data = default)
##
## Prior probabilities of groups:
## 0 1
## 0.9667 0.0333
##
## Group means:
## balance income
## 0 803.9438 33566.17
## 1 1747.8217 32089.15
##
## Coefficients of linear discriminants:
## LD1
## balance 2.230835e-03
## income 7.793355e-06
plot(lda.fit2)
We predict class labels using the LDA model and compute accuracy.
lda.pred2 <- predict(lda.fit2)
lda.class <- lda.pred2$class
table(lda.class, default$default_bool)
##
## lda.class 0 1
## 0 9647 256
## 1 20 77
mean(lda.class == default$default_bool)
## [1] 0.9724
To evaluate model performance, we compute and visualize ROC curves and AUC values.
roc_glm <- roc(default$default_bool, predict(glm.fit1, type = "response"))
plot(roc_glm, col = "blue", main = "ROC Curve - Logistic Regression")
auc_glm <- auc(roc_glm)
print(paste("AUC (Logistic Regression):", auc_glm))
## [1] "AUC (Logistic Regression): 0.947978494683832"
roc_lda <- roc(default$default_bool, lda.pred2$posterior[,2])
plot(roc_glm, col = "blue", main = "ROC Curve - Logistic Regression")
plot(roc_lda, col = "red", add = TRUE)
legend("bottomright", legend = c("Logistic Regression", "LDA"),
col = c("blue", "red"), lwd = 2)
auc_lda <- auc(roc_lda)
print(paste("AUC (LDA):", auc_lda))
## [1] "AUC (LDA): 0.949071653633586"
iris_data <- as_tibble(iris)
glimpse(iris_data)
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.…
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.…
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.…
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.…
## $ Species <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s…
summary(iris_data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
table(iris_data$Species)
##
## setosa versicolor virginica
## 50 50 50
ggplot(iris_data, aes(x = Sepal.Length, fill = Species)) +
geom_density(alpha = 0.5) +
labs(title = "Sepal Length Distribution by Species") +
theme(plot.title = element_text(hjust = 0.5))
lda_iris <- lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris_data)
print(lda_iris)
## Call:
## lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
## data = iris_data)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.8293776 -0.02410215
## Sepal.Width 1.5344731 -2.16452123
## Petal.Length -2.2012117 0.93192121
## Petal.Width -2.8104603 -2.83918785
##
## Proportion of trace:
## LD1 LD2
## 0.9912 0.0088
plot(lda_iris)
lda_iris_pred <- predict(lda_iris)
lda_iris_class <- lda_iris_pred$class
confusion_matrix_iris <- table(lda_iris_class, iris_data$Species)
print(confusion_matrix_iris)
##
## lda_iris_class setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 1
## virginica 0 2 49
accuracy_iris <- mean(lda_iris_class == iris_data$Species)
print(paste("LDA Classification Accuracy on Iris Dataset:", round(accuracy_iris, 4)))
## [1] "LDA Classification Accuracy on Iris Dataset: 0.98"
iris_lda_plot <- data.frame(lda_iris_pred$x, Species = iris_data$Species)
ggplot(iris_lda_plot, aes(x = LD1, y = LD2, color = Species)) +
geom_point(size = 3, alpha = 0.7) +
labs(title = "LDA Projection of Iris Dataset", x = "First Linear Discriminant", y = "Second Linear Discriminant") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))