##“Predicting Credit Default Risk Using Logistic Regression and PCA”
# 1 Load Libraries and Data
# Load required libraries
library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071) # needed for confusionMatrix
library(ggplot2)
# Load the dataset
# Bring the Default credit data into the workspace and work on a copy named df.
data("Default")
df <- Default
# View first few rows
head(df, n = 6L)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
# 2 Preprocess Data
# Recode the response as numeric: 1 where default is "Yes", 0 otherwise
df$default <- as.numeric(df$default == "Yes")
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the response is numeric at this point, and caret's
# createDataPartition() bins numeric outcomes by quantile rather than by
# class, so this split may not be stratified on default -- confirm, and pass
# a factor here if class-stratified sampling is intended (results will change).
set.seed(123)
train_index <- createDataPartition(y = df$default, p = 0.7, list = FALSE)
test <- df[-train_index, ]
train <- df[train_index, ]
# 3: Logistic Regression
# Fit logistic regression model: default (0/1) on balance, income and student
logit_model <- glm(
  default ~ balance + income + student,
  data = train,
  family = "binomial"
)
# Predict on test set: fitted default probabilities, classified at a 0.5 cutoff
logit_probs <- predict(logit_model, newdata = test, type = "response")
logit_pred <- ifelse(logit_probs > 0.5, 1, 0)
# Confusion matrix
# Both factors are given the explicit levels 0 and 1 so the table is always
# 2 x 2, even if every prediction (or every test label) falls in one class;
# without this the level sets can differ and confusionMatrix() warns or errors.
# No `positive` argument is passed and the output reports 'Positive' Class : 0,
# so the printed Sensitivity refers to non-defaulters and the printed
# Specificity is the share of actual defaulters that were detected.
confusionMatrix(
  factor(logit_pred, levels = c(0, 1)),
  factor(test$default, levels = c(0, 1))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2889 70
## 1 11 30
##
## Accuracy : 0.973
## 95% CI : (0.9666, 0.9785)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 0.02705
##
## Kappa : 0.4142
##
## Mcnemar's Test P-Value : 1.16e-10
##
## Sensitivity : 0.9962
## Specificity : 0.3000
## Pos Pred Value : 0.9763
## Neg Pred Value : 0.7317
## Prevalence : 0.9667
## Detection Rate : 0.9630
## Detection Prevalence : 0.9863
## Balanced Accuracy : 0.6481
##
## 'Positive' Class : 0
##
# 4 : PCA + Logistic Regression
# Apply PCA (scale = TRUE is important)
# The rotation is learned on the training rows only and then applied to the
# test rows via predict(), so the test set does not influence the components.
pca_train <- prcomp(train[, c("balance", "income")], scale. = TRUE)
pca_test <- predict(pca_train, newdata = test[, c("balance", "income")])
# Use first 2 principal components
# NOTE(review): only two variables enter the PCA, so keeping two components
# re-expresses balance and income rather than reducing their number; student
# is not part of this model, unlike the logistic regression above.
pca_train_df <- data.frame(
  PC1 = pca_train$x[, 1],
  PC2 = pca_train$x[, 2],
  default = train$default
)
pca_test_df <- data.frame(
  PC1 = pca_test[, 1],
  PC2 = pca_test[, 2],
  default = test$default
)
# Logistic regression using PCs
pca_logit <- glm(default ~ PC1 + PC2, data = pca_train_df, family = "binomial")
# Predict on test set: fitted default probabilities, classified at a 0.5 cutoff
pca_probs <- predict(pca_logit, newdata = pca_test_df, type = "response")
pca_pred <- ifelse(pca_probs > 0.5, 1, 0)
# Confusion matrix
# Explicit levels keep the table 2 x 2 even if every prediction (or every test
# label) falls in one class; otherwise the level sets can differ and
# confusionMatrix() warns or errors. The output reports 'Positive' Class : 0,
# so the printed Specificity is the share of actual defaulters detected.
confusionMatrix(
  factor(pca_pred, levels = c(0, 1)),
  factor(pca_test_df$default, levels = c(0, 1))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2890 69
## 1 10 31
##
## Accuracy : 0.9737
## 95% CI : (0.9673, 0.9791)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 0.01601
##
## Kappa : 0.4286
##
## Mcnemar's Test P-Value : 6.777e-11
##
## Sensitivity : 0.9966
## Specificity : 0.3100
## Pos Pred Value : 0.9767
## Neg Pred Value : 0.7561
## Prevalence : 0.9667
## Detection Rate : 0.9633
## Detection Prevalence : 0.9863
## Balanced Accuracy : 0.6533
##
## 'Positive' Class : 0
##
# 5 : Visualization
# Scatter the training rows in principal-component space; factor() turns the
# 0/1 response into a discrete colour legend.
ggplot(
  data = pca_train_df,
  mapping = aes(x = PC1, y = PC2, colour = factor(default))
) +
  geom_point(alpha = 0.6) +
  labs(title = "PCA Projection of Credit Default Data", colour = "Default") +
  theme_minimal()
# third model: K-Nearest Neighbors (KNN)
# Load required package
library(class)
# Normalize numeric columns
# Min-max rescale a numeric vector to the [0, 1] range.
#
# x: numeric vector. Missing values are ignored when the range is computed
#    and stay NA in the result.
# Returns a numeric vector of the same length as x. If x is empty or entirely
# NA it is returned unchanged (as numeric). If all non-missing values are
# equal, they are mapped to 0 instead of the NaN that 0 / 0 would produce.
normalize <- function(x) {
  # Also covers length-0 input, where all() of nothing is TRUE
  if (all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]
  # Exact comparison is intended: span is 0 only when min and max coincide
  if (span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Create normalized dataset
# NOTE(review): normalize() is applied to the full data set before it is
# split, so the minimum and maximum used for scaling include the test rows.
# Consider deriving the range from the training rows only and re-running;
# that would change the reported results, so it is not done here.
df_knn <- df
df_knn$balance <- normalize(df$balance)
df_knn$income <- normalize(df$income)
# Split train/test again using same indices for fair comparison
train_knn <- df_knn[train_index, ]
test_knn <- df_knn[-train_index, ]
# Use balance, income, student (encoded as 1 for "Yes", 0 otherwise)
train_knn$student <- ifelse(train_knn$student == "Yes", 1, 0)
test_knn$student <- ifelse(test_knn$student == "Yes", 1, 0)
# Prepare features and labels
X_train <- train_knn[, c("balance", "income", "student")]
X_test <- test_knn[, c("balance", "income", "student")]
y_train <- train_knn$default
y_test <- test_knn$default
# Apply KNN (k = 5)
# The seed is set because knn() breaks ties at random
set.seed(123)
knn_pred <- knn(train = X_train, test = X_test, cl = y_train, k = 5)
# Confusion matrix
# The reference factor gets explicit levels 0 and 1 so it still lines up with
# the predictions if the test labels happen to contain a single class. The
# output reports 'Positive' Class : 0, so the printed Specificity is the share
# of actual defaulters that were detected.
confusionMatrix(knn_pred, factor(y_test, levels = c(0, 1)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2878 67
## 1 22 33
##
## Accuracy : 0.9703
## 95% CI : (0.9636, 0.9761)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 0.1422
##
## Kappa : 0.4119
##
## Mcnemar's Test P-Value : 3.101e-06
##
## Sensitivity : 0.9924
## Specificity : 0.3300
## Pos Pred Value : 0.9772
## Neg Pred Value : 0.6000
## Prevalence : 0.9667
## Detection Rate : 0.9593
## Detection Prevalence : 0.9817
## Balanced Accuracy : 0.6612
##
## 'Positive' Class : 0
##
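# Each model predicted credit default with high overall accuracy (about 97%),
# although this is only slightly above the 96.7% no-information rate, and each
# model correctly identified only about 30-33% of the actual defaulters.
# Logistic regression provided interpretability; the PCA-based model preserved
# performance (with two inputs and two retained components it re-expresses the
# predictors rather than reducing their number); and KNN served as a reasonable
# baseline classifier. Depending on the use case, one might prioritize
# explainability (logistic regression) or the slightly higher detection of
# defaulters (KNN: 33 of 100, versus 30 and 31), keeping in mind that KNN's
# overall accuracy (0.9703) was marginally lower than that of the logistic
# (0.9730) and PCA-based (0.9737) models.
# In financial settings, such models can be used to automate risk assessments,
# approve or deny credit applications, or rank customers by default risk.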