Final exam_Application of Financial Software Package_Sarantuya Sharavsambuu_113035128

## "Predicting Credit Default Risk Using Logistic Regression and PCA"

# 1 Load Libraries and Data
# Load required libraries
library(ISLR2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(e1071)  # needed for confusionMatrix
library(ggplot2)

# Bring the Default data set into the session and keep a working copy in `df`
data(Default)
df <- Default

# Peek at the first six rows to check the column layout
head(df, n = 6L)

##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559

# 2 Preprocess Data
# Recode the outcome as a numeric indicator: 1 = "Yes" (defaulted), 0 = "No"
df$default <- as.numeric(df$default == "Yes")

# Hold out 30% of the rows for testing; the seed makes the split reproducible
# NOTE(review): the outcome is numeric at this point, so createDataPartition()
# groups it by percentiles instead of by class label. Passing a factor would
# stratify by class, but it would also change the split and therefore every
# result recorded further down -- confirm before changing.
set.seed(123)
train_index <- createDataPartition(y = df$default, p = 0.7, list = FALSE)
train <- df[train_index, ]
test <- df[-train_index, ]

# 3: Logistic Regression
# Fit a binomial GLM of the 0/1 default indicator on balance, income and
# student status, using the training rows only.
logit_model <- glm(default ~ balance + income + student,
                   data = train,
                   family = "binomial")

# Predict on test set
# type = "response" returns probabilities; rows above 0.5 are labelled 1.
logit_probs <- predict(logit_model, newdata = test, type = "response")
logit_pred <- ifelse(logit_probs > 0.5, 1, 0)

# Confusion matrix
# Both factors get the full set of class levels explicitly. A bare factor()
# would drop a class that never occurs (e.g. a model that predicts 0 for every
# row), leaving predictions and reference with different levels.
# No `positive` argument is given, so the first level ("0", no default) is the
# positive class: the Sensitivity printed below is the hit rate for
# non-defaulters, and Specificity is the hit rate for actual defaulters.
confusionMatrix(factor(logit_pred, levels = c(0, 1)),
                factor(test$default, levels = c(0, 1)))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2889   70
##          1   11   30
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.9666, 0.9785)
##     No Information Rate : 0.9667          
##     P-Value [Acc > NIR] : 0.02705         
##                                           
##                   Kappa : 0.4142          
##                                           
##  Mcnemar's Test P-Value : 1.16e-10        
##                                           
##             Sensitivity : 0.9962          
##             Specificity : 0.3000          
##          Pos Pred Value : 0.9763          
##          Neg Pred Value : 0.7317          
##              Prevalence : 0.9667          
##          Detection Rate : 0.9630          
##    Detection Prevalence : 0.9863          
##       Balanced Accuracy : 0.6481          
##                                           
##        'Positive' Class : 0               
##

# 4 : PCA + Logistic Regression
# Fit PCA on the training rows only. scale. = TRUE standardises balance and
# income so that neither dominates because of its units. The test rows are
# then projected with the centring, scaling and rotation learned from the
# training rows.
pca_train <- prcomp(train[, c("balance", "income")], scale. = TRUE)
pca_test <- predict(pca_train, newdata = test[, c("balance", "income")])

# Use first 2 principal components
# With only two input columns these are all of the components, so the scores
# are a rotation of the standardised inputs rather than a lower-dimensional
# summary. `student` is not part of this model.
pca_train_df <- data.frame(PC1 = pca_train$x[, 1],
                           PC2 = pca_train$x[, 2],
                           default = train$default)

pca_test_df <- data.frame(PC1 = pca_test[, 1],
                          PC2 = pca_test[, 2],
                          default = test$default)

# Logistic regression using PCs
pca_logit <- glm(default ~ PC1 + PC2, data = pca_train_df, family = "binomial")

# Predict on test set
# Probabilities above 0.5 are labelled 1 (default).
pca_probs <- predict(pca_logit, newdata = pca_test_df, type = "response")
pca_pred <- ifelse(pca_probs > 0.5, 1, 0)

# Confusion matrix
# Explicit levels keep both factors on the same 0/1 coding even if one class
# is never predicted. The positive class is the first level ("0"), so the
# Sensitivity printed below refers to non-defaulters.
confusionMatrix(factor(pca_pred, levels = c(0, 1)),
                factor(pca_test_df$default, levels = c(0, 1)))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2890   69
##          1   10   31
##                                           
##                Accuracy : 0.9737          
##                  95% CI : (0.9673, 0.9791)
##     No Information Rate : 0.9667          
##     P-Value [Acc > NIR] : 0.01601         
##                                           
##                   Kappa : 0.4286          
##                                           
##  Mcnemar's Test P-Value : 6.777e-11       
##                                           
##             Sensitivity : 0.9966          
##             Specificity : 0.3100          
##          Pos Pred Value : 0.9767          
##          Neg Pred Value : 0.7561          
##              Prevalence : 0.9667          
##          Detection Rate : 0.9633          
##    Detection Prevalence : 0.9863          
##       Balanced Accuracy : 0.6533          
##                                           
##        'Positive' Class : 0               
##

# 5 : Visualization
# Scatter plot of the training rows in principal-component space, coloured by
# the 0/1 default indicator; alpha softens the heavy overplotting.
ggplot(data = pca_train_df,
       mapping = aes(x = PC1, y = PC2, colour = factor(default))) +
  geom_point(alpha = 0.6) +
  labs(title = "PCA Projection of Credit Default Data", colour = "Default") +
  theme_minimal()

# 6 : Third model: K-Nearest Neighbors (KNN)
# Load required package
library(class)

# Normalize numeric columns
# Min-max rescale a numeric vector to the [0, 1] range.
#
# x: numeric vector to rescale.
# Returns a numeric vector of the same length as x. Missing values are left
# out when the minimum and maximum are computed and stay NA in the result.
# An empty or all-NA input is returned unchanged, and a vector whose observed
# values are all equal maps to 0 instead of NaN (0 / 0).
normalize <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0) {
    # Nothing to rescale; also avoids the min()/max() warning on empty input
    return(x)
  }
  lo <- min(observed)
  span <- max(observed) - lo
  if (isTRUE(span == 0)) {
    # Constant input: every observed value sits at the minimum
    return(0 * (x - lo))
  }
  (x - lo) / span
}

# Create normalized dataset
# KNN is distance based, so balance and income are rescaled to [0, 1] first.
# NOTE(review): the min/max come from the full data set, so the test rows
# influence the scaling. Taking the range from the training rows only would
# avoid that, but may change the results recorded below -- confirm first.
df_knn <- df
df_knn$balance <- normalize(df$balance)
df_knn$income <- normalize(df$income)

# Split train/test again using same indices for fair comparison
train_knn <- df_knn[train_index, ]
test_knn <- df_knn[-train_index, ]

# Use balance, income, student (encoded)
# student becomes a 0/1 indicator so it can enter the distance computation.
train_knn$student <- ifelse(train_knn$student == "Yes", 1, 0)
test_knn$student <- ifelse(test_knn$student == "Yes", 1, 0)

# Prepare features and labels
feature_cols <- c("balance", "income", "student")
X_train <- train_knn[, feature_cols]
X_test <- test_knn[, feature_cols]
y_train <- train_knn$default
y_test <- test_knn$default

# Apply KNN (k = 5)
# The seed is set again so the classification is reproducible.
set.seed(123)
knn_pred <- knn(train = X_train, test = X_test, cl = y_train, k = 5)

# Confusion matrix
# The reference factor is given both class levels explicitly so that it keeps
# the same 0/1 coding as the predictions even if a class is missing from the
# test labels. The positive class is the first level ("0"), so the Sensitivity
# printed below refers to non-defaulters.
confusionMatrix(knn_pred, factor(y_test, levels = c(0, 1)))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2878   67
##          1   22   33
##                                           
##                Accuracy : 0.9703          
##                  95% CI : (0.9636, 0.9761)
##     No Information Rate : 0.9667          
##     P-Value [Acc > NIR] : 0.1422          
##                                           
##                   Kappa : 0.4119          
##                                           
##  Mcnemar's Test P-Value : 3.101e-06       
##                                           
##             Sensitivity : 0.9924          
##             Specificity : 0.3300          
##          Pos Pred Value : 0.9772          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.9667          
##          Detection Rate : 0.9593          
##    Detection Prevalence : 0.9817          
##       Balanced Accuracy : 0.6612          
##                                           
##        'Positive' Class : 0               
##

Conclusion

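Each model was able to predict credit default with high overall accuracy (about 97% on the test set, compared with a no-information rate of 96.67%). Logistic regression provided interpretability, the PCA-based model matched its performance using the two principal components of balance and income, and KNN served as a strong baseline classifier. Depending on the use case, one might prioritize explainability (logistic regression) or slightly better detection of actual defaulters (KNN identified 33 of the 100 test-set defaults, versus 30 and 31 for the two logistic models).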

In financial settings, such models can be used to automate risk assessments, approve or deny credit applications, or rank customers by default risk.

Final exam_Application of Financial Software Package_Sarantuya Sharavsambuu_113035128_Part2(20 percent my example)

Sarantuya Sharavsambuu

2025-06-13

Conclusion