# Loan Approval Policy Analysis
# Student: Diego Armando Garcia Martinez
# Enrolment number: A01286225
# 1. CLEAN ENVIRONMENT AND LOAD LIBRARIES
rm(list = ls()) # Clears all objects from the environment to avoid conflicts
library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MCMCpack) # For MCMC regression
## Loading required package: coda
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)
## ## Copyright (C) 2003-2025 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
## ##
## ## Support provided by the U.S. National Science Foundation
## ## (Grants SES-0350646 and SES-0350613)
## ##
library(coda) # For convergence diagnostics
library(ggridges) # For visualization of posterior distributions
library(lmtest) # For OLS diagnostic tests
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Set seed for reproducibility
set.seed(20230703)
# 2. IMPORT AND PREPARE DATA
df <- read_csv("Bank_loans_credit_score_final.csv") %>%
mutate(row_id = row_number()) # Add a unique row ID
## Rows: 10127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Education_Level, Marital_Status
## dbl (18): CLIENTNUM, Customer_Age, Gender, Dependent_count, Income, Months_o...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Recode categorical variables into factors
df2 <- df %>% mutate(
Gender = factor(Gender, levels = c(0,1), labels = c("Male", "Female")),
Education_Level = factor(Education_Level),
Marital_Status = factor(Marital_Status)
)
str(df2) # Displays the structure of the dataset to verify types of variables
## tibble [10,127 × 21] (S3: tbl_df/tbl/data.frame)
## $ CLIENTNUM : num [1:10127] 7.69e+08 8.19e+08 7.14e+08 7.70e+08 7.09e+08 ...
## $ Customer_Age : num [1:10127] 45 49 51 40 40 44 51 32 37 48 ...
## $ Gender : Factor w/ 2 levels "Male","Female": 1 2 1 2 1 1 1 1 1 1 ...
## $ Dependent_count : num [1:10127] 3 5 3 4 3 2 4 0 3 2 ...
## $ Education_Level : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
## $ Marital_Status : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
## $ Income : num [1:10127] 72181 21501 105120 38090 70579 ...
## $ Months_on_book : num [1:10127] 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count: num [1:10127] 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : num [1:10127] 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : num [1:10127] 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num [1:10127] 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : num [1:10127] 777 864 0 2517 0 ...
## $ Avg_Open_To_Buy : num [1:10127] 11914 7392 3418 796 4716 ...
## $ Total_Amt_Chng_Q4_Q1 : num [1:10127] 1.33 1.54 2.59 1.41 2.17 ...
## $ Total_Trans_Amt : num [1:10127] 1144 1291 1887 1171 816 ...
## $ Total_Trans_Ct : num [1:10127] 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num [1:10127] 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num [1:10127] 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
## $ Credit_Score : num [1:10127] 763 850 822 839 699 ...
## $ row_id : int [1:10127] 1 2 3 4 5 6 7 8 9 10 ...
# 3. SPECIFY THE ECONOMETRIC MODEL
equation <- Credit_Score ~ Income + Customer_Age + Avg_Utilization_Ratio +
Total_Relationship_Count + Total_Trans_Amt + Months_on_book +
Dependent_count + Gender + Education_Level + Marital_Status
# Create model frame and keep row ID
df3 <- model.frame(equation, data = df2, na.action = na.omit) %>%
mutate(row_id = as.numeric(rownames(.)))
# Explanation:
# In R, regression models treat factor variables correctly for dummy encoding.
# We convert Gender, Education_Level, and Marital_Status to factors.
# Gender 0 = Male, 1 = Female according to the data dictionary.
#Hypothesis:
#Credit score is significantly influenced by income, age, utilization ratio
# , transactions, etc.
#Conclusion:
# Equation prepared; input variables and expected relationships are now
# specified for regression.
# 4. ESTIMATE MODEL USING MCMCregress
mcmc_fit <- MCMCregress(
equation,
data = df3,
burnin = 5000,
mcmc = 15000,
thin = 10,
verbose = 0
)
# We use MCMC instead of OLS to better capture uncertainty and avoid strict assumptions
# Convergence diagnostics
mcmc_list <- as.mcmc.list(mcmc_fit)
traceplot(mcmc_list) # Check chain mixing visually



















summary(mcmc_fit) # Output the estimated coefficients and credible intervals
##
## Iterations = 5001:19991
## Thinning interval = 10
## Number of chains = 1
## Sample size per chain = 1500
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## (Intercept) -2.328e+02 1.053e+01 2.719e-01 2.719e-01
## Income 2.067e-03 5.003e-05 1.292e-06 1.144e-06
## Customer_Age 3.749e+00 2.365e-01 6.106e-03 6.106e-03
## Avg_Utilization_Ratio 1.585e+02 4.345e+00 1.122e-01 1.122e-01
## Total_Relationship_Count 2.700e+01 7.583e-01 1.958e-02 1.809e-02
## Total_Trans_Amt 2.275e-02 3.496e-04 9.026e-06 9.026e-06
## Months_on_book 4.852e+00 2.320e-01 5.991e-03 6.146e-03
## Dependent_count 2.767e+01 9.193e-01 2.374e-02 2.224e-02
## GenderFemale 5.603e+01 3.340e+00 8.625e-02 8.625e-02
## Education_LevelDoctorate -1.525e+01 6.530e+00 1.686e-01 1.686e-01
## Education_LevelGraduate -3.720e+00 4.052e+00 1.046e-01 1.046e-01
## Education_LevelHigh School -4.127e+00 4.290e+00 1.108e-01 1.108e-01
## Education_LevelPost-Graduate -6.305e+00 6.109e+00 1.577e-01 1.577e-01
## Education_LevelUneducated 3.878e+00 4.623e+00 1.194e-01 1.194e-01
## Education_LevelUnknown -3.324e+00 4.436e+00 1.145e-01 1.145e-01
## Marital_StatusMarried -3.961e+00 4.575e+00 1.181e-01 1.179e-01
## Marital_StatusSingle 2.060e+00 4.553e+00 1.176e-01 1.176e-01
## Marital_StatusUnknown 1.952e+00 5.882e+00 1.519e-01 1.519e-01
## sigma2 1.145e+04 1.697e+02 4.381e+00 4.381e+00
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75%
## (Intercept) -2.533e+02 -2.401e+02 -2.329e+02 -2.256e+02
## Income 1.973e-03 2.032e-03 2.067e-03 2.100e-03
## Customer_Age 3.298e+00 3.591e+00 3.743e+00 3.907e+00
## Avg_Utilization_Ratio 1.499e+02 1.556e+02 1.584e+02 1.614e+02
## Total_Relationship_Count 2.546e+01 2.649e+01 2.700e+01 2.752e+01
## Total_Trans_Amt 2.207e-02 2.251e-02 2.274e-02 2.299e-02
## Months_on_book 4.402e+00 4.694e+00 4.849e+00 5.015e+00
## Dependent_count 2.587e+01 2.707e+01 2.766e+01 2.827e+01
## GenderFemale 4.963e+01 5.381e+01 5.607e+01 5.831e+01
## Education_LevelDoctorate -2.792e+01 -1.971e+01 -1.527e+01 -1.078e+01
## Education_LevelGraduate -1.143e+01 -6.545e+00 -3.751e+00 -9.277e-01
## Education_LevelHigh School -1.236e+01 -7.041e+00 -4.252e+00 -1.215e+00
## Education_LevelPost-Graduate -1.828e+01 -1.033e+01 -6.407e+00 -2.410e+00
## Education_LevelUneducated -5.010e+00 8.731e-01 3.766e+00 6.941e+00
## Education_LevelUnknown -1.183e+01 -6.299e+00 -3.280e+00 -3.917e-01
## Marital_StatusMarried -1.299e+01 -6.934e+00 -3.952e+00 -9.674e-01
## Marital_StatusSingle -7.375e+00 -9.285e-01 2.066e+00 5.211e+00
## Marital_StatusUnknown -9.342e+00 -2.180e+00 2.080e+00 5.901e+00
## sigma2 1.110e+04 1.134e+04 1.145e+04 1.156e+04
## 97.5%
## (Intercept) -2.121e+02
## Income 2.170e-03
## Customer_Age 4.227e+00
## Avg_Utilization_Ratio 1.672e+02
## Total_Relationship_Count 2.841e+01
## Total_Trans_Amt 2.345e-02
## Months_on_book 5.298e+00
## Dependent_count 2.948e+01
## GenderFemale 6.248e+01
## Education_LevelDoctorate -2.533e+00
## Education_LevelGraduate 4.504e+00
## Education_LevelHigh School 4.832e+00
## Education_LevelPost-Graduate 6.023e+00
## Education_LevelUneducated 1.322e+01
## Education_LevelUnknown 5.196e+00
## Marital_StatusMarried 5.009e+00
## Marital_StatusSingle 1.071e+01
## Marital_StatusUnknown 1.343e+01
## sigma2 1.178e+04
#Uses Bayesian regression (via MCMC sampling) to estimate the coefficients of
#the model, allowing for a flexible and probabilistic understanding of variable
#influence on credit score. Includes convergence diagnostics to validate the
#reliability of the MCMC samples.
#Hypothesis
# MCMC will provide more robust insights than OLS under assumption violations.
#Concluison
#Model estimated. Traceplots suggest convergence. Coefficient distributions
#ready for interpretation.
# 5. VIOLATION TESTS (EXPLORING OLS ASSUMPTIONS)
# While MCMC is more flexible, we still check OLS assumptions for comparison
# Multicollinearity check
cor(df3[, sapply(df3, is.numeric)])
## Credit_Score Income Customer_Age
## Credit_Score 1.00000000 0.260230965 0.342040898
## Income 0.26023097 1.000000000 0.029616220
## Customer_Age 0.34204090 0.029616220 1.000000000
## Avg_Utilization_Ratio 0.15065632 -0.328222965 0.007615891
## Total_Relationship_Count 0.09669151 -0.001727477 -0.007832566
## Total_Trans_Amt 0.36594220 0.013316558 -0.048698251
## Months_on_book 0.35548829 0.027230896 0.788225600
## Dependent_count 0.19266678 0.068312209 -0.127725673
## row_id 0.19109507 -0.087323654 -0.014945266
## Avg_Utilization_Ratio Total_Relationship_Count
## Credit_Score 0.150656321 0.096691511
## Income -0.328222965 -0.001727477
## Customer_Age 0.007615891 -0.007832566
## Avg_Utilization_Ratio 1.000000000 0.073559768
## Total_Relationship_Count 0.073559768 1.000000000
## Total_Trans_Amt -0.086267590 -0.354163856
## Months_on_book -0.007891576 -0.006465727
## Dependent_count -0.043580982 -0.041415273
## row_id -0.040875440 -0.423677825
## Total_Trans_Amt Months_on_book Dependent_count
## Credit_Score 0.36594220 0.355488287 0.19266678
## Income 0.01331656 0.027230896 0.06831221
## Customer_Age -0.04869825 0.788225600 -0.12772567
## Avg_Utilization_Ratio -0.08626759 -0.007891576 -0.04358098
## Total_Relationship_Count -0.35416386 -0.006465727 -0.04141527
## Total_Trans_Amt 1.00000000 -0.040544548 0.02242226
## Months_on_book -0.04054455 1.000000000 -0.10746369
## Dependent_count 0.02242226 -0.107463689 1.00000000
## row_id 0.73533997 -0.013165671 0.08323858
## row_id
## Credit_Score 0.19109507
## Income -0.08732365
## Customer_Age -0.01494527
## Avg_Utilization_Ratio -0.04087544
## Total_Relationship_Count -0.42367782
## Total_Trans_Amt 0.73533997
## Months_on_book -0.01316567
## Dependent_count 0.08323858
## row_id 1.00000000
# Normality of residuals (for linear model)
residuals <- residuals(lm(equation, data = df3))
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals")

# Heteroskedasticity test (Breusch-Pagan)
bptest(lm(equation, data = df3))
##
## studentized Breusch-Pagan test
##
## data: lm(equation, data = df3)
## BP = 550.08, df = 17, p-value < 2.2e-16
# Runs traditional regression diagnostics (multicollinearity, normality of
#residuals, and heteroskedasticity) to assess whether classical OLS assumptions
#hold, helping to justify the use of Bayesian methods instead.
#Hypothesis
# OLS assumptions might be violated, supporting the use of MCMC regression.
#Conclusion:
#Some assumptions are not fully met (e.g., heteroskedasticity detected).
# MCMC is a justified approach.
# 6. GENERATE PREDICTIONS AND SCENARIOS
# Create design matrix (X matrix for predictions)
X <- model.matrix(equation, data = df3)
# Extract beta coefficients (excluding sigma2)
betas <- as.matrix(mcmc_fit)[, !colnames(mcmc_fit) %in% "sigma2"]
# Predict Credit Scores for each draw of the posterior
y_pred <- X %*% t(betas)
# Create percentiles and classify into scenarios
df4 <- df3 %>%
mutate(
CS_p5 = apply(y_pred, 1, quantile, probs = 0.05), # Pessimistic
CS_p50 = apply(y_pred, 1, quantile, probs = 0.50), # Most likely
CS_p95 = apply(y_pred, 1, quantile, probs = 0.95) # Optimistic
) %>%
rowwise() %>%
mutate(
Scenario = case_when(
Credit_Score <= CS_p5 ~ "Pessimistic",
Credit_Score >= CS_p95 ~ "Optimistic",
TRUE ~ "Most likely"
)
) %>%
ungroup() %>%
dplyr::select(row_id, CS_p5, CS_p50, CS_p95, Scenario)
# Merge scenarios with original data
df_final <- df2 %>%
left_join(df4, by = "row_id") %>%
dplyr::select(-row_id)
# Generates predicted credit scores from the Bayesian model using the posterior
# samples. Calculates 5th, 50th, and 95th percentiles of the predicted values
#for each individual and assigns a scenario label (Pessimistic, Most likely,
#Optimistic) based on these percentiles.
# Hypothesis
#Simulated percentiles allow better insight into uncertainty around predictions.
#Conclusion
#Each observation classified into a scenario. This allows scenario-based
#decision-making by the bank.
# 7. VISUALIZATION OF RESULTS
# Graph 1: Observed vs Predicted Credit Score
g1 <- ggplot(df_final, aes(x = Credit_Score, y = CS_p50, colour = Scenario)) +
geom_point(alpha = 0.6) +
geom_errorbar(aes(ymin = CS_p5, ymax = CS_p95), alpha = 0.25, width = 0) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
theme_minimal() +
labs(title = "Observed vs Posterior Median Credit Score")
# Graph 2: Distribution of Posterior Medians
g2 <- ggplot(df_final, aes(x = CS_p50, y = Scenario, fill = Scenario)) +
ggridges::geom_density_ridges(alpha = 0.8, show.legend = FALSE) +
theme_minimal()
# Graph 3: Boxplot of Observed Credit Score by Scenario
g3 <- ggplot(df_final, aes(x = Scenario, y = Credit_Score, fill = Scenario)) +
geom_boxplot(alpha = 0.7, outlier.alpha = 0.2) +
theme_minimal()
# Print graphs
g1
## Warning: Removed 1112 rows containing missing values or values outside the scale range
## (`geom_point()`).

g2
## Picking joint bandwidth of 23.3
## Warning: Removed 1112 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

g3

#Creates three plots: (1) a scatter plot comparing observed and predicted median
#scores with error bars, (2) density ridges showing posterior score distributions
#per scenario, and (3) a boxplot showing observed scores across scenarios.
# These visualizations help compare actual scores against modeled scenarios
#Hypothesis
#Graphs will show whether predictions align well with observed data and
#identify outliers or risk groups.
#Conclusion
#Visuals confirm the model's accuracy and scenario separation. Useful for
# decision support.