Mastery-level-evidence.R

# Loan Approval Policy Analysis
# Student: Diego Armando Garcia Martinez
# Enrolment number: A01286225

# 1. CLEAN ENVIRONMENT AND LOAD LIBRARIES
rm(list = ls())  # Clears all objects from the environment to avoid conflicts


library(tidyverse)    # For data manipulation and visualization

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(MCMCpack)     # For MCMC regression

## Loading required package: coda
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)
## ## Copyright (C) 2003-2025 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
## ##
## ## Support provided by the U.S. National Science Foundation
## ## (Grants SES-0350646 and SES-0350613)
## ##

library(coda)         # For convergence diagnostics
library(ggridges)     # For visualization of posterior distributions
library(lmtest)       # For OLS diagnostic tests

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# Set seed for reproducibility
set.seed(20230703)


# 2. IMPORT AND PREPARE DATA

df <- read_csv("Bank_loans_credit_score_final.csv") %>%
  mutate(row_id = row_number())  # Add a unique row ID

## Rows: 10127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Education_Level, Marital_Status
## dbl (18): CLIENTNUM, Customer_Age, Gender, Dependent_count, Income, Months_o...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Recode categorical variables into factors
df2 <- df %>% mutate(
  Gender          = factor(Gender, levels = c(0,1), labels = c("Male", "Female")),
  Education_Level = factor(Education_Level),
  Marital_Status  = factor(Marital_Status)
)

str(df2)  # Displays the structure of the dataset to verify types of variables

## tibble [10,127 × 21] (S3: tbl_df/tbl/data.frame)
##  $ CLIENTNUM               : num [1:10127] 7.69e+08 8.19e+08 7.14e+08 7.70e+08 7.09e+08 ...
##  $ Customer_Age            : num [1:10127] 45 49 51 40 40 44 51 32 37 48 ...
##  $ Gender                  : Factor w/ 2 levels "Male","Female": 1 2 1 2 1 1 1 1 1 1 ...
##  $ Dependent_count         : num [1:10127] 3 5 3 4 3 2 4 0 3 2 ...
##  $ Education_Level         : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
##  $ Marital_Status          : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
##  $ Income                  : num [1:10127] 72181 21501 105120 38090 70579 ...
##  $ Months_on_book          : num [1:10127] 39 44 36 34 21 36 46 27 36 36 ...
##  $ Total_Relationship_Count: num [1:10127] 5 6 4 3 5 3 6 2 5 6 ...
##  $ Months_Inactive_12_mon  : num [1:10127] 1 1 1 4 1 1 1 2 2 3 ...
##  $ Contacts_Count_12_mon   : num [1:10127] 3 2 0 1 0 2 3 2 0 3 ...
##  $ Credit_Limit            : num [1:10127] 12691 8256 3418 3313 4716 ...
##  $ Total_Revolving_Bal     : num [1:10127] 777 864 0 2517 0 ...
##  $ Avg_Open_To_Buy         : num [1:10127] 11914 7392 3418 796 4716 ...
##  $ Total_Amt_Chng_Q4_Q1    : num [1:10127] 1.33 1.54 2.59 1.41 2.17 ...
##  $ Total_Trans_Amt         : num [1:10127] 1144 1291 1887 1171 816 ...
##  $ Total_Trans_Ct          : num [1:10127] 42 33 20 20 28 24 31 36 24 32 ...
##  $ Total_Ct_Chng_Q4_Q1     : num [1:10127] 1.62 3.71 2.33 2.33 2.5 ...
##  $ Avg_Utilization_Ratio   : num [1:10127] 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
##  $ Credit_Score            : num [1:10127] 763 850 822 839 699 ...
##  $ row_id                  : int [1:10127] 1 2 3 4 5 6 7 8 9 10 ...

# 3. SPECIFY THE ECONOMETRIC MODEL
equation <- Credit_Score ~ Income + Customer_Age + Avg_Utilization_Ratio +
  Total_Relationship_Count + Total_Trans_Amt + Months_on_book +
  Dependent_count + Gender + Education_Level + Marital_Status

# Create model frame and keep row ID
df3 <- model.frame(equation, data = df2, na.action = na.omit) %>%
  mutate(row_id = as.numeric(rownames(.)))

# Explanation:
# In R, regression models treat factor variables correctly for dummy encoding.
# We convert Gender, Education_Level, and Marital_Status to factors.
# Gender 0 = Male, 1 = Female according to the data dictionary.

#Hypothesis:
#Credit score is significantly influenced by income, age, utilization ratio
# , transactions, etc.

#Conclusion:
# Equation prepared; input variables and expected relationships are now
#  specified for regression.

# 4. ESTIMATE MODEL USING MCMCregress
mcmc_fit <- MCMCregress(
  equation,
  data    = df3,
  burnin  = 5000,
  mcmc    = 15000,
  thin    = 10,
  verbose = 0
)


# We use MCMC instead of OLS to better capture uncertainty and avoid strict assumptions

# Convergence diagnostics
mcmc_list <- as.mcmc.list(mcmc_fit)
traceplot(mcmc_list)             # Check chain mixing visually

summary(mcmc_fit)                # Output the estimated coefficients and credible intervals

## 
## Iterations = 5001:19991
## Thinning interval = 10 
## Number of chains = 1 
## Sample size per chain = 1500 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                                    Mean        SD  Naive SE Time-series SE
## (Intercept)                  -2.328e+02 1.053e+01 2.719e-01      2.719e-01
## Income                        2.067e-03 5.003e-05 1.292e-06      1.144e-06
## Customer_Age                  3.749e+00 2.365e-01 6.106e-03      6.106e-03
## Avg_Utilization_Ratio         1.585e+02 4.345e+00 1.122e-01      1.122e-01
## Total_Relationship_Count      2.700e+01 7.583e-01 1.958e-02      1.809e-02
## Total_Trans_Amt               2.275e-02 3.496e-04 9.026e-06      9.026e-06
## Months_on_book                4.852e+00 2.320e-01 5.991e-03      6.146e-03
## Dependent_count               2.767e+01 9.193e-01 2.374e-02      2.224e-02
## GenderFemale                  5.603e+01 3.340e+00 8.625e-02      8.625e-02
## Education_LevelDoctorate     -1.525e+01 6.530e+00 1.686e-01      1.686e-01
## Education_LevelGraduate      -3.720e+00 4.052e+00 1.046e-01      1.046e-01
## Education_LevelHigh School   -4.127e+00 4.290e+00 1.108e-01      1.108e-01
## Education_LevelPost-Graduate -6.305e+00 6.109e+00 1.577e-01      1.577e-01
## Education_LevelUneducated     3.878e+00 4.623e+00 1.194e-01      1.194e-01
## Education_LevelUnknown       -3.324e+00 4.436e+00 1.145e-01      1.145e-01
## Marital_StatusMarried        -3.961e+00 4.575e+00 1.181e-01      1.179e-01
## Marital_StatusSingle          2.060e+00 4.553e+00 1.176e-01      1.176e-01
## Marital_StatusUnknown         1.952e+00 5.882e+00 1.519e-01      1.519e-01
## sigma2                        1.145e+04 1.697e+02 4.381e+00      4.381e+00
## 
## 2. Quantiles for each variable:
## 
##                                    2.5%        25%        50%        75%
## (Intercept)                  -2.533e+02 -2.401e+02 -2.329e+02 -2.256e+02
## Income                        1.973e-03  2.032e-03  2.067e-03  2.100e-03
## Customer_Age                  3.298e+00  3.591e+00  3.743e+00  3.907e+00
## Avg_Utilization_Ratio         1.499e+02  1.556e+02  1.584e+02  1.614e+02
## Total_Relationship_Count      2.546e+01  2.649e+01  2.700e+01  2.752e+01
## Total_Trans_Amt               2.207e-02  2.251e-02  2.274e-02  2.299e-02
## Months_on_book                4.402e+00  4.694e+00  4.849e+00  5.015e+00
## Dependent_count               2.587e+01  2.707e+01  2.766e+01  2.827e+01
## GenderFemale                  4.963e+01  5.381e+01  5.607e+01  5.831e+01
## Education_LevelDoctorate     -2.792e+01 -1.971e+01 -1.527e+01 -1.078e+01
## Education_LevelGraduate      -1.143e+01 -6.545e+00 -3.751e+00 -9.277e-01
## Education_LevelHigh School   -1.236e+01 -7.041e+00 -4.252e+00 -1.215e+00
## Education_LevelPost-Graduate -1.828e+01 -1.033e+01 -6.407e+00 -2.410e+00
## Education_LevelUneducated    -5.010e+00  8.731e-01  3.766e+00  6.941e+00
## Education_LevelUnknown       -1.183e+01 -6.299e+00 -3.280e+00 -3.917e-01
## Marital_StatusMarried        -1.299e+01 -6.934e+00 -3.952e+00 -9.674e-01
## Marital_StatusSingle         -7.375e+00 -9.285e-01  2.066e+00  5.211e+00
## Marital_StatusUnknown        -9.342e+00 -2.180e+00  2.080e+00  5.901e+00
## sigma2                        1.110e+04  1.134e+04  1.145e+04  1.156e+04
##                                   97.5%
## (Intercept)                  -2.121e+02
## Income                        2.170e-03
## Customer_Age                  4.227e+00
## Avg_Utilization_Ratio         1.672e+02
## Total_Relationship_Count      2.841e+01
## Total_Trans_Amt               2.345e-02
## Months_on_book                5.298e+00
## Dependent_count               2.948e+01
## GenderFemale                  6.248e+01
## Education_LevelDoctorate     -2.533e+00
## Education_LevelGraduate       4.504e+00
## Education_LevelHigh School    4.832e+00
## Education_LevelPost-Graduate  6.023e+00
## Education_LevelUneducated     1.322e+01
## Education_LevelUnknown        5.196e+00
## Marital_StatusMarried         5.009e+00
## Marital_StatusSingle          1.071e+01
## Marital_StatusUnknown         1.343e+01
## sigma2                        1.178e+04

#Uses Bayesian regression (via MCMC sampling) to estimate the coefficients of 
#the model, allowing for a flexible and probabilistic understanding of variable 
#influence on credit score. Includes convergence diagnostics to validate the 
#reliability of the MCMC samples.

#Hypothesis
# MCMC will provide more robust insights than OLS under assumption violations.

#Concluison
#Model estimated. Traceplots suggest convergence. Coefficient distributions 
#ready for interpretation.


# 5. VIOLATION TESTS (EXPLORING OLS ASSUMPTIONS)
# While MCMC is more flexible, we still check OLS assumptions for comparison

# Multicollinearity check
cor(df3[, sapply(df3, is.numeric)])

##                          Credit_Score       Income Customer_Age
## Credit_Score               1.00000000  0.260230965  0.342040898
## Income                     0.26023097  1.000000000  0.029616220
## Customer_Age               0.34204090  0.029616220  1.000000000
## Avg_Utilization_Ratio      0.15065632 -0.328222965  0.007615891
## Total_Relationship_Count   0.09669151 -0.001727477 -0.007832566
## Total_Trans_Amt            0.36594220  0.013316558 -0.048698251
## Months_on_book             0.35548829  0.027230896  0.788225600
## Dependent_count            0.19266678  0.068312209 -0.127725673
## row_id                     0.19109507 -0.087323654 -0.014945266
##                          Avg_Utilization_Ratio Total_Relationship_Count
## Credit_Score                       0.150656321              0.096691511
## Income                            -0.328222965             -0.001727477
## Customer_Age                       0.007615891             -0.007832566
## Avg_Utilization_Ratio              1.000000000              0.073559768
## Total_Relationship_Count           0.073559768              1.000000000
## Total_Trans_Amt                   -0.086267590             -0.354163856
## Months_on_book                    -0.007891576             -0.006465727
## Dependent_count                   -0.043580982             -0.041415273
## row_id                            -0.040875440             -0.423677825
##                          Total_Trans_Amt Months_on_book Dependent_count
## Credit_Score                  0.36594220    0.355488287      0.19266678
## Income                        0.01331656    0.027230896      0.06831221
## Customer_Age                 -0.04869825    0.788225600     -0.12772567
## Avg_Utilization_Ratio        -0.08626759   -0.007891576     -0.04358098
## Total_Relationship_Count     -0.35416386   -0.006465727     -0.04141527
## Total_Trans_Amt               1.00000000   -0.040544548      0.02242226
## Months_on_book               -0.04054455    1.000000000     -0.10746369
## Dependent_count               0.02242226   -0.107463689      1.00000000
## row_id                        0.73533997   -0.013165671      0.08323858
##                               row_id
## Credit_Score              0.19109507
## Income                   -0.08732365
## Customer_Age             -0.01494527
## Avg_Utilization_Ratio    -0.04087544
## Total_Relationship_Count -0.42367782
## Total_Trans_Amt           0.73533997
## Months_on_book           -0.01316567
## Dependent_count           0.08323858
## row_id                    1.00000000

# Normality of residuals (for linear model)
residuals <- residuals(lm(equation, data = df3))
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals")

# Heteroskedasticity test (Breusch-Pagan)
bptest(lm(equation, data = df3))

## 
##  studentized Breusch-Pagan test
## 
## data:  lm(equation, data = df3)
## BP = 550.08, df = 17, p-value < 2.2e-16

# Runs traditional regression diagnostics (multicollinearity, normality of 
#residuals, and heteroskedasticity) to assess whether classical OLS assumptions 
#hold, helping to justify the use of Bayesian methods instead.

#Hypothesis
# OLS assumptions might be violated, supporting the use of MCMC regression.

#Conclusion:
#Some assumptions are not fully met (e.g., heteroskedasticity detected).
#  MCMC is a justified approach.

# 6. GENERATE PREDICTIONS AND SCENARIOS

# Create design matrix (X matrix for predictions)
X <- model.matrix(equation, data = df3)

# Extract beta coefficients (excluding sigma2)
betas <- as.matrix(mcmc_fit)[, !colnames(mcmc_fit) %in% "sigma2"]

# Predict Credit Scores for each draw of the posterior
y_pred <- X %*% t(betas)

# Create percentiles and classify into scenarios
df4 <- df3 %>%
  mutate(
    CS_p5  = apply(y_pred, 1, quantile, probs = 0.05),   # Pessimistic
    CS_p50 = apply(y_pred, 1, quantile, probs = 0.50),   # Most likely
    CS_p95 = apply(y_pred, 1, quantile, probs = 0.95)    # Optimistic
  ) %>%
  rowwise() %>%
  mutate(
    Scenario = case_when(
      Credit_Score <= CS_p5  ~ "Pessimistic",
      Credit_Score >= CS_p95 ~ "Optimistic",
      TRUE                   ~ "Most likely"
    )
  ) %>%
  ungroup() %>%
  dplyr::select(row_id, CS_p5, CS_p50, CS_p95, Scenario)

# Merge scenarios with original data
df_final <- df2 %>%
  left_join(df4, by = "row_id") %>%
  dplyr::select(-row_id)

# Generates predicted credit scores from the Bayesian model using the posterior
# samples. Calculates 5th, 50th, and 95th percentiles of the predicted values 
#for each individual and assigns a scenario label (Pessimistic, Most likely, 
#Optimistic) based on these percentiles.

# Hypothesis
#Simulated percentiles allow better insight into uncertainty around predictions.

#Conclusion
#Each observation classified into a scenario. This allows scenario-based 
#decision-making by the bank.


# 7. VISUALIZATION OF RESULTS

# Graph 1: Observed vs Predicted Credit Score
g1 <- ggplot(df_final, aes(x = Credit_Score, y = CS_p50, colour = Scenario)) +
  geom_point(alpha = 0.6) +
  geom_errorbar(aes(ymin = CS_p5, ymax = CS_p95), alpha = 0.25, width = 0) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Observed vs Posterior Median Credit Score")

# Graph 2: Distribution of Posterior Medians
g2 <- ggplot(df_final, aes(x = CS_p50, y = Scenario, fill = Scenario)) +
  ggridges::geom_density_ridges(alpha = 0.8, show.legend = FALSE) +
  theme_minimal()

# Graph 3: Boxplot of Observed Credit Score by Scenario
g3 <- ggplot(df_final, aes(x = Scenario, y = Credit_Score, fill = Scenario)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.2) +
  theme_minimal()

# Print graphs
g1

## Warning: Removed 1112 rows containing missing values or values outside the scale range
## (`geom_point()`).

g2

## Picking joint bandwidth of 23.3

## Warning: Removed 1112 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

g3

#Creates three plots: (1) a scatter plot comparing observed and predicted median 
#scores with error bars, (2) density ridges showing posterior score distributions 
#per scenario, and (3) a boxplot showing observed scores across scenarios.
# These visualizations help compare actual scores against modeled scenarios

#Hypothesis
#Graphs will show whether predictions align well with observed data and 
#identify outliers or risk groups.

#Conclusion
#Visuals confirm the model's accuracy and scenario separation. Useful for
# decision support.

Mastery-level-evidence.R

diegogarcia

2025-06-14