# Libraries and Dataset ----

#Load the libraries 

#ggplot2 is loaded for visualizing the relationship between the predictors and Credit_Score (the plots in this script are drawn with base graphics).
library(ggplot2) 
## Warning: package 'ggplot2' was built under R version 4.3.3
#To conduct the Monte Carlo simulation of loan approvals, dplyr is used to manipulate and sample.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Loaded car to determine VIF, checking multicollinearity among predictors in the model
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
#lmtest to use econometric tests, testing for heteroscedasticity, and validating OLS assumptions
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Load the dataset from a CSV file; the relative path is resolved against the
# current working directory.
loan_data <- read.csv("Loan_Approval_Dataset (1) (1).csv")

# Display the first rows to verify the load succeeded and to inspect the variables.
head(loan_data)
##   Client_ID Age Monthly_Income Current_Debt Credit_Score Loan_Amount
## 1         1  56          66788        27990          501      117255
## 2         2  69          87879        14087          511       91322
## 3         3  46          43827        22440          305       49565
## 4         4  32          10655        35205          675      156255
## 5         5  60          49859        15275          746        6132
## 6         6  25          58625         5276          405      174331
##   Interest_Rate Loan_Term Approval_Probability Risk_Category
## 1          9.48        60                 0.38          High
## 2         11.29        48                 0.52        Medium
## 3         12.45        24                 0.10          High
## 4          5.01        60                 0.00          High
## 5         10.41        48                 0.72           Low
## 6          3.89        36                 0.43        Medium
# Confirm the dataset dimensions (rows = observations, columns = variables).
dim(loan_data) 
## [1] 2000   10
# Summary statistics for every column of the dataset.
summary(loan_data) 
##    Client_ID           Age        Monthly_Income   Current_Debt  
##  Min.   :   1.0   Min.   :18.00   Min.   : 5138   Min.   :   82  
##  1st Qu.: 500.8   1st Qu.:31.00   1st Qu.:28361   1st Qu.:13004  
##  Median :1000.5   Median :44.00   Median :51854   Median :25431  
##  Mean   :1000.5   Mean   :43.81   Mean   :51956   Mean   :25219  
##  3rd Qu.:1500.2   3rd Qu.:56.00   3rd Qu.:75996   3rd Qu.:37160  
##  Max.   :2000.0   Max.   :69.00   Max.   :99986   Max.   :49996  
##   Credit_Score    Loan_Amount     Interest_Rate      Loan_Term   
##  Min.   :300.0   Min.   :  5028   Min.   : 3.500   Min.   :12.0  
##  1st Qu.:435.8   1st Qu.: 54376   1st Qu.: 6.280   1st Qu.:24.0  
##  Median :572.0   Median :104247   Median : 9.025   Median :36.0  
##  Mean   :575.7   Mean   :102349   Mean   : 9.059   Mean   :36.2  
##  3rd Qu.:720.0   3rd Qu.:148641   3rd Qu.:11.703   3rd Qu.:48.0  
##  Max.   :849.0   Max.   :199839   Max.   :14.990   Max.   :60.0  
##  Approval_Probability Risk_Category     
##  Min.   :0.000        Length:2000       
##  1st Qu.:0.140        Class :character  
##  Median :0.390        Mode  :character  
##  Mean   :0.381                          
##  3rd Qu.:0.590                          
##  Max.   :0.960

# Dependent variable: Credit_Score, used here as the measure of client
# creditworthiness.
#
# Independent variables: Monthly_Income, Current_Debt, Loan_Amount,
# Interest_Rate, and Loan_Term, chosen as the financial factors expected to
# affect a client's ability to repay loans and keep good credit.
#
# Fit an OLS multiple linear regression of Credit_Score on these predictors so
# that the variables relevant to loan approval decisions can be identified.
model <- lm(
  formula = Credit_Score ~ Monthly_Income + Current_Debt + Loan_Amount +
    Interest_Rate + Loan_Term,
  data = loan_data
)

# Coefficient estimates, standard errors, and overall fit statistics.
summary(model)
## 
## Call:
## lm(formula = Credit_Score ~ Monthly_Income + Current_Debt + Loan_Amount + 
##     Interest_Rate + Loan_Term, data = loan_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -291.155 -139.897   -3.713  142.957  283.912 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.773e+02  1.778e+01  32.474   <2e-16 ***
## Monthly_Income -5.823e-05  1.307e-04  -0.446    0.656    
## Current_Debt    3.083e-04  2.520e-04   1.223    0.221    
## Loan_Amount    -2.848e-05  6.510e-05  -0.437    0.662    
## Interest_Rate   6.094e-01  1.100e+00   0.554    0.580    
## Loan_Term      -2.455e-01  2.109e-01  -1.164    0.245    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160.1 on 1994 degrees of freedom
## Multiple R-squared:  0.001827,   Adjusted R-squared:  -0.0006761 
## F-statistic: 0.7299 on 5 and 1994 DF,  p-value: 0.601
# Scatter plot of Credit_Score (y) against Monthly_Income (x).
with(
  loan_data,
  plot(
    Monthly_Income, Credit_Score,
    main = "Credit Score vs Monthly Income",
    xlab = "Monthly Income",
    ylab = "Credit Score"
  )
)

# Overlay the simple-regression fit line (blue) to show the direction and
# strength of the association.
abline(
  lm(Credit_Score ~ Monthly_Income, data = loan_data),
  col = "blue"
)

# Assumption Testing ----

# Multicollinearity test
#
# Variance inflation factors for each predictor in `model`; values close to 1
# mean a predictor is nearly uncorrelated with the others, so each one adds
# unique information to the model.
vif(model)
## Monthly_Income   Current_Debt    Loan_Amount  Interest_Rate      Loan_Term 
##       1.002438       1.004402       1.003655       1.001693       1.001959
# Heteroscedasticity test
#
# Breusch-Pagan test of the null hypothesis that the error variance is
# constant; a small p-value (< 0.05) would indicate heteroscedasticity, which
# makes the usual OLS standard errors unreliable.
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 0.48162, df = 5, p-value = 0.9928
# Normality test
#
# Shapiro-Wilk test of the null hypothesis that the residuals are normally
# distributed; a small p-value (< 0.05) rejects normality. Normal residuals
# support the validity of the hypothesis tests and confidence intervals.
shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.95324, p-value < 2.2e-16

# Monte Carlo Simulation ----

# Predict Credit_Score for simulated "new" customers to evaluate outcome
# variability and support scenario planning.

# Fix the RNG seed so the resample below is reproducible.
set.seed(123)

# Draw 200 rows at random, with replacement, from the observed clients to
# stand in for a range of new customer loan applications.
simulated_data <- sample_n(loan_data, 200, replace = TRUE)

# Predicted Credit_Score for each simulated customer from the fitted model.
simulated_predictions <- predict(model, newdata = simulated_data)

# Histogram of the predicted Credit_Score values for the simulated customers,
# used to see where the low, base, and high predicted outcomes fall.
hist(simulated_predictions, breaks = 20,
     main = "Simulated Credit Ratings",
     xlab = "Predicted Credit Rating")

# Compute the 25th, 50th (median), and 75th percentiles of the predictions;
# these are read as the pessimistic, base, and optimistic scenarios.
quantiles <- quantile(simulated_predictions, probs = c(0.25, 0.5, 0.75))
print(quantiles)
##      25%      50%      75% 
## 571.7754 575.3661 579.8293

# Scenario Interpretation ----

#25th percentile -> Pessimistic

#50th percentile -> Base Scenario

#75th percentile -> Optimistic

# Minimum credit rating for loan approval: the median (50th percentile) of the
# simulated predicted scores, i.e. the base scenario.
# Select the quantile by name rather than by position so the threshold stays
# the median even if the `probs` vector passed to quantile() is reordered or
# extended.
threshold_credit_rating <- quantiles["50%"]

# Report the threshold rounded to two decimals; pasting the raw value printed
# twelve decimal places of spurious precision.
print(paste(
  "Minimum Credit Rating for Approval:",
  sprintf("%.2f", threshold_credit_rating)
))
## [1] "Minimum Credit Rating for Approval: 575.37"

# Triggering Questions ----

#Dependent variable: Credit_Score
#Explanatory variables: Monthly_Income, Current_Debt, Loan_Amount, Interest_Rate, Loan_Term
#Multicollinearity? No (VIF < 5)
#Most appropriate regression model: Multiple linear regression (OLS)
#Multicollinearity? No, all VIF values < 5
#Heteroscedasticity? No, the Breusch-Pagan test p-value is 0.9928 (> 0.05), so constant error variance is not rejected
#Normality? No, Shapiro Wilk test p-value < 0.05  
#Statistically significant? No, none of the five predictors is significant (all p > 0.05; overall F-test p-value = 0.601); only the intercept is
#Economically significant? No, the estimated effects are negligible (R-squared = 0.0018) and the positive Current_Debt coefficient runs against financial logic
#Interpretation of coefficients: Each coefficient reflects the marginal impact of the variable on Credit_Score, holding others constant  
#Valid to simulate with normal distribution? Not ideal due to residual non-normality, used bootstrap resampling.
#How predictions were generated: Using predict() on new resampled (bootstrapped) data  

#Minimum score for loan approval: 575.37