#Load the libraries
#For validation, ggplot2 is used to visualize the relationship among variables and Credit_Score.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
#To conduct the Monte Carlo simulation of loan approvals, dplyr is used to manipulate and sample.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Loaded car to determine VIF, checking multicollinearity among predictors in the model
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
#lmtest to use econometric tests, testing for heteroscedasticity, and validating OLS assumptions
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Read the loan approval dataset from a CSV file in the working directory.
# The first line of the file is treated as the column header.
loan_data <- read.csv(
  file = "Loan_Approval_Dataset (1) (1).csv",
  header = TRUE
)
# Show the first six rows as a quick check that the file was parsed as expected.
head(x = loan_data, n = 6L)
## Client_ID Age Monthly_Income Current_Debt Credit_Score Loan_Amount
## 1 1 56 66788 27990 501 117255
## 2 2 69 87879 14087 511 91322
## 3 3 46 43827 22440 305 49565
## 4 4 32 10655 35205 675 156255
## 5 5 60 49859 15275 746 6132
## 6 6 25 58625 5276 405 174331
## Interest_Rate Loan_Term Approval_Probability Risk_Category
## 1 9.48 60 0.38 High
## 2 11.29 48 0.52 Medium
## 3 12.45 24 0.10 High
## 4 5.01 60 0.00 High
## 5 10.41 48 0.72 Low
## 6 3.89 36 0.43 Medium
# Report the number of rows (observations) and columns (variables).
c(nrow(loan_data), ncol(loan_data))
## [1] 2000 10
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns.
summary(object = loan_data)
## Client_ID Age Monthly_Income Current_Debt
## Min. : 1.0 Min. :18.00 Min. : 5138 Min. : 82
## 1st Qu.: 500.8 1st Qu.:31.00 1st Qu.:28361 1st Qu.:13004
## Median :1000.5 Median :44.00 Median :51854 Median :25431
## Mean :1000.5 Mean :43.81 Mean :51956 Mean :25219
## 3rd Qu.:1500.2 3rd Qu.:56.00 3rd Qu.:75996 3rd Qu.:37160
## Max. :2000.0 Max. :69.00 Max. :99986 Max. :49996
## Credit_Score Loan_Amount Interest_Rate Loan_Term
## Min. :300.0 Min. : 5028 Min. : 3.500 Min. :12.0
## 1st Qu.:435.8 1st Qu.: 54376 1st Qu.: 6.280 1st Qu.:24.0
## Median :572.0 Median :104247 Median : 9.025 Median :36.0
## Mean :575.7 Mean :102349 Mean : 9.059 Mean :36.2
## 3rd Qu.:720.0 3rd Qu.:148641 3rd Qu.:11.703 3rd Qu.:48.0
## Max. :849.0 Max. :199839 Max. :14.990 Max. :60.0
## Approval_Probability Risk_Category
## Min. :0.000 Length:2000
## 1st Qu.:0.140 Class :character
## Median :0.390 Mode :character
## Mean :0.381
## 3rd Qu.:0.590
## Max. :0.960
# Fit an ordinary least squares regression with Credit_Score as the response
# and five financial characteristics of each client as predictors:
# Monthly_Income, Current_Debt, Loan_Amount, Interest_Rate and Loan_Term.
# The fitted object is reused by the diagnostics and predictions further down.
model <- lm(
  formula = Credit_Score ~ Monthly_Income + Current_Debt + Loan_Amount +
    Interest_Rate + Loan_Term,
  data = loan_data
)
# Print the coefficient table, residual summary, R-squared and overall F-test.
summary(object = model)
##
## Call:
## lm(formula = Credit_Score ~ Monthly_Income + Current_Debt + Loan_Amount +
## Interest_Rate + Loan_Term, data = loan_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -291.155 -139.897 -3.713 142.957 283.912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.773e+02 1.778e+01 32.474 <2e-16 ***
## Monthly_Income -5.823e-05 1.307e-04 -0.446 0.656
## Current_Debt 3.083e-04 2.520e-04 1.223 0.221
## Loan_Amount -2.848e-05 6.510e-05 -0.437 0.662
## Interest_Rate 6.094e-01 1.100e+00 0.554 0.580
## Loan_Term -2.455e-01 2.109e-01 -1.164 0.245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160.1 on 1994 degrees of freedom
## Multiple R-squared: 0.001827, Adjusted R-squared: -0.0006761
## F-statistic: 0.7299 on 5 and 1994 DF, p-value: 0.601
# Scatter plot of Credit_Score against Monthly_Income using base graphics.
with(
  loan_data,
  plot(
    x = Monthly_Income,
    y = Credit_Score,
    main = "Credit Score vs Monthly Income",
    xlab = "Monthly Income",
    ylab = "Credit Score"
  )
)
# Overlay the fitted line of a one-predictor regression (drawn in blue) to
# show the direction of the association.
abline(
  reg = lm(Credit_Score ~ Monthly_Income, data = loan_data),
  col = "blue"
)
# Multicollinearity check: variance inflation factor for each predictor.
# Values near 1 mean a predictor is almost uncorrelated with the others.
car::vif(model)
## Monthly_Income Current_Debt Loan_Amount Interest_Rate Loan_Term
## 1.002438 1.004402 1.003655 1.001693 1.001959
# Heteroscedasticity check: studentized Breusch-Pagan test.
# The null hypothesis is constant residual variance; a small p-value would
# indicate that the usual OLS standard errors are unreliable.
lmtest::bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 0.48162, df = 5, p-value = 0.9928
# Normality check: Shapiro-Wilk test applied to the model residuals.
# The null hypothesis is that the residuals are normally distributed; a small
# p-value is evidence against normality.
shapiro.test(x = residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.95324, p-value < 2.2e-16
# Simulate "new" applicants by resampling existing clients, then score them
# with the fitted model.
# Fix the random seed so the resampling is reproducible.
set.seed(seed = 123)
# Draw 200 rows from loan_data at random, with replacement.
simulated_data <- sample_n(loan_data, size = 200, replace = TRUE)
# Predicted Credit_Score for every simulated applicant.
# NOTE(review): predict() returns fitted values only, with no residual noise
# added, so the spread of these predictions reflects predictor variation alone.
simulated_predictions <- predict(object = model, newdata = simulated_data)
# Histogram of the predicted scores (roughly 20 bins).
hist(
  x = simulated_predictions,
  breaks = 20,
  main = "Simulated Credit Ratings",
  xlab = "Predicted Credit Rating"
)
# Quartiles of the predictions: 25th, 50th (median) and 75th percentiles,
# read below as the pessimistic, base and optimistic scenarios.
quantiles <- quantile(x = simulated_predictions, probs = c(0.25, 0.50, 0.75))
print(quantiles)
## 25% 50% 75%
## 571.7754 575.3661 579.8293
#25th percentile -> Pessimistic
#50th percentile -> Base Scenario
#75th percentile -> Optimistic
# Approval cutoff: the median (50th percentile) of the simulated predictions.
threshold_credit_rating <- quantiles["50%"]
# Report the cutoff at full numeric precision.
print(
  paste(
    "Minimum Credit Rating for Approval:",
    threshold_credit_rating,
    sep = " "
  )
)
## [1] "Minimum Credit Rating for Approval: 575.366110004954"
#Dependent variable: Credit_Score
#Explanatory variables: Monthly_Income, Current_Debt, Loan_Amount, Interest_Rate, Loan_Term
#Multicollinearity? No (VIF < 5)
#Most appropriate regression model: Multiple linear regression (OLS)
#Multicollinearity? No, all VIF values < 5
#Heteroscedasticity? No, Breusch-Pagan test p-value = 0.9928 (> 0.05), so constant error variance is not rejected
#Normality? No, Shapiro Wilk test p-value < 0.05
#Statistically significant? No, no slope coefficient has p < 0.05 (all p-values > 0.2) and the overall F-test p-value is 0.601; only the intercept is significant
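#Economically significant? No, R-squared is about 0.002 and the estimated effects are negligible; some signs (negative for Monthly_Income, positive for Current_Debt) run against financial logic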
#Interpretation of coefficients: Each coefficient reflects the marginal impact of the variable on Credit_Score, holding others constant
#Valid to simulate with normal distribution? Not ideal due to residual non-normality, used bootstrap resampling.
#How predictions were generated: Using predict() on new resampled (bootstrapped) data
#Minimum score for loan approval: 575.37 (median of the simulated predictions, 575.3661)