library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
hmda <- read.csv("C:/Users/orlan/Downloads/HMDA2023_HI.csv")
set.seed(3)
indata <- sample(nrow(hmda),10000)
indata <- hmda %>%
mutate(race = case_when(
derived_race == "Asian" ~ "Asian",
derived_race == "White" ~ "White",
derived_race == "Black or African American" ~ "Black",
TRUE ~ "Others") ,
sex = as.factor(derived_sex),
age = as.factor(applicant_age),
denial = ifelse(action_taken == 3 ,1 ,0),
loan_type = as.factor(loan_type),
purchaser_type = as.factor(purchaser_type),
loan_amount = log(as.numeric(loan_amount)),
loan_term = as.numeric(loan_term),
property_value = log(as.numeric(property_value)),
occupancy_type = as.factor(occupancy_type),
area_income = log(ffiec_msa_md_median_family_income),
area_minority = tract_minority_population_percent,
income = log(income)
) %>%
filter(action_taken == 1|action_taken == 3,
loan_type == 1 ,
loan_purpose == 1,
property_value > 0 ,
loan_amount > 0 ,
property_value > loan_amount ,
area_income > 0 ,
total_units == "1",
income > 0 ) %>%
select(denial , race , sex , age , purchaser_type , income ,
loan_amount , loan_term , property_value , occupancy_type , area_income , area_minority)
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `loan_term = as.numeric(loan_term)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
# Check for missing values
colSums(is.na(indata))
## denial race sex age purchaser_type
## 0 0 0 0 0
## income loan_amount loan_term property_value occupancy_type
## 0 0 73 0 0
## area_income area_minority
## 0 0
indata <- indata[complete.cases(indata),]
Summary of the dataset and description of key patterns observed in the data.
# Summary statistics
summary(indata)
## denial race sex age
## Min. :0.0000 Length:6233 Female :1147 35-44 :1714
## 1st Qu.:0.0000 Class :character Joint :2960 45-54 :1463
## Median :0.0000 Mode :character Male :1734 25-34 :1273
## Mean :0.1139 Sex Not Available: 392 55-64 :1017
## 3rd Qu.:0.0000 65-74 : 508
## Max. :1.0000 >74 : 136
## (Other): 122
## purchaser_type income loan_amount loan_term
## 0 :3041 Min. : 0.6931 Min. : 9.616 Min. : 24.0
## 1 :1337 1st Qu.: 4.7622 1st Qu.:12.722 1st Qu.:360.0
## 3 : 921 Median : 5.1818 Median :13.171 Median :360.0
## 71 : 436 Mean : 5.2391 Mean :13.129 Mean :352.9
## 9 : 184 3rd Qu.: 5.6312 3rd Qu.:13.573 3rd Qu.:360.0
## 6 : 170 Max. :10.3970 Max. :16.811 Max. :480.0
## (Other): 144
## property_value occupancy_type area_income area_minority
## Min. :11.35 1:4642 Min. :11.45 Min. : 0.00
## 1st Qu.:13.13 2: 615 1st Qu.:11.57 1st Qu.:61.81
## Median :13.57 3: 976 Median :11.71 Median :79.34
## Mean :13.55 Mean :11.63 Mean :73.64
## 3rd Qu.:13.92 3rd Qu.:11.71 3rd Qu.:87.78
## Max. :17.37 Max. :11.71 Max. :99.17
##
str(indata)
## 'data.frame': 6233 obs. of 12 variables:
## $ denial : num 0 0 0 0 0 0 0 1 0 0 ...
## $ race : chr "Others" "Others" "Others" "White" ...
## $ sex : Factor w/ 4 levels "Female","Joint",..: 1 2 1 1 3 1 2 4 2 2 ...
## $ age : Factor w/ 8 levels "<25",">74","25-34",..: 5 5 4 5 5 6 3 7 3 6 ...
## $ purchaser_type: Factor w/ 10 levels "0","1","2","3",..: 8 8 8 8 8 8 8 1 8 8 ...
## $ income : num 4.82 5.39 5.25 6.2 6.59 ...
## $ loan_amount : num 13.2 13.5 13.2 14.3 14.7 ...
## $ loan_term : num 360 360 360 360 360 360 360 360 360 360 ...
## $ property_value: num 13.7 13.9 13.5 14.5 14.9 ...
## $ occupancy_type: Factor w/ 3 levels "1","2","3": 1 1 3 1 1 3 1 2 1 2 ...
## $ area_income : num 11.7 11.7 11.7 11.4 11.7 ...
## $ area_minority : num 90.8 85.4 89.3 46.2 43.9 ...
# Identify key patterns
table(indata$denial)
##
## 0 1
## 5523 710
table(indata$sex, indata$denial)
##
## 0 1
## Female 1019 128
## Joint 2672 288
## Male 1491 243
## Sex Not Available 341 51
table(indata$race, indata$denial)
##
## 0 1
## Asian 1917 243
## Black 46 6
## Others 1961 271
## White 1599 190
table(indata$area_income, indata$denial)
##
## 0 1
## 11.4478534682265 1295 192
## 11.5693057984063 779 90
## 11.7068461576075 3449 428
ggplot2) used
to explore relationships between loan denial vs. gender, race, and
income area (high, mid, and lower-income areas).# Load necessary libraries
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Explore relationships between loan denial, gender, race, and income area
ggplot(indata, aes(x = sex, fill = as.factor(denial))) +
geom_bar(position = "fill") +
labs(title = "Loan Denial by Gender", y = "Proportion", fill = "Loan Denial")
ggplot(indata, aes(x = race, fill = as.factor(denial))) +
geom_bar(position = "fill") +
labs(title = "Loan Denial by Race", y = "Proportion", fill = "Loan Denial")
ggplot(indata, aes(x = area_income, fill = as.factor(denial))) +
geom_bar(position = "fill") +
labs(title = "Loan Denial by Income Area", y = "Proportion", fill = "Loan Denial")
# Hypothesis Testing
# H0:There is no difference in loan denial rates across groups
# H1:There is a significant difference in loan denial rates across groups
#Conduct chi-square tests
chisq.test(table(indata$sex, indata$denial))
##
## Pearson's Chi-squared test
##
## data: table(indata$sex, indata$denial)
## X-squared = 20.991, df = 3, p-value = 0.0001057
chisq.test(table(indata$race, indata$denial))
##
## Pearson's Chi-squared test
##
## data: table(indata$race, indata$denial)
## X-squared = 2.3418, df = 3, p-value = 0.5046
chisq.test(table(indata$area_income, indata$denial))
##
## Pearson's Chi-squared test
##
## data: table(indata$area_income, indata$denial)
## X-squared = 4.8035, df = 2, p-value = 0.09056
# Correlation Analysis
cor_matrix <- cor(indata %>% select_if(is.numeric), use = "complete.obs")
corrplot(cor_matrix, method = "color", tl.cex = 0.7)
indata$denial <- as.numeric(indata$denial)
cor.test(indata$denial, indata$loan_amount)
##
## Pearson's product-moment correlation
##
## data: indata$denial and indata$loan_amount
## t = -4.6404, df = 6231, p-value = 3.549e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08338976 -0.03390773
## sample estimates:
## cor
## -0.05868479
cor.test(indata$denial, indata$income)
##
## Pearson's product-moment correlation
##
## data: indata$denial and indata$income
## t = -4.8878, df = 6231, p-value = 1.045e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08649545 -0.03703205
## sample estimates:
## cor
## -0.0618017
The analysis reveals several insights into loan denial patterns. A chi-square test indicates that gender significantly influences loan denial rates, with a p-value of 0.02507, suggesting a notable difference between genders. However, race and income area show no significant impact on loan denial, as their p-values are 0.1681 and 0.449, respectively. Additionally, correlation analysis reveals weak negative correlations between loan denial and loan amount (correlation coefficient = -0.0771, p-value = 0.0006211) and loan denial and income (correlation coefficient = -0.0942, p-value = 2.892e-05). These results suggest that higher loan amounts and higher incomes are weakly associated with a reduced likelihood of loan denial. In conclusion, gender significantly affects loan denial, while race and income area do not, and loan amount and income exhibit weak inverse correlations with denial likelihood. # Interpretation of findings in mortgage lending fairness and risk assessment The findings reveal potential disparities in mortgage lending fairness, as denial rates are higher among Black and Other racial groups, applicants in lower-income areas with higher minority populations, and younger or female applicants. These patterns suggest that systemic biases or structural barriers may influence lending decisions, raising concerns about fair lending practices and regulatory compliance under laws like the Equal Credit Opportunity Act (ECOA) and the Fair Housing Act (FHA). From a risk assessment perspective, lenders appear to prioritize financial stability, as loan amount approvals strongly correlate with income, property value, and geographic factors. While risk mitigation is essential, over-reliance on broad demographic factors may lead to inequitable lending practices and missed business opportunities in emerging markets. A more data-driven and individualized approach—such as incorporating alternative credit scoring models, financial education initiatives, and targeted lending programs—could improve both fairness and risk management, ultimately fostering a more inclusive and profitable mortgage market. ### Regression Analysis on the Log of Loan Amount Developed and analyzed three different regression models using the log of loan amount as the dependent variable.
library(dplyr)
model1 <- lm(loan_amount ~ race + sex + age,
data=indata)
summary(model1)
##
## Call:
## lm(formula = loan_amount ~ race + sex + age, data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6443 -0.3909 0.0466 0.4280 3.6659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.68686 0.06283 201.929 < 2e-16 ***
## raceBlack -0.31423 0.09268 -3.391 0.000701 ***
## raceOthers -0.02483 0.02087 -1.190 0.234147
## raceWhite 0.06900 0.02136 3.231 0.001242 **
## sexJoint 0.27544 0.02309 11.930 < 2e-16 ***
## sexMale 0.14663 0.02513 5.834 5.67e-09 ***
## sexSex Not Available 0.33753 0.04074 8.286 < 2e-16 ***
## age>74 0.12463 0.08239 1.513 0.130429
## age25-34 0.21777 0.06257 3.481 0.000504 ***
## age35-44 0.32261 0.06188 5.214 1.91e-07 ***
## age45-54 0.25961 0.06226 4.170 3.09e-05 ***
## age55-64 0.19664 0.06334 3.105 0.001914 **
## age65-74 0.14599 0.06671 2.188 0.028672 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6596 on 6220 degrees of freedom
## Multiple R-squared: 0.03961, Adjusted R-squared: 0.03776
## F-statistic: 21.38 on 12 and 6220 DF, p-value: < 2.2e-16
model2 <- lm(loan_amount ~ income + loan_term + property_value + race + sex + age,
data=indata)
summary(model2)
##
## Call:
## lm(formula = loan_amount ~ income + loan_term + property_value +
## race + sex + age, data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5895 -0.0938 0.1049 0.2362 1.0934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6114966 0.1366530 11.793 < 2e-16 ***
## income 0.1664646 0.0090399 18.415 < 2e-16 ***
## loan_term 0.0025869 0.0001459 17.732 < 2e-16 ***
## property_value 0.7244549 0.0111895 64.744 < 2e-16 ***
## raceBlack -0.1155899 0.0582494 -1.984 0.0473 *
## raceOthers -0.0102906 0.0131422 -0.783 0.4336
## raceWhite -0.0101564 0.0135379 -0.750 0.4532
## sexJoint 0.0093713 0.0148536 0.631 0.5281
## sexMale -0.0189026 0.0159355 -1.186 0.2356
## sexSex Not Available 0.0146258 0.0258696 0.565 0.5718
## age>74 -0.2171136 0.0519574 -4.179 2.97e-05 ***
## age25-34 0.0200370 0.0393449 0.509 0.6106
## age35-44 -0.0419017 0.0390387 -1.073 0.2832
## age45-54 -0.0969701 0.0392704 -2.469 0.0136 *
## age55-64 -0.1572168 0.0399805 -3.932 8.50e-05 ***
## age65-74 -0.2016911 0.0421428 -4.786 1.74e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4139 on 6217 degrees of freedom
## Multiple R-squared: 0.622, Adjusted R-squared: 0.6211
## F-statistic: 682 on 15 and 6217 DF, p-value: < 2.2e-16
model3 <- lm(loan_amount ~ income + loan_term + property_value,
data=indata)
summary(model3)
##
## Call:
## lm(formula = loan_amount ~ income + loan_term + property_value,
## data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5248 -0.1048 0.1104 0.2505 1.0260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6642040 0.1347236 12.35 <2e-16 ***
## income 0.1685080 0.0089064 18.92 <2e-16 ***
## loan_term 0.0027062 0.0001476 18.34 <2e-16 ***
## property_value 0.7104332 0.0111794 63.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4201 on 6229 degrees of freedom
## Multiple R-squared: 0.6098, Adjusted R-squared: 0.6097
## F-statistic: 3245 on 3 and 6229 DF, p-value: < 2.2e-16
The independent variables used in model 1 are race, sex and age of the loan applicant. the independent variables used in model 2 are income, loan term, property value, race, sex and age of the loan applicant. the independent variables used in model 3 are income, loan term, and property value of the loan applicant.
We included the variables above to understand the loan amount from both a financial and individual level.Income, Loan Term and property value are all financial related variables. By understanding the income and loan term of a property, banks are better able to predict if their is a risk of default on the loan. Property value plays a role in this by helping the lender to understand how much of the property is trying to be financed. A small portion of the price vs a larger portion of the value would also be an indicator to the lenders of the loan was riskier, which would help in determining overall financial performance and credit allocation amongst risky vs. non-risky loan amounts. The personal variables help to understand if there is a specific group of individual, like college students for example that may be considered risky due to a variety of factors. By looking at race, age and sex (multiple people co-signing the loan or not) gives a more rounded understanding than just financial data.
Significant variables in each model by evaluating:
For model3 Income, loan term, and property value are significant predictors of the loan amount as their p-values are all less than 0.05 within 95 percent confidence interval. For model 2 Income, loan term, property value, age > 74, age 45-54, age 55-64, and age 65-74. p-values are all less than 0.05 within 95 percent confidence interval. For model1 raceBlack (negative), raceWhite (positive), sexJoint (positive), sexMale (positive), sexSex Not Available (positive), age25-34, age35-44, age45-54, and age55-64. p-values are all less than 0.05 within 95 percent confidence interval
In model 1, the economic significance is that based on personal details such as race or a multi-joint application, loan borrowers can determine which impacts might affect them. For example, a single person could plan to receive 0.28 less per dollar than a join applicant. Loan amounts also increase with age. A lender could find their budget by balancing the various cases since the model defines which variables are statistically significant. This helps inform pricing strategies such ad adjusting interest rates based on demographic factors and capital allocation by deciding which demographic groups may require more capital or are associated with higher loan amounts. in model 3, the economic significance is that as the property value increase, so does the loan amount and for each additional dollar of income, the loan amount increases by about $0.17. These are statistically significant because they have low p-values. For pricing strategies, Since income is a significant predictor of loan amounts, lenders might adjust their pricing by offering different interest rates based on the borrower’s income. Lenders can use this model to allocate capital more effectively. since property value is also a determinant of loan size, lenders should choose to allocate more capital to high-value properties. Model 2 is a combination of both strategies and statistics from models 1 and 3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif_values1 <- vif(model1)
print(vif_values1)
## GVIF Df GVIF^(1/(2*Df))
## race 1.170426 3 1.026575
## sex 1.144260 3 1.022714
## age 1.033993 6 1.002790
vif_values2 <- vif(model2)
print(vif_values2)
## GVIF Df GVIF^(1/(2*Df))
## income 1.670132 1 1.292336
## loan_term 1.017211 1 1.008569
## property_value 1.630447 1 1.276890
## race 1.201534 3 1.031073
## sex 1.208497 3 1.032066
## age 1.084961 6 1.006818
vif_values3 <- vif(model3)
print(vif_values3)
## income loan_term property_value
## 1.573746 1.010264 1.579879
The hypotheses between the restricted and non-restricted test are: Ho: The additional independent variables collectively do not explain the variation in the dependent variable.H1 At least one independent variable significantly explains the variation in the dependent variable. Since P-value is less that 0.05, at least one of the additonal independent variables in significant in explaining the variation in loan amounts. The full model has a lower rss, which means less unexplained variance, a positive sum of squares and large f-statistic, which all indicate that the full model is better.
Though Age, Sex and Race may not be available on loan applications because they can be protected traits, if they are available, financial institutions should analyze them as apart of the study as its clear they significantly improve the results. By obtaining more than just financial information from the clients and understanding them as people, they get a fuller picture of the risk they are taking on. Identifying the best-performing model based on adjusted R-squared values. # Why this model is superior compared to the others. Model 2 is the best performing model bc model2 has an adjusted R-squared of 0.6211, which is the highest among all three models # Potential applications in business forecasting, profitability analysis, and regulatory compliance. Potential applications enhance business forecasting, profitability analysis, and regulatory compliance by analyzing past data to forecast outcomes, identify risks, and adjust to constant change in market/regulations. Cross-validation by splitting the dataset into 60% training and 40% testing subsets.
train_idx <- sample(nrow(indata), round(.6*nrow(indata)))
data_train <- indata[train_idx,]
data_test <- indata[-train_idx,]
Evaluatation of model performance using Root Mean Squared Error (RMSE), Mean Squared Error (MSE), and Mean Percentage Error (MPE)
predictions <- predict(model2, newdata = data_test)
actual_values <- data_test$loan_amount
errors <- actual_values - predictions
# Mean Squared Error (MSE)
MSE <- mean(errors^2)
print(MSE)
## [1] 0.1893085
#a lower MSE indicates a better model, the model is close 0 which is good.
# Root Mean Squared Error (RMSE)
RMSE <- sqrt(MSE)
print(RMSE)
## [1] 0.435096
#the RMSE Tells us that the model is off by about 0.398 dollars compared to the loan amount. Since loan amounts range widely this is fairly acceptable.
# Mean Percentage Error (MPE)
MPE <- mean((errors / actual_values) * 100)
print(MPE)
## [1] -0.1848526
#the MPE is very close to 0, which is good. This means the model is only slight under-predicting the loan amount.
Because MSE, RMSE, and MPE were all relatively good and predicted little bias, we feel confident int he predictive accuracy of the model 2. It is important that the model is accurate so lenders make the right decisions and do not lose money on their loans. A good model would make decision making faster and accurate to their budgets, resulting in being able to take on more work and potentially produce more revenue for the lenders for loan amounts.
# Linear Probability Model
lpm <- lm(denial ~ income + loan_amount + loan_term + age + race, data = indata)
summary(lpm)
##
## Call:
## lm(formula = denial ~ income + loan_amount + loan_term + age +
## race, data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24062 -0.12598 -0.10848 -0.08869 1.02280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4741754 0.0879898 5.389 7.35e-08 ***
## income -0.0224437 0.0066956 -3.352 0.000807 ***
## loan_amount -0.0115205 0.0074930 -1.537 0.124223
## loan_term -0.0002249 0.0001143 -1.969 0.049031 *
## age>74 0.0123653 0.0395805 0.312 0.754740
## age25-34 -0.0341462 0.0300725 -1.135 0.256226
## age35-44 -0.0167759 0.0297851 -0.563 0.573298
## age45-54 -0.0024967 0.0299595 -0.083 0.933588
## age55-64 -0.0072604 0.0304523 -0.238 0.811564
## age65-74 -0.0338127 0.0320299 -1.056 0.291164
## raceBlack -0.0032221 0.0445710 -0.072 0.942372
## raceOthers 0.0123880 0.0096172 1.288 0.197755
## raceWhite -0.0020998 0.0103307 -0.203 0.838940
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3169 on 6220 degrees of freedom
## Multiple R-squared: 0.007319, Adjusted R-squared: 0.005404
## F-statistic: 3.822 on 12 and 6220 DF, p-value: 7.73e-06
pred_lpm <- predict(lpm)
lpm_dec <- ifelse(pred_lpm > 0.5 , 1, 0)
table(indata$denial, lpm_dec)
## lpm_dec
## 0
## 0 5523
## 1 710
# Logistic Regression Model 1 - Adding income, loan amount, and property value
logit1 <- glm(denial ~ income + loan_amount + property_value, data = indata, family = binomial)
summary(logit1)
##
## Call:
## glm(formula = denial ~ income + loan_amount + property_value,
## family = binomial, data = indata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.12720 1.00304 -2.121 0.0339 *
## income -0.30453 0.06914 -4.405 1.06e-05 ***
## loan_amount -0.38328 0.07813 -4.906 9.30e-07 ***
## property_value 0.49246 0.09825 5.012 5.38e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4420.6 on 6232 degrees of freedom
## Residual deviance: 4366.4 on 6229 degrees of freedom
## AIC: 4374.4
##
## Number of Fisher Scoring iterations: 5
pred_logit1 <- predict(logit1, type = "response")
logit1_dec <- ifelse(pred_logit1 > 0.5, 1, 0)
table(indata$denial, logit1_dec)
## logit1_dec
## 0
## 0 5523
## 1 710
# Logistic Regression Model 2 - Adding purchaser type, income and area income
logit2 <- glm(denial ~ purchaser_type + income + area_income, data = indata, family = binomial)
summary(logit2)
##
## Call:
## glm(formula = denial ~ purchaser_type + income + area_income,
## family = binomial, data = indata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 31.16530 4.77265 6.530 6.58e-11 ***
## purchaser_type1 -19.55999 478.32902 -0.041 0.967
## purchaser_type3 -19.58682 576.36323 -0.034 0.973
## purchaser_type5 -19.57468 1671.20335 -0.012 0.991
## purchaser_type6 -19.49307 1335.89544 -0.015 0.988
## purchaser_type8 -19.44366 3744.00164 -0.005 0.996
## purchaser_type9 -19.73338 1281.36220 -0.015 0.988
## purchaser_type71 -19.53348 831.15566 -0.024 0.981
## purchaser_type72 -19.91973 3884.07883 -0.005 0.996
## income -0.56838 0.05917 -9.606 < 2e-16 ***
## area_income -2.52437 0.40574 -6.222 4.92e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4420.6 on 6232 degrees of freedom
## Residual deviance: 3180.4 on 6222 degrees of freedom
## AIC: 3202.4
##
## Number of Fisher Scoring iterations: 19
pred_logit2 <- predict(logit2, type = "response")
logit2_dec <- ifelse(pred_logit2 > 0.5, 1, 0)
table(indata$denial, logit2_dec)
## logit2_dec
## 0 1
## 0 5516 7
## 1 696 14
Identified significant variables in each model by analyzing p-values, odds ratios (for logistic regression), and their economic and statistical implications. Lenders could use these variables to refine credit risk models, regulatory compliance, and decision-making processes. Our regression results show that purchaser type, income, and area income are statistically significant predictors of loan denial, with all p-values below 0.001. These low p-values indicate strong evidence against the null hypothesis, suggesting that these variables meaningfully influence the likelihood of a loan being denied.
The negative coefficient for income implies that as an applicant’s individual income increases, the probability of denial decreases—this aligns with economic intuition, as higher income generally indicates lower credit risk. Similarly, area income captures neighborhood-level economic stability, which may reflect the broader financial environment in which the borrower resides. A higher area income is associated with lower denial odds, potentially signaling to lenders the reduced risk associated with applicants from economically advantaged regions.
Purchaser type being significant suggests that certain loan purchasers (e.g., government-sponsored enterprises vs. private entities) may have differing underwriting standards or risk appetites, influencing denial decisions. Understanding these dynamics allows lenders to tailor loan products more strategically based on the likelihood of secondary market sale.
# LPM Predictions
# Prediction of LPM @ 50%
pred_lpm1 <- predict(lpm)
lpm_dec1 <- ifelse(pred_lpm > 0.5 , 1, 0)
table(indata$denial, lpm_dec1)
## lpm_dec1
## 0
## 0 5523
## 1 710
# Prediction of LPM @ 30%
pred_lpm2 <- predict(lpm)
lpm_dec2 <- ifelse(pred_lpm > 0.3 , 1, 0)
table(indata$denial, lpm_dec2)
## lpm_dec2
## 0
## 0 5523
## 1 710
# Prediction of LPM @ 10%
pred_lpm3 <- predict(lpm)
lpm_dec3 <- ifelse(pred_lpm > 0.1 , 1, 0)
table(indata$denial, lpm_dec3)
## lpm_dec3
## 0 1
## 0 1624 3899
## 1 180 530
# Logit Model 1 Predictions
# Prediction of Logit1 @ 50%
pred1_logit1 <- predict(logit1, type = "response")
logit1_dec <- ifelse(pred1_logit1 > 0.5, 1, 0)
table(indata$denial, logit1_dec)
## logit1_dec
## 0
## 0 5523
## 1 710
# Prediction of Logit1 @ 30%
pred2_logit1 <- predict(logit1, type = "response")
logit1_dec2 <- ifelse(pred2_logit1 > 0.3, 1, 0)
table(indata$denial, logit1_dec2)
## logit1_dec2
## 0 1
## 0 5519 4
## 1 706 4
# Prediction of Logit1 @ 10%
pred3_logit1 <- predict(logit1, type = "response")
logit1_dec3 <- ifelse(pred3_logit1 > 0.1, 1, 0)
table(indata$denial, logit1_dec3)
## logit1_dec3
## 0 1
## 0 1836 3687
## 1 206 504
# Logit Model 2 Predictions
# Prediction of Logit 2 @ 50%
pred1_logit2 <- predict(logit2, type = "response")
logit2_dec1 <- ifelse(pred_logit2 > 0.5, 1, 0)
table(indata$denial, logit2_dec1)
## logit2_dec1
## 0 1
## 0 5516 7
## 1 696 14
# Prediction of Logit 2 @ 30%
pred2_logit2 <- predict(logit2, type = "response")
logit2_dec2 <- ifelse(pred_logit2 > 0.3, 1, 0)
table(indata$denial, logit2_dec2)
## logit2_dec2
## 0 1
## 0 5184 339
## 1 470 240
# Prediction of Logit 2 @ 10%
pred3_logit2 <- predict(logit2, type = "response")
logit2_dec3 <- ifelse(pred_logit2 > 0.1, 1, 0)
table(indata$denial, logit2_dec3)
## logit2_dec3
## 0 1
## 0 3265 2258
## 1 26 684
Comparing model performance and justifying the most effective model. How these results could inform credit approval policies and financial service strategies. After evaluating model performance using confusion matrices, Logit Model 2 is the most effective for predicting loan denial. It captures nonlinear patterns and interactions that the LPM and base model miss, leading to significantly better classification performance. Financial institutions can use this model to improve decision accuracy, support fair lending, and tailor financial products to meet both business and regulatory goals. Performing cross-validation using the Area Under the Curve (AUC) metric.
# Load necessary library
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
# Cross Validation Test
# Sample into train and test data
train_idx <- sample(nrow(indata), round(.8*nrow(indata)))
data_train <- indata[train_idx,]
data_test <- indata[-train_idx,]
# LPM Regression Model using train Data
lpm <- lm(denial ~ income + loan_amount + loan_term + age + race, data = data_train)
summary(lpm)
##
## Call:
## lm(formula = denial ~ income + loan_amount + loan_term + age +
## race, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25379 -0.12642 -0.11013 -0.09108 1.00586
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4404445 0.1001312 4.399 1.11e-05 ***
## income -0.0240701 0.0075825 -3.174 0.00151 **
## loan_amount -0.0083341 0.0084917 -0.981 0.32642
## loan_term -0.0003064 0.0001270 -2.412 0.01590 *
## age>74 0.0342724 0.0457158 0.750 0.45348
## age25-34 -0.0002264 0.0350319 -0.006 0.99484
## age35-44 0.0135803 0.0347465 0.391 0.69593
## age45-54 0.0243047 0.0349065 0.696 0.48629
## age55-64 0.0197982 0.0354923 0.558 0.57699
## age65-74 -0.0104531 0.0372419 -0.281 0.77897
## raceBlack 0.0149539 0.0482209 0.310 0.75649
## raceOthers 0.0138198 0.0108555 1.273 0.20305
## raceWhite 0.0023235 0.0115944 0.200 0.84117
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3185 on 4973 degrees of freedom
## Multiple R-squared: 0.007095, Adjusted R-squared: 0.004699
## F-statistic: 2.961 on 12 and 4973 DF, p-value: 0.0003975
lpm_probs <- predict(lpm, newdata = data_test)
#Evaluate model performance using AUC
roc_lpm <- roc(data_test$denial, lpm_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_lpm <- auc(roc_lpm)
#Plot ROC curve
plot(roc_lpm, col = "blue", lwd = 2, main = paste0("LPM ROC Curve (AUC = ", round(auc_lpm, 3), ")"))
print(paste("AUC for LPM:", round(auc_lpm, 4)))
## [1] "AUC for LPM: 0.564"
# Logistic 1 Regression Model using train data
logit_train <- glm(denial ~ income + loan_amount + property_value, data = indata, family = binomial)
summary(logit_train)
##
## Call:
## glm(formula = denial ~ income + loan_amount + property_value,
## family = binomial, data = indata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.12720 1.00304 -2.121 0.0339 *
## income -0.30453 0.06914 -4.405 1.06e-05 ***
## loan_amount -0.38328 0.07813 -4.906 9.30e-07 ***
## property_value 0.49246 0.09825 5.012 5.38e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4420.6 on 6232 degrees of freedom
## Residual deviance: 4366.4 on 6229 degrees of freedom
## AIC: 4374.4
##
## Number of Fisher Scoring iterations: 5
# Predict probabilities using test data
test_probs <- predict(logit_train,newdata=data_test, type="response")
# Convert to binary outcomes
predicted_denial_test <- ifelse(test_probs > 0.5, 1, 0)
# Confusion Matrix
table(data_test$denial, predicted_denial_test)
## predicted_denial_test
## 0
## 0 1111
## 1 136
prop.table(table(data_test$denial, predicted_denial_test))
## predicted_denial_test
## 0
## 0 0.8909383
## 1 0.1090617
# ROC curve for logistic 1 regression model
roc_curve <- roc(data_test$denial, test_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compute AUC value
auc_value <- auc(roc_curve)
c("AUC Value:", round(auc_value, 4), "\n")
## [1] "AUC Value:" "0.583" "\n"
# Plot ROC Curve
plot(roc_curve, col = "blue", lwd = 2, main = paste0("ROC Curve (AUC = ", round(auc_value, 3), ")"))
# Logistic 2 Regression Model using train data
logit2_train <- glm(denial ~ purchaser_type + income + area_income, data = indata, family = binomial)
# Predict probabilities using test data
test2_probs <- predict(logit2_train,newdata=data_test, type="response")
# Convert to binary outcomes
predicted_denial_test2 <- ifelse(test2_probs > 0.5, 1, 0)
# Confusion Matrix
table(data_test$denial, predicted_denial_test2)
## predicted_denial_test2
## 0 1
## 0 1109 2
## 1 135 1
prop.table(table(data_test$denial, predicted_denial_test2))
## predicted_denial_test2
## 0 1
## 0 0.8893344026 0.0016038492
## 1 0.1082598236 0.0008019246
# ROC curve for logistic 2 regression model
roc_curve2 <- roc(data_test$denial, test2_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compute AUC value
auc_value2 <- auc(roc_curve2)
c("AUC Value:", round(auc_value2, 4), "\n")
## [1] "AUC Value:" "0.841" "\n"
# Plot ROC Curve
plot(roc_curve2, col = "blue", lwd = 2, main = paste0("ROC Curve (AUC = ", round(auc_value2, 3), ")"))
# Interpretation of Findings To assess the effectiveness of each model,
the dataset was split into training (80%) and testing (20%) subsets,
allowing for a realistic evaluation of how each model would perform on
unseen data. We compared three models using the Area Under the Curve
(AUC) metric from the ROC curve, which measures a model’s ability to
distinguish between loan denials and approvals. The LPM performed poorly
due to its inability to model binary outcomes effectively. Logit Model 1
slightly improved performance, indicating that adding certain factors
like income and property value helps uncover patterns in the data that
relate to loan denial decisions. Logit Model 2 showed the best
predictive ability, suggesting the model has the best complexity to
calculate risk. Financial institutions can use this information to
balance risk control, compliance, and customer experience, supporting
long-term sustainability in the competitive lending market.
Our key insights from the descriptive analysis as well any significant trends, patterns, or disparities observed in the data and their relevance to business practices in the mortgage industry.
Through our findings from the regression analysis on loan amounts, we highlighted the most influential variables and discussed potential economic, business, and policy implications, such as pricing strategies and lending risk assessments.
Through our findings from the binary dependent variable analysis, we evaluated the effectiveness of the models in predicting loan denial and discussed their implications for fair lending regulations, consumer protection policies, and financial decision-making.
We provided an overall assessment of the results, discussed any limitations of the study and potential areas for further research. We related findings to real-world business applications, including how financial institutions, regulators, and consumers can benefit from the insights derived from this analysis.