# Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(moments)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
# Load Data
hmda <- read.csv("~/Downloads/HMDA2023_MT.csv")
set.seed(38503)
indata <- sample(nrow(hmda),10000)
indata <- hmda %>%
mutate(race=case_when(
derived_race == "Asian" ~ "Asian",
derived_race == "White" ~ "White",
derived_race == "Black or African American" ~ "Black",
TRUE ~ "Others"),
sex = as.factor(derived_sex),
age = as.factor(applicant_age),
denial = ifelse(action_taken == 3,1,0),
loan_type = as.factor(loan_type),
purchaser_type = as.factor(purchaser_type),
loan_amount = log(as.numeric(loan_amount)),
loan_term = as.numeric(loan_term),
property_value = log(as.numeric(property_value)),
occupancy_type = as.factor(occupancy_type),
area_income = log(ffiec_msa_md_median_family_income),
area_minority = tract_minority_population_percent,
income = log(income)
) %>%
filter(action_taken==1 | action_taken ==3,
loan_type == 1,
loan_purpose== 1,
property_value > 0,
loan_amount > 0,
property_value > loan_amount,
area_income > 0,
total_units == "1",
income > 0) %>%
select(denial, race, sex, age, purchaser_type, income, loan_amount,
loan_term, property_value, occupancy_type, area_minority, area_income)
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `loan_term = as.numeric(loan_term)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
# Check any missing cases by column
colSums(is.na(indata))
## denial race sex age purchaser_type
## 0 0 0 0 0
## income loan_amount loan_term property_value occupancy_type
## 0 0 10 0 0
## area_minority area_income
## 0 0
indata <- indata[complete.cases(indata),]
table(indata$denial)
##
## 0 1
## 6896 701
summary(indata)
## denial race sex age
## Min. :0.00000 Length:7597 Female :1463 25-34 :2015
## 1st Qu.:0.00000 Class :character Joint :3742 35-44 :1982
## Median :0.00000 Mode :character Male :1976 45-54 :1372
## Mean :0.09227 Sex Not Available: 416 55-64 :1094
## 3rd Qu.:0.00000 65-74 : 581
## Max. :1.00000 <25 : 367
## (Other): 186
## purchaser_type income loan_amount loan_term
## 0 :2347 Min. : 1.099 Min. : 8.517 Min. : 4.0
## 1 :1983 1st Qu.: 4.369 1st Qu.:12.231 1st Qu.:360.0
## 3 :1443 Median : 4.804 Median :12.692 Median :360.0
## 71 : 897 Mean : 4.885 Mean :12.575 Mean :333.5
## 9 : 488 3rd Qu.: 5.278 3rd Qu.:13.071 3rd Qu.:360.0
## 6 : 337 Max. :10.228 Max. :16.487 Max. :480.0
## (Other): 102
## property_value occupancy_type area_minority area_income
## Min. : 9.616 1:6498 Min. : 0.00 Min. :11.35
## 1st Qu.:12.692 2: 555 1st Qu.: 9.58 1st Qu.:11.35
## Median :13.071 3: 544 Median :11.80 Median :11.35
## Mean :13.075 Mean :13.28 Mean :11.39
## 3rd Qu.:13.452 3rd Qu.:14.08 3rd Qu.:11.46
## Max. :17.148 Max. :89.73 Max. :11.52
##
summary_statistics <-
summary(indata)
print(summary_statistics)
## denial race sex age
## Min. :0.00000 Length:7597 Female :1463 25-34 :2015
## 1st Qu.:0.00000 Class :character Joint :3742 35-44 :1982
## Median :0.00000 Mode :character Male :1976 45-54 :1372
## Mean :0.09227 Sex Not Available: 416 55-64 :1094
## 3rd Qu.:0.00000 65-74 : 581
## Max. :1.00000 <25 : 367
## (Other): 186
## purchaser_type income loan_amount loan_term
## 0 :2347 Min. : 1.099 Min. : 8.517 Min. : 4.0
## 1 :1983 1st Qu.: 4.369 1st Qu.:12.231 1st Qu.:360.0
## 3 :1443 Median : 4.804 Median :12.692 Median :360.0
## 71 : 897 Mean : 4.885 Mean :12.575 Mean :333.5
## 9 : 488 3rd Qu.: 5.278 3rd Qu.:13.071 3rd Qu.:360.0
## 6 : 337 Max. :10.228 Max. :16.487 Max. :480.0
## (Other): 102
## property_value occupancy_type area_minority area_income
## Min. : 9.616 1:6498 Min. : 0.00 Min. :11.35
## 1st Qu.:12.692 2: 555 1st Qu.: 9.58 1st Qu.:11.35
## Median :13.071 3: 544 Median :11.80 Median :11.35
## Mean :13.075 Mean :13.28 Mean :11.39
## 3rd Qu.:13.452 3rd Qu.:14.08 3rd Qu.:11.46
## Max. :17.148 Max. :89.73 Max. :11.52
##
# Summary of data- grouped by occupancy type, comparing denial and income
summary_occupancy <- indata %>%
group_by(occupancy_type) %>%
summarize(
Denial_mean = mean(denial, na.rm = TRUE),
Denial_median = median(denial, na.rm = TRUE),
Denial_sd = sd(denial, na.rm = TRUE),
Denial_skew = skewness(denial, na.rm = TRUE),
Denial_kurt = kurtosis(denial, na.rm = TRUE),
Denial_CV = Denial_sd / Denial_mean,
Income_mean = mean(income, na.rm = TRUE),
Income_median = median(income, na.rm = TRUE),
Income_sd = sd(income, na.rm = TRUE),
Income_skew = skewness(income, na.rm = TRUE),
Income_kurt = kurtosis(income, na.rm = TRUE),
Income_CV = Income_sd / Income_mean
)
print(summary_occupancy)
## # A tibble: 3 × 13
## occupancy_type Denial_mean Denial_median Denial_sd Denial_skew Denial_kurt
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.0877 0 0.283 2.91 9.50
## 2 2 0.124 0 0.330 2.28 6.19
## 3 3 0.114 0 0.318 2.43 6.90
## # ℹ 7 more variables: Denial_CV <dbl>, Income_mean <dbl>, Income_median <dbl>,
## # Income_sd <dbl>, Income_skew <dbl>, Income_kurt <dbl>, Income_CV <dbl>
The denial rate is highest for Occupancy Type 2 (12.4%), followed by Type 3 (11.4%) and Type 1 (8.8%).
Implication: Depending on the type of occupancy, lenders may modify risk assessments or pricing.
The majority of applicants are between the ages of 25 and 44, and 49% of them are joint borrowers, suggesting that their households have two incomes.
Implication: Age-specific requirements and affordability can be met by customized mortgage packages.
30-year periods and high loan-to-value ratios point to financial limitations.
Implication: Lenders might provide flexible repayment plans or tighten loan-to-income ratios.
Disparities in loan availability may exist in some minority-heavy areas (up to 89.7%).
Implication: Inclusivity can be enhanced by bolstering fair lending policies and financial literacy initiatives.
Financial institutions can optimize loan offerings, improve risk models, and advance financial inclusion while maintaining profitability by taking these aspects into consideration.
ggplot2) to explore relationships between loan denial
vs. gender, race, and income area (high, mid, and lower-income
areas).
# Loan Denial by Sex
ggplot(indata, aes(x = sex, fill = as.factor(denial))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Loan Denial by Gender", x = "Gender", y = "Proportion") +
theme_minimal()
# Loan Denial by Race
ggplot(indata, aes(x = race, fill = as.factor(denial))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Loan Denial by Race", x = "Race", y = "Proportion") +
theme_minimal()
# Loan Denial by Income Area
indata <- indata %>%
mutate(income_area = factor(area_income, levels = c("Low", "Mid", "High")))
ggplot(indata, aes(x = area_income, fill = as.factor(denial))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Loan Denial by Income Area",
x = "Income Area",
y = "Proportion",
fill = "Denial Status") +
scale_fill_manual(values = c("0" = "blue", "1" = "red"),
labels = c("Approved", "Denied")) +
theme_minimal()
### Intreprtation of Findings 1. Loan Denial by Gender
The loan denial rates for various gender categories are examined in the first visualization.
The denial percentages for male and female applications are comparable, however the “Sex Not Available” category seems to have a little greater percentage of denials.
It’s possible that combined incomes increase creditworthiness, which explains why joint applications have the lowest denial rates.
Differences in loan denial rates between various racial groupings are shown in the second visualization.
Some racial groupings seem to have a larger percentage of loan denials (blue) than others.
White candidates appear to have the lowest rejection rate, while Black applicants appear to have the highest. Due to institutional biases, variations in creditworthiness, or other causes, this may indicate racial discrepancies in loan approval procedures.
The third graphic displays the percentage of loans denied in various income brackets.
The majority of applications were accepted (blue), while a tiny portion were rejected (red).
The fact that the rejection rate is largely constant across income levels suggests that income level might not be the main determinant in loan denials.
Fair Lending Practices: Racial differences in loan rejections point to possible prejudices, necessitating a review of policies to guarantee equity and adherence to anti-discrimination legislation.
Enhancements to Risk Assessment: In order to avoid inadvertent disadvantages for specific groups, lenders should examine credit models. Machine learning methods and alternative credit data may lessen biases.
Financial Education & Outreach: Programs that teach financial literacy about credit building and loan applications are necessary, as evidenced by higher refusal rates in particular populations.
Regulatory Compliance & Transparency: While increased loan process transparency fosters accountability and confidence, internal audits can guarantee equitable lending.
# Chi-Square Test (Denial by Race)
chisq_test_race <- chisq.test(table(indata$denial, indata$race))
## Warning in chisq.test(table(indata$denial, indata$race)): Chi-squared
## approximation may be incorrect
print(chisq_test_race)
##
## Pearson's Chi-squared test
##
## data: table(indata$denial, indata$race)
## X-squared = 18.675, df = 3, p-value = 0.0003192
X^2 = 18.675
Degree of Freedom = 3
P-value = 0.0003192
- Since the P value is lower than the conventional threshold of 0.05, we reject the null hypothesis. This means that there is a significant relationship between race and loan denial rates.
# Loan Amounts and Continuous Variables
cor(indata[c("loan_amount", "income", "property_value", "loan_term")])
## loan_amount income property_value loan_term
## loan_amount 1.0000000 0.56829929 0.7153256 0.41251086
## income 0.5682993 1.00000000 0.6438250 0.08230416
## property_value 0.7153256 0.64382501 1.0000000 0.13467394
## loan_term 0.4125109 0.08230416 0.1346739 1.00000000
# Loan Denial and Other Relevant Variables
chisq.test(indata$denial, indata$sex)
##
## Pearson's Chi-squared test
##
## data: indata$denial and indata$sex
## X-squared = 35.367, df = 3, p-value = 1.019e-07
chisq.test(indata$denial, indata$race)
## Warning in chisq.test(indata$denial, indata$race): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: indata$denial and indata$race
## X-squared = 18.675, df = 3, p-value = 0.0003192
***Chi-square test reveal: - For loan denial and sex there is (p < 0.001) statistically significant association, a need for further investigation for fairness - For loan denial and race there is (p < 0.0003) significant disparity, there is a concern for potential regulatory issues.
# Model 1 - Income and Property Value
model1 <- lm(loan_amount ~ income + property_value, data = indata)
summary(model1)
##
## Call:
## lm(formula = loan_amount ~ income + property_value, data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8939 -0.0931 0.1888 0.3384 1.1267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.00938 0.15036 6.713 2.05e-11 ***
## income 0.21389 0.01193 17.929 < 2e-16 ***
## property_value 0.80461 0.01384 58.144 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6173 on 7594 degrees of freedom
## Multiple R-squared: 0.5315, Adjusted R-squared: 0.5314
## F-statistic: 4308 on 2 and 7594 DF, p-value: < 2.2e-16
# Model 2
model2 <- lm(loan_amount ~ income + property_value + sex + race, data = indata)
summary(model2)
##
## Call:
## lm(formula = loan_amount ~ income + property_value + sex + race,
## data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8858 -0.0926 0.1885 0.3377 1.1360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.035797 0.165541 6.257 4.14e-10 ***
## income 0.214328 0.012086 17.733 < 2e-16 ***
## property_value 0.804314 0.013875 57.968 < 2e-16 ***
## sexJoint 0.004271 0.019672 0.217 0.8281
## sexMale 0.025808 0.021504 1.200 0.2301
## sexSex Not Available 0.074381 0.040282 1.847 0.0649 .
## raceBlack -0.466741 0.197995 -2.357 0.0184 *
## raceOthers -0.064040 0.070844 -0.904 0.3660
## raceWhite -0.032619 0.067605 -0.482 0.6295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6171 on 7588 degrees of freedom
## Multiple R-squared: 0.5322, Adjusted R-squared: 0.5317
## F-statistic: 1079 on 8 and 7588 DF, p-value: < 2.2e-16
# Model 3
model3 <- lm(loan_amount ~ income + property_value + sex + race + loan_term, data = indata)
summary(model3)
##
## Call:
## lm(formula = loan_amount ~ income + property_value + sex + race +
## loan_term, data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6083 -0.1207 0.1236 0.2626 1.8277
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.136e-01 1.469e-01 2.815 0.00489 **
## income 2.169e-01 1.068e-02 20.306 < 2e-16 ***
## property_value 7.431e-01 1.233e-02 60.250 < 2e-16 ***
## sexJoint 1.472e-02 1.739e-02 0.847 0.39723
## sexMale 2.839e-02 1.900e-02 1.494 0.13525
## sexSex Not Available 9.705e-02 3.560e-02 2.726 0.00642 **
## raceBlack -3.822e-01 1.750e-01 -2.184 0.02897 *
## raceOthers -9.657e-02 6.261e-02 -1.543 0.12299
## raceWhite -2.938e-02 5.974e-02 -0.492 0.62284
## loan_term 4.214e-03 9.131e-05 46.147 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5454 on 7587 degrees of freedom
## Multiple R-squared: 0.6347, Adjusted R-squared: 0.6343
## F-statistic: 1465 on 9 and 7587 DF, p-value: < 2.2e-16
Income: Higher income generally increases borrowing capacity, making it a critical determinant of loan approval and loan amount.
Property Values: Loan amounts are directly influenced by property values, this is due to loans being typically built around what the property values are.
Sex: Included to assess whether disparities exist in loan amounts based on gender, ensuring compliance with fair lending regulations.
Race: Identifying potential racial disparities, it is relevant for regulatory fair lending practices.
Loan Term: Longer loan terms can effect loan amounts by impacting the monthly payment affordability.
Base Model: Strong predictors (p < 0.001), R^2 is 0.531, this explains that the variables are roughly 53% of the loan amount variation.
Expanded Model: The R^2 increases slightly to 0.532, sex and most of the categories are not significant, suggesting potential disparities.
Full Model: R^2 improves to 0.635 with the Loan term being (p < 0.001) being a significant factors. Black or African American borrows receive lower loans (p = 0.029), and “Sex Not Available” becomes significant (p = 0.006).
Lenders must ensure that income and property value remain the primary factors in the decision making process while also monitoring the disparities in the loan allocation.
Regulators must look into the differences in lending practices to make sure that no one is being discriminated against because of their race or other factors.
Financial institutions should consider loan term adjustments to help with balancing affordability and risk.
Income: Estimate = 0.213889, p-value < 2e-16 (highly significant)
Property Value: Estimate = 0.80461, p-value < 2e-16 (higly significant)
Interpretation: Both income and property value are statistically significant, meaning that they have a strong influence over loan amount.
Economic Significance: Income has a positive relationship with loan amount, this suggests that higher income can lead to to higher loans. This is important to lenders when creating loans based on income. Property value also highly significant implying that its also important to loan size. Lenders could use this in pricing the loans and assessing collateral.
Statistical Significance: There’s a strong relationship due to both variables being highly significant.
Income: Estimate = 0.214328, p-value < 2e-16 (highly significant)
Property Value: Estimate = 0.804314, p-value < 2e-16 (highly significant)
Sex (Male, Joint, Not Available): All p-values > 0.05, indicating not significant.
Race (Black): Estimate = -0.466741, p-value = 0.0184 (significant)
Race (Others, White): p-values > 0.05, indicating not significant.
Interpretation: Income and Property Value remains significant still, none of sex related variables are significant meaning they don’t really influence loan amount. Only Race(Black) is significant, suggesting that there are racial disparities in loan amounts.
Economic Significance: Sex doesn’t have an impact, however Race(Black) shows a negative coefficient, this can indicate that black borrowers may receive lower loans compared to others. This can suggest that there are potential inequities that may require addressing in business strategies.
Statistical Significance: The significance for Race(Black) means that addressing racial disparities could improve the fairness of loan amounts and allocation
Income: Estimate = 0.2169, p-value < 2e-16 (highly significant)
Property Value: Estimate = 0.7431, p-value < 2e-16 (highly significant)
Sex (Male, Joint, Not Available): Only Sex Not Available is significant (p-value = 0.00642), with a positive coefficient of 0.09705. This indicates a slightly higher loan amount for “Sex Not Available” compared to the baseline category.
Race (Black): Estimate = -0.3822, p-value = 0.02897 (significant)
Loan Term: Estimate = 0.004214, p-value < 2e-16 (highly significant)
Interpretations: Income and Property Value are still significant, the significance of “Sex Not Available” can mean that there’s a bit of an increase in loan amounts. There is still a negative coefficient for Race(Black) still indicating racial disparities. For Loan Term, it is a significant variable with a very low positive coefficient, the loan amount increases very slightly for each additional loan term.
Economic Significance: Loan Term as a signifcant positive effect on loan amounts, longer loan terms increases loan size. Race(Black) still indicates lower amounts of loan for black borrowers, meaning that there is still an unfairness and potential areas for intervention.
Statistical Significance: Income, Property Value, and Loan Term all are consistently significant, supporting the importance in pricing allocation strategies for loans.
# Variance Inflation Factor (VIF)
vif(model3)
## GVIF Df GVIF^(1/(2*Df))
## income 1.754149 1 1.324443
## property_value 1.738228 1 1.318419
## sex 1.545145 3 1.075214
## race 1.456778 3 1.064712
## loan_term 1.020338 1 1.010118
GVIF (Generalized Variance Inflation Factor) values assess multicollinearity. Higher GVIF indicates stronger correlation with other variables.
GVIF^(1/(2*Df)) provides a normalized measure of multicollinearity. Values above 1 suggest potential multicollinearity issues.
Income & Property Value: GVIF close to 1, indicating minimal multicollinearity
Sex & Race: GVIF is slightly higher GVIF, there is some degree of correlation but not a huge amount
Loan Term: lowest GVIF, minimal correlation
The understanding of the importance of individual variables may be distorted by high multicollinearity, which could make loan terms and price-setting more difficult.
Collinearity between property value and income, for example, may result in inaccurate conclusions about how much of an impact each factor has on loan amounts.
Remove or combine strongly correlated variables (such income and property value) to simplify the model.
Refine correlated variables into a smaller, easier-to-manage set of uncorrelated components.
anova(model1, model2, model3)
## Analysis of Variance Table
##
## Model 1: loan_amount ~ income + property_value
## Model 2: loan_amount ~ income + property_value + sex + race
## Model 3: loan_amount ~ income + property_value + sex + race + loan_term
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7594 2893.9
## 2 7588 2889.8 6 4.04 2.2629 0.03484 *
## 3 7587 2256.5 1 633.37 2129.5917 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Null Hypothesis (H₀): The model is not substantially enhanced by the addition of new variables (sex, race, and loan_term).
Alternative Hypothesis (H₁): The model is much enhanced by the addition of new variables (sex, race, and loan_term).
The base model, Model 1 (income + property_value), is not directly tested in this instance.
With a p-value of 0.03484, Model 2 (income + property_value + sex + race) produces an F-statistic of 2.2629. This suggests that the model is considerably improved (at the 5% level) by including sex and race.
With a p-value of less than 2e-16 and an exceptionally significant F-statistic of 2129.5917, Model 3 (income + property_value + sex + race + loan_term) demonstrates how significantly the model is improved by including loan_term.
Model 2: The model is greatly enhanced by the inclusion of sex and ethnicity, indicating that demographic variables are crucial in predicting loan amounts.
Model 3: By adding loan_term, the model is further enhanced and yields a highly significant result, demonstrating the importance of the loan duration in explaining loan amounts.
Financial organizations can more precisely evaluate the risk associated with loan applications by identifying important criteria such as income, property_value, sex, race, and loan_term. For example, knowing how loan terms affect loan amounts enables institutions to more accurately evaluate long-term repayment risks.
While demographic information can be used to predict default risks, race and sex factors can be used to inform credit scoring models in order to reduce the risk of discrimination and guarantee fair lending practices.
Based on the borrower’s demographics (e.g., loan terms catered to various income groups) and property value, institutions might use these results to create customized loan products.
Banks can maintain an advantage over rivals by implementing more precise pricing strategies and focusing on underserved market groups by delivering competitive lending terms and products based on these findings. Both market share and client loyalty may rise as a result.
Business Forecasting: Model 3 is perfect for predicting loan amounts because of its extensive variables, which include loan length, which enable more precise financial estimates.
Profitability Analysis: By identifying important variables affecting loan amounts, the model helps banks optimize loan pricing and boost profitability.
Regulatory Compliance: Model 3 guarantees equity in lending practices by taking into account factors including sex, color, and loan length. This is essential for fulfilling legal obligations and preventing discriminatory practices.
# Evaluate Adjusted R-squared for models
cat("Adjusted R-squared for Model 1: ", summary(model1)$adj.r.squared, "\n")
## Adjusted R-squared for Model 1: 0.5313987
cat("Adjusted R-squared for Model 2: ", summary(model2)$adj.r.squared, "\n")
## Adjusted R-squared for Model 2: 0.5316826
cat("Adjusted R-squared for Model 3: ", summary(model3)$adj.r.squared, "\n")
## Adjusted R-squared for Model 3: 0.6342758
# Cross-validation for all models
set.seed(38503)
trainIndex <- createDataPartition(indata$loan_amount, p = 0.6, list = FALSE)
trainData <- indata[trainIndex, ]
testData <- indata[-trainIndex, ]
# Model 1: Income and Property Value
predictions1 <- predict(lm(loan_amount ~ income + property_value, data = trainData), testData)
rmse1 <- sqrt(mean((testData$loan_amount - predictions1)^2))
mse1 <- mean((testData$loan_amount - predictions1)^2)
mpe1 <- mean(abs(testData$loan_amount - predictions1) / testData$loan_amount) * 100
# Model 2: Income, Property Value, Sex, Race
predictions2 <- predict(lm(loan_amount ~ income + property_value + sex + race, data = trainData), testData)
rmse2 <- sqrt(mean((testData$loan_amount - predictions2)^2))
mse2 <- mean((testData$loan_amount - predictions2)^2)
mpe2 <- mean(abs(testData$loan_amount - predictions2) / testData$loan_amount) * 100
# Model 3: Income, Property Value, Sex, Race, Loan Term
predictions3 <- predict(lm(loan_amount ~ income + property_value + sex + race + loan_term, data = trainData), testData)
rmse3 <- sqrt(mean((testData$loan_amount - predictions3)^2))
mse3 <- mean((testData$loan_amount - predictions3)^2)
mpe3 <- mean(abs(testData$loan_amount - predictions3) / testData$loan_amount) * 100
# Display Performance Metrics
cat("Performance Metrics for Model 1: RMSE =", rmse1, "MSE =", mse1, "MPE =", mpe1, "\n")
## Performance Metrics for Model 1: RMSE = 0.6351613 MSE = 0.4034299 MPE = 3.391547
cat("Performance Metrics for Model 2: RMSE =", rmse2, "MSE =", mse2, "MPE =", mpe2, "\n")
## Performance Metrics for Model 2: RMSE = 0.6348638 MSE = 0.4030521 MPE = 3.3915
cat("Performance Metrics for Model 3: RMSE =", rmse3, "MSE =", mse3, "MPE =", mpe3, "\n")
## Performance Metrics for Model 3: RMSE = 0.565539 MSE = 0.3198344 MPE = 2.982872
Conclusion: Even though Model 2 strikes a good balance between fit and accuracy, Model 3 offers the best performance and is most helpful for financial decisions.
# Linear Probability Model
lpm <- lm(denial ~ income + property_value + sex + race, data = indata)
summary(lpm)
##
## Call:
## lm(formula = denial ~ income + property_value + sex + race, data = indata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22843 -0.11371 -0.08687 -0.05828 1.05350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3680083 0.0770057 4.779 1.80e-06 ***
## income -0.0408402 0.0056222 -7.264 4.13e-13 ***
## property_value 0.0006927 0.0064544 0.107 0.91453
## sexJoint -0.0207630 0.0091508 -2.269 0.02330 *
## sexMale 0.0033860 0.0100033 0.338 0.73501
## sexSex Not Available 0.0082914 0.0187382 0.442 0.65815
## raceBlack -0.1680222 0.0921029 -1.824 0.06815 .
## raceOthers -0.0521624 0.0329550 -1.583 0.11350
## raceWhite -0.0815270 0.0314483 -2.592 0.00955 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2871 on 7588 degrees of freedom
## Multiple R-squared: 0.01727, Adjusted R-squared: 0.01623
## F-statistic: 16.66 on 8 and 7588 DF, p-value: < 2.2e-16
# Logistic Regression Models
logit1 <- glm(denial ~ income + property_value + sex, data = indata, family = "binomial")
summary(logit1)
##
## Call:
## glm(formula = denial ~ income + property_value + sex, family = "binomial",
## data = indata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.267237 0.861120 0.310 0.7563
## income -0.546798 0.073518 -7.438 1.03e-13 ***
## property_value 0.008956 0.078260 0.114 0.9089
## sexJoint -0.232650 0.108311 -2.148 0.0317 *
## sexMale 0.040109 0.111786 0.359 0.7197
## sexSex Not Available 0.370656 0.171970 2.155 0.0311 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4676.2 on 7596 degrees of freedom
## Residual deviance: 4552.5 on 7591 degrees of freedom
## AIC: 4564.5
##
## Number of Fisher Scoring iterations: 5
logit2 <- glm(denial ~ income + property_value + sex + race, data = indata, family = "binomial")
summary(logit2)
##
## Call:
## glm(formula = denial ~ income + property_value + sex + race,
## family = "binomial", data = indata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.006444 0.912280 1.103 0.2699
## income -0.553767 0.073652 -7.519 5.53e-14 ***
## property_value 0.006839 0.078474 0.087 0.9305
## sexJoint -0.217015 0.108905 -1.993 0.0463 *
## sexMale 0.055635 0.112121 0.496 0.6198
## sexSex Not Available 0.102591 0.203010 0.505 0.6133
## raceBlack -12.968577 258.373450 -0.050 0.9600
## raceOthers -0.398929 0.312594 -1.276 0.2019
## raceWhite -0.738173 0.292781 -2.521 0.0117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4676.2 on 7596 degrees of freedom
## Residual deviance: 4538.5 on 7588 degrees of freedom
## AIC: 4556.5
##
## Number of Fisher Scoring iterations: 13
# AUC
roc_curve <- roc(testData$denial, predict(logit2, testData, type = "response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.629
Income has a negative impact on loan denial and is very significant (p = 4.13e-13).
Property Value: p = 0.91453, not significant.
Sex(Joint): Significant (p = 0.02330), has a negative impact on loan rejection.
Race(White) applicants have a reduced denial rate, which is significant (p = 0.00955).
Economic Implication: When evaluating loan risk, lenders should take sex and income into account. Although further research is required to determine its impact, race may possibly play a part.
Income: Reduces the likelihood of denial; highly significant (p = 1.03e-13).
Property Value: p = 0.9089, not significant.
Denial likelihood is impacted by sex (joint, not available), which is significant (p = 0.0317, p = 0.0311).
Economic Implication: Income can be used by lenders to determine the likelihood that a loan will be approved. The sex factor emphasizes how crucial it is to take into account many applications and a range of demographic information.
Income: Reduces the likelihood of rejection; highly significant (p = 5.53e-14).
Property Value: p = 0.9305, not significant.
Sex(Joint): Significant (p = 0.0463), reduces the likelihood of denial.
Race(White) applicants had a decreased denial rate, which was significant (p = 0.0117).
Economic Significance: The findings support the necessity of taking race, sex, and income into account when determining loan eligibility. It might be necessary for lenders to create more inclusive policies in order to prevent racial and sexual prejudice.
threshold_50 <- ifelse(predict(logit2, type = "response") > 0.5, 1, 0)
confusionMatrix(as.factor(threshold_50), as.factor(indata$denial))
## Warning in confusionMatrix.default(as.factor(threshold_50),
## as.factor(indata$denial)): Levels are not in the same order for reference and
## data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6896 701
## 1 0 0
##
## Accuracy : 0.9077
## 95% CI : (0.901, 0.9141)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 0.5101
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9077
## Neg Pred Value : NaN
## Prevalence : 0.9077
## Detection Rate : 0.9077
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
threshold_30 <- ifelse(predict(logit2, type = "response") > 0.3, 1, 0)
confusionMatrix(as.factor(threshold_30), as.factor(indata$denial))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6894 692
## 1 2 9
##
## Accuracy : 0.9086
## 95% CI : (0.9019, 0.915)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 0.4003
##
## Kappa : 0.0225
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99971
## Specificity : 0.01284
## Pos Pred Value : 0.90878
## Neg Pred Value : 0.81818
## Prevalence : 0.90773
## Detection Rate : 0.90746
## Detection Prevalence : 0.99855
## Balanced Accuracy : 0.50627
##
## 'Positive' Class : 0
##
threshold_10 <- ifelse(predict(logit2, type = "response") > 0.1, 1, 0)
confusionMatrix(as.factor(threshold_10), as.factor(indata$denial))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4516 323
## 1 2380 378
##
## Accuracy : 0.6442
## 95% CI : (0.6333, 0.655)
## No Information Rate : 0.9077
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0837
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6549
## Specificity : 0.5392
## Pos Pred Value : 0.9333
## Neg Pred Value : 0.1371
## Prevalence : 0.9077
## Detection Rate : 0.5944
## Detection Prevalence : 0.6370
## Balanced Accuracy : 0.5971
##
## 'Positive' Class : 0
##
Accuracy: 0.9077, High accuracy but skewed because of imbalance
1.0000 is the True Positive Rate (Sensitivity), which accurately indicates that the positive class is 0.
0.0000 is the False Positive Rate (1-Specificity) (no false positives).
Implications: - Risk Mitigation: This methodology may overlook positive cases (1), but it nearly always finds negatives (0).
Customer satisfaction may suffer if customers who ought to be authorized are passed over.
Business Profitability: This model may miss out on prospective business by failing to detect certain problematic clients, even while it is accurate in rejecting loans.
Accuracy: 0.9086, which is marginally superior to Model 1.
True Positive Rate (Sensitivity): 0.9997 (almost flawless positive detection)
0.01284 is the false positive rate (some false positives).
Implications: - Risk mitigation strikes a balance between minimizing false positives and discovering positives. could be helpful in locating and accepting applications who pose a risk.
Customer satisfaction: There is a slight danger of mistakenly turning away loyal consumers, but there is a higher likelihood of accurate approval.
Profitability for businesses: Assists in reducing risk while preserving client happiness.
Accuracy: 0.6442 (poor accuracy)
Sensitivity (True Positive Rate): 0.6549 (Sufficient positive detection)
The rate of false positives is 0.5392, which is high.
Implications: - Risk Mitigation: This model is likely to approve high-risk applicants due to its high false positive rate, which increases financial exposure.
Customer satisfaction: There may be a larger likelihood of loan defaults, but more customers may be approved.
Business Profitability: As more clients are approved, there is a greater chance of defaults, which lowers profitability.
Accuracy: 0.9077 Sensitivity: 1.0000 (perfect detection of negatives) Specificity: 0.0000 (no false positives) Implications: By identifying every loan denial, this methodology reduces risk, but it overlooks a large number of loan approvals, which could result in missed business possibilities.
Accuracy: 0.9086 Sensitivity: 0.9997 (near-perfect detection of positives) Specificity: 0.01284 (minimal false positives) Implications: This approach is perfect for granting legitimate loans while efficiently managing risk because it strikes a balance between identifying loan approvals and reducing false positives.
Accuracy: 0.6442 Sensitivity: 0.6549 (adequate detection of positives) Specificity: 0.5392 (higher false positives) Implications: Profitability may be harmed by this model’s decreased accuracy and increased chance of accepting unqualified applications.
Credit Approval Policies: Model 2 enables more informed decision-making, reducing defaults and sanctioning loans with assurance.
Financial Services Strategies: Targeting creditworthy clients, dynamically modifying loan terms, and increasing profitability while lowering risk are all made possible by this methodology.
Perform cross-validation using the Area Under the Curve (AUC) metric.
# Create a logistic regression model
log_model <- glm(denial ~ age + sex + income + loan_amount + property_value,
data = indata, family = binomial())
# Create a decision tree model
library(rpart)
tree_model <- rpart(denial ~ age + sex + income + loan_amount + property_value,
data = indata, method = "class")
# Predictions for Logistic Model
log_pred <- predict(log_model, type = "response")
log_roc <- roc(indata$denial, log_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Predictions for Decision Tree Model
tree_pred <- predict(tree_model, type = "prob")[,2]
tree_roc <- roc(indata$denial, tree_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compare AUC for both models
log_auc <- auc(log_roc)
tree_auc <- auc(tree_roc)
# Print AUC values
cat("Logistic Regression AUC: ", log_auc, "\n")
## Logistic Regression AUC: 0.622226
cat("Decision Tree AUC: ", tree_auc, "\n")
## Decision Tree AUC: 0.5638311
# ROC curve plot comparison
plot.roc(log_roc, col = "blue", main = "Model Comparison: ROC Curves")
plot.roc(tree_roc, col = "red", add = TRUE)
legend("bottomright", legend = c("Logistic Regression", "Decision Tree"),
col = c("blue", "red"), lty = 1)
### Comparison of Model Performance - The Logistic Regression model (AUC
= 0.622) performs better than the Decision Tree model (AUC = 0.564) in
forecasting loan denial, according to the ROC curve and AUC values.
The blue curve, or logistic regression, demonstrates a superior capacity to distinguish between loans that are granted and those that are denied. Its moderate predictive power is indicated by its AUC of 0.622.
With a lower AUC (0.564), the Decision Tree (red curve) performs worse and is less reliable at differentiating loan denial cases.
The model performs better the closer the AUC is to 1. In this instance, Logistic Regression is the superior model because to its higher AUC.
These findings can be used by financial organizations to improve loan approval procedures and guarantee sustainability over the long run:
Risk assessment and credit scoring: By providing a more accurate way to distinguish between loans that are authorized and those that are denied, logistic regression helps reduce defaults and guarantee eligible approvals.
Operational Efficiency: Higher accuracy speeds up loan approvals and lowers expenses by reducing manual reviews.
Fair & Responsible Lending: Data-driven methods encourage objective judgments, guaranteeing adherence to rules and lowering legal risks.
Loan Portfolio Management: A strong predictive model adjusts to borrower profiles and economic conditions while striking a balance between approvals and financial stability.
Summarize the findings from each section of the project:
Important relationships in mortgage lending were found in the dataset. Loan amounts have a moderate correlation (0.568) with income and a substantial correlation (0.715) with property values. The association between loan length and loan amount is lower (0.412).
Disparities are suggested by the examination of loan denial rates. According to chi-square tests, there is a statistically significant correlation between loan denial and race (p < 0.0003) and sex (p < 0.001). Concerns regarding equity and possible regulatory scrutiny are raised by these findings.
Model 3, which had the lowest Root Mean Squared Error (RMSE = 0.5655) and the greatest Adjusted R2 (0.634), outperformed the other two regression models in terms of predictive accuracy.
Across all models, income and property worth were consistently significant predictors of loan amounts.
Concerns over equity in lending were raised by the negative coefficient for race (Black), which suggested less loan amounts for Black customers.
Longer loan durations are linked to somewhat larger loan amounts, according to the minor but substantial positive effect of loan length.
To forecast loan denial, two logistic regression models and a linear probability model (LPM) were created.
In both models, income significantly decreased the chance of loan denial (p < 0.0001).
Perhaps as a result of the financial security that comes with having several sources of income, sex (combined applications) dramatically decreased loan denial rates.
Racial differences in acceptance choices were evident in the much lower loan denial rate for White applicants.
The model evaluation’s AUC values revealed that the logistic regression model (AUC = 0.622) outperformed the decision tree model (AUC = 0.564), suggesting a reasonable level of predictive potential.
Limitations: Although the models were able to identify important factors that affect loan amounts and denial rates, they could also improve predictive power by taking into account unobserved borrower characteristics like credit scores and debt-to-income ratios.
Business and Regulatory Consequences: Fair lending issues: In order to prevent discrimination claims, there may be a need for compliance reviews and possible policy changes given the strong correlation between race and loan denial.
Enhancements to risk assessment: While enhancing risk management, financial institutions should improve credit scoring models to integrate fair lending principles.
Pricing strategies: By using the findings from the regression model, lenders can better target lucrative client segments and optimize loan pricing.
Consumer protection: These results back up regulatory initiatives to keep an eye on mortgage lending practices and encourage openness.
Overall, the study shows that sex, ethnicity, and wealth are important considerations when it comes to mortgage lending decisions. To maintain good risk management and guarantee lending equity, financial institutions and regulators must collaborate.
Reminder: Each student must choose a different state for their HMDA dataset analysis. Duplications will not be accepted.