set.seed(1) #
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # contains createDataPartition
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet) # performed lasso regression
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caTools)
library(caret)
library(mltools) # One-hot encoding
## Warning: package 'mltools' was built under R version 4.3.2
##
## Attaching package: 'mltools'
##
## The following object is masked from 'package:tidyr':
##
## replace_na
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
travel2 <- read.table("C:/Users/DELL/Desktop/Lasso logistic regression/travel2.csv")
head(travel2, 5)
## X Approved census_tract Female CoApplicant Race.Ethnicity Black White
## 1 63 1 42071010900 0 0 Hispanic 0 0
## 2 64 1 42029304201 0 0 Hispanic 0 0
## 3 65 1 42091207300 0 0 Hispanic 0 0
## 4 66 1 42059970400 0 1 White 0 1
## 5 67 1 42091203602 0 0 Hispanic 0 0
## Hispanic Asian Indigenous tract_minority_population_percent
## 1 1 0 0 18.34
## 2 1 0 0 25.33
## 3 1 0 0 32.20
## 4 0 0 0 1.71
## 5 1 0 0 58.31
## tract_to_msa_income_percentage income loan_amount property_value
## 1 104 74 155000 185000
## 2 77 63 195000 245000
## 3 80 51 185000 225000
## 4 105 28 85000 125000
## 5 74 44 135000 165000
## loan_to_income_ratio loan_to_value_ratio log.income log.loan_amount
## 1 2.094595 80 11.21182 11.95118
## 2 3.095238 80 11.05089 12.18075
## 3 3.627451 80 10.83958 12.12811
## 4 3.035714 70 10.23996 11.35041
## 5 3.068182 80 10.69194 11.81303
## log.loan_to_income_ratio log.loan_to_value_ratio
## 1 0.739360 4.382027
## 2 1.129865 4.382027
## 3 1.288530 4.382027
## 4 1.110447 4.248495
## 5 1.121085 4.382027
travel2$Approved <- as.factor(travel2$Approved)
travel2$Female <- as.factor(travel2$Female)
travel2$CoApplicant <- as.factor(travel2$CoApplicant)
travel2$Race.Ethnicity <- as.factor(travel2$Race.Ethnicity)
travel2$Black <- as.factor(travel2$Black)
travel2$White <- as.factor(travel2$White)
travel2$Hispanic <- as.factor(travel2$Hispanic)
travel2$Asian <- as.factor(travel2$Asian)
travel2$Indigenous <- as.factor(travel2$Indigenous)
str(travel2)
# Calculate percentages
percentages <- travel2 %>%
group_by(Race.Ethnicity) %>%
summarise(percent = (n() / nrow(travel2)) * 100)
# Create a bar chart with data labels
ggplot(percentages, aes(x = Race.Ethnicity, y = percent)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = paste0(round(percent, 2), "%")), vjust = -0.5) +
labs(title = "Percentage of Each Race/Ethnicity", x = "Race/Ethnicity", y = "Percentage") +
theme_minimal()
# Approved
percentages_A <- travel2 %>%
group_by(Approved) %>%
summarise(percent = (n() / nrow(travel2)) * 100)
# Create a bar chart with data labels
ggplot(percentages_A, aes(x = Approved, y = percent)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = paste0(round(percent, 2), "%")), vjust = -0.5) +
labs(title = "Percentage of morgage approve or not", x = "Race/Ethnicity", y = "Percentage") +
theme_minimal()
#Gender
percentages_B <- travel2 %>%
group_by(Female) %>%
summarise(percent = (n() / nrow(travel2)) * 100)
# Create a bar chart with data labels
ggplot(percentages_B, aes(x = Female, y = percent)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = paste0(round(percent, 2), "%")), vjust = -0.5) +
labs(title = "Percentage of Female vs Male", x = "Race/Ethnicity", y = "Percentage") +
theme_minimal()
Approved <- travel2 %>% filter(Approved ==1)
head(Approved,5)
## X Approved census_tract Female CoApplicant Race.Ethnicity Black White
## 1 63 1 42071010900 0 0 Hispanic 0 0
## 2 64 1 42029304201 0 0 Hispanic 0 0
## 3 65 1 42091207300 0 0 Hispanic 0 0
## 4 66 1 42059970400 0 1 White 0 1
## 5 67 1 42091203602 0 0 Hispanic 0 0
## Hispanic Asian Indigenous tract_minority_population_percent
## 1 1 0 0 18.34
## 2 1 0 0 25.33
## 3 1 0 0 32.20
## 4 0 0 0 1.71
## 5 1 0 0 58.31
## tract_to_msa_income_percentage income loan_amount property_value
## 1 104 74 155000 185000
## 2 77 63 195000 245000
## 3 80 51 185000 225000
## 4 105 28 85000 125000
## 5 74 44 135000 165000
## loan_to_income_ratio loan_to_value_ratio log.income log.loan_amount
## 1 2.094595 80 11.21182 11.95118
## 2 3.095238 80 11.05089 12.18075
## 3 3.627451 80 10.83958 12.12811
## 4 3.035714 70 10.23996 11.35041
## 5 3.068182 80 10.69194 11.81303
## log.loan_to_income_ratio log.loan_to_value_ratio
## 1 0.739360 4.382027
## 2 1.129865 4.382027
## 3 1.288530 4.382027
## 4 1.110447 4.248495
## 5 1.121085 4.382027
percent <- Approved %>% select(Race.Ethnicity)%>%
group_by(Race.Ethnicity) %>%
summarise(percent = (n() / nrow(Approved)) * 100)
# Create a bar chart with data labels
ggplot(percent, aes(x = Race.Ethnicity, y = percent)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = paste0(round(percent, 2), "%")), vjust = -0.5) +
labs(title = "Percentage of Each Race/Ethnicity", x = "Race/Ethnicity", y = "Percentage") +
theme_minimal()
### Filtering for bid that were approved Not Approved
Not_Approved <- travel2 %>% filter(Approved ==0)
head(Not_Approved,5)
## X Approved census_tract Female CoApplicant Race.Ethnicity Black White
## 13 77 0 42003496200 1 0 White 0 1
## 20 91 0 42071010101 0 0 White 0 1
## 25 102 0 42131400500 0 0 White 0 1
## 26 104 0 42117950300 0 0 White 0 1
## 54 204 0 42063961500 1 0 Hispanic 0 0
## Hispanic Asian Indigenous tract_minority_population_percent
## 13 0 0 0 2.31
## 20 0 0 0 4.74
## 25 0 0 0 3.90
## 26 0 0 0 2.80
## 54 1 0 0 2.78
## tract_to_msa_income_percentage income loan_amount property_value
## 13 101 15 85000 115000
## 20 116 26 95000 155000
## 25 106 36 155000 155000
## 26 89 72 155000 155000
## 54 89 22 45000 55000
## loan_to_income_ratio loan_to_value_ratio log.income log.loan_amount
## 13 5.666667 80.00 9.615805 11.35041
## 20 3.653846 101.75 10.165852 11.46163
## 25 4.305556 100.00 10.491274 11.95118
## 26 2.152778 101.75 11.184421 11.95118
## 54 2.045455 90.00 9.998798 10.71442
## log.loan_to_income_ratio log.loan_to_value_ratio
## 13 1.734601 4.382027
## 20 1.295780 4.622519
## 25 1.459906 4.605170
## 26 0.766759 4.622519
## 54 0.715620 4.499810
percent_Not <- Not_Approved %>% select(Race.Ethnicity)%>%
group_by(Race.Ethnicity) %>%
summarise(percent = (n() / nrow(Approved)) * 100)
# Create a bar chart with data labels
ggplot(percent_Not, aes(x = Race.Ethnicity, y = percent)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = paste0(round(percent, 2), "%")), vjust = -0.5) +
labs(title = "Percentage of Each Race/Ethnicity", x = "Race/Ethnicity", y = "Percentage") +
theme_minimal()
ggplot(percentages_A, aes(x = Approved, y = percent)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = paste0(round(percent, 2), "%")), vjust = -0.5) +
labs(title = "Percentage of morgage approve or not", x = "Race/Ethnicity", y = "Percentage") +
theme_minimal()
Num <- travel2%>% select(tract_minority_population_percent,
tract_to_msa_income_percentage,
income, loan_amount, property_value,
loan_to_income_ratio, loan_to_value_ratio,
log.income, log.loan_amount, log.loan_to_income_ratio,
log.loan_to_value_ratio )
cor(Num)
## tract_minority_population_percent
## tract_minority_population_percent 1.000000000
## tract_to_msa_income_percentage -0.269879731
## income -0.004440587
## loan_amount -0.039315496
## property_value -0.046526651
## loan_to_income_ratio 0.080992509
## loan_to_value_ratio 0.013315789
## log.income -0.127809429
## log.loan_amount -0.045246032
## log.loan_to_income_ratio 0.111179359
## log.loan_to_value_ratio 0.086989610
## tract_to_msa_income_percentage income
## tract_minority_population_percent -0.269879731 -0.0044405870
## tract_to_msa_income_percentage 1.000000000 0.0230837034
## income 0.023083703 1.0000000000
## loan_amount 0.418466095 0.0492488570
## property_value 0.352599647 0.0409865682
## loan_to_income_ratio 0.003712528 -0.0221270842
## loan_to_value_ratio -0.012458065 -0.0008199386
## log.income 0.374396010 0.1108370136
## log.loan_amount 0.407190447 0.0314390208
## log.loan_to_income_ratio 0.020580398 -0.1062472897
## log.loan_to_value_ratio -0.110105520 -0.0089751520
## loan_amount property_value
## tract_minority_population_percent -0.039315496 -0.04652665
## tract_to_msa_income_percentage 0.418466095 0.35259965
## income 0.049248857 0.04098657
## loan_amount 1.000000000 0.75921867
## property_value 0.759218667 1.00000000
## loan_to_income_ratio 0.104237989 0.05304767
## loan_to_value_ratio -0.001251133 -0.01373620
## log.income 0.681657871 0.54028261
## log.loan_amount 0.869942616 0.64047528
## log.loan_to_income_ratio 0.199571497 0.09635148
## log.loan_to_value_ratio 0.072136413 -0.15661686
## loan_to_income_ratio loan_to_value_ratio
## tract_minority_population_percent 0.080992509 0.0133157893
## tract_to_msa_income_percentage 0.003712528 -0.0124580649
## income -0.022127084 -0.0008199386
## loan_amount 0.104237989 -0.0012511327
## property_value 0.053047669 -0.0137362033
## loan_to_income_ratio 1.000000000 0.0039016779
## loan_to_value_ratio 0.003901678 1.0000000000
## log.income -0.342244606 -0.0068002486
## log.loan_amount 0.167027432 0.0007860133
## log.loan_to_income_ratio 0.661040026 0.0099414419
## log.loan_to_value_ratio 0.102693824 0.1572979515
## log.income log.loan_amount
## tract_minority_population_percent -0.127809429 -0.0452460324
## tract_to_msa_income_percentage 0.374396010 0.4071904471
## income 0.110837014 0.0314390208
## loan_amount 0.681657871 0.8699426158
## property_value 0.540282610 0.6404752845
## loan_to_income_ratio -0.342244606 0.1670274324
## loan_to_value_ratio -0.006800249 0.0007860133
## log.income 1.000000000 0.6996099634
## log.loan_amount 0.699609963 1.0000000000
## log.loan_to_income_ratio -0.434175110 0.3399111976
## log.loan_to_value_ratio -0.039213133 0.1401361291
## log.loan_to_income_ratio
## tract_minority_population_percent 0.111179359
## tract_to_msa_income_percentage 0.020580398
## income -0.106247290
## loan_amount 0.199571497
## property_value 0.096351476
## loan_to_income_ratio 0.661040026
## loan_to_value_ratio 0.009941442
## log.income -0.434175110
## log.loan_amount 0.339911198
## log.loan_to_income_ratio 1.000000000
## log.loan_to_value_ratio 0.228287207
## log.loan_to_value_ratio
## tract_minority_population_percent 0.086989610
## tract_to_msa_income_percentage -0.110105520
## income -0.008975152
## loan_amount 0.072136413
## property_value -0.156616860
## loan_to_income_ratio 0.102693824
## loan_to_value_ratio 0.157297952
## log.income -0.039213133
## log.loan_amount 0.140136129
## log.loan_to_income_ratio 0.228287207
## log.loan_to_value_ratio 1.000000000
summary(Num)
## tract_minority_population_percent tract_to_msa_income_percentage
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 5.36 1st Qu.: 90.0
## Median : 10.29 Median :109.0
## Mean : 16.98 Mean :115.1
## 3rd Qu.: 20.06 3rd Qu.:132.0
## Max. :100.00 Max. :443.0
## income loan_amount property_value loan_to_income_ratio
## Min. : 1.0 Min. : 5000 Min. : 5000 Min. : 0.00007
## 1st Qu.: 55.0 1st Qu.: 135000 1st Qu.: 165000 1st Qu.: 1.73077
## Median : 85.0 Median : 205000 Median : 245000 Median : 2.37705
## Mean : 115.1 Mean : 237796 Mean : 294765 Mean : 2.56347
## 3rd Qu.: 131.0 3rd Qu.: 305000 3rd Qu.: 365000 3rd Qu.: 3.16327
## Max. :365001.0 Max. :3845000 Max. :38225000 Max. :255.00000
## loan_to_value_ratio log.income log.loan_amount log.loan_to_income_ratio
## Min. : 1.05 Min. : 6.908 Min. : 8.517 Min. :-9.5888
## 1st Qu.: 80.00 1st Qu.:10.915 1st Qu.:11.813 1st Qu.: 0.5486
## Median : 87.67 Median :11.350 Median :12.231 Median : 0.8659
## Mean : 84.97 Mean :11.368 Mean :12.187 Mean : 0.8188
## 3rd Qu.: 95.00 3rd Qu.:11.783 3rd Qu.:12.628 3rd Qu.: 1.1516
## Max. :77235.80 Max. :19.715 Max. :15.162 Max. : 5.5413
## log.loan_to_value_ratio
## Min. : 0.04879
## 1st Qu.: 4.38203
## Median : 4.47360
## Mean : 4.40834
## 3rd Qu.: 4.55388
## Max. :11.25462
travel2$tract_minority_population_percent <- scale(travel2$tract_minority_population_percent)
travel2$tract_to_msa_income_percentage <- scale(travel2$tract_to_msa_income_percentage)
travel2$income <- scale(travel2$income)
travel2$loan_amount <- scale(travel2$loan_amount)
travel2$property_value <- scale(travel2$property_value)
travel2$loan_to_income_ratio <- scale(travel2$loan_to_income_ratio)
travel2$loan_to_value_ratio <- scale(travel2$loan_to_value_ratio)
travel2$log.income <- scale(travel2$log.income)
travel2$log.loan_amount <- scale(travel2$log.loan_amount)
travel2$log.loan_amount<- scale(travel2$log.loan_amount)
travel2$log.loan_to_income_ratio <- scale(travel2$log.loan_to_income_ratio)
travel2$log.loan_to_value_ratio <- scale(travel2$log.loan_to_value_ratio)
sample <- sample.split(travel2$census_tract, SplitRatio = 0.8)
train <- subset(travel2, sample == T)
test <- subset(travel2, sample == F)
p.retain <- sum(train$Approved == 1)/length(train$Approved)
weights <- rep(NA, times=length(train$Approved))
weights[train$Approved == 0] <- p.retain
weights[train$Approved == 1] <- 1-p.retain
train_subset <- train%>%select("Black", "White", "Hispanic", "Asian",
"Indigenous", "CoApplicant", "Female",
"log.income", "log.loan_amount", "property_value"
, "Approved")
y_matrix <- as.matrix(train_subset$Approved)
x_matrix <- as.matrix(train_subset%>%select(-Approved))
model_cv_lasso_NO <- cv.glmnet(y = y_matrix, x = x_matrix,
family = "binomial",
weights = weights,
type.measure = "auc")
print(model_cv_lasso_NO)
##
## Call: cv.glmnet(x = x_matrix, y = y_matrix, weights = weights, type.measure = "auc", family = "binomial")
##
## Measure: AUC
##
## Lambda Index Measure SE Nonzero
## min 0.000483 60 0.6750 0.002969 9
## 1se 0.007874 30 0.6724 0.003082 6
predict_train <- train%>%select("Black", "White", "Hispanic", "Asian", "Indigenous",
"CoApplicant", "Female", "log.income", "log.loan_amount", "property_value",
"log.loan_to_income_ratio", "log.loan_to_value_ratio", "Approved")
predict_test <-test%>%select("Black", "White", "Hispanic", "Asian", "Indigenous",
"CoApplicant", "Female", "log.income", "log.loan_amount", "property_value",
"log.loan_to_income_ratio", "log.loan_to_value_ratio", "Approved")
predict_test$interaction_term <- predict_test$log.loan_to_income_ratio * predict_test$log.loan_to_value_ratio
predict_train$interaction_term <- predict_train$log.loan_to_income_ratio * predict_train$log.loan_to_value_ratio
Y_matrix <- as.matrix(predict_train$Approved)
X_matrix <- as.matrix(predict_train%>%select(-Approved))
model_cv_lasso_interaction <- cv.glmnet(y = Y_matrix, x = X_matrix,
family = "binomial", na.action = NULL,
weights = weights,
type.measure = "auc")
### Print the cross-validated results
print(model_cv_lasso_interaction)
##
## Call: cv.glmnet(x = X_matrix, y = Y_matrix, weights = weights, type.measure = "auc", family = "binomial", na.action = NULL)
##
## Measure: AUC
##
## Lambda Index Measure SE Nonzero
## min 0.000276 66 0.6856 0.005479 11
## 1se 0.005956 33 0.6803 0.005471 8
We evaluate the model performance using the “Area Under the Curve (AUC) ## MODEL 1: No interaction terms
probs <- predict(model_cv_lasso_NO, as.matrix(test%>%select("Black", "White", "Hispanic", "Asian",
"Indigenous", "CoApplicant", "Female", "log.income", "log.loan_amount", "property_value") ), s=model_cv_lasso_NO$lambda.min, type = "response")
# Create ROC object
roc_obj <- roc(as.matrix(
test$Approved) ~ as.vector(probs), smoothed = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve
plot(roc_obj, main = "AUC Lasso Regression: No Interactions")
# Arguments for confidence interval
ci(roc_obj, alpha = 0.9, stratified = FALSE)
## 95% CI: 0.6615-0.6961 (DeLong)
# Additional plot options
coords(roc_obj, "best", ret = c("threshold", "accuracy"))
## threshold accuracy
## threshold 0.4695458 0.7324072
# Display AUC
auc(roc_obj)
## Area under the curve: 0.6788
# Show the plot
legend("bottomright", legend = "95% CI", col = "gray", lty = 2, cex = 0.8)
roc_obj_NO <- roc(as.matrix(test$Approved) ~ probs, smoothed = TRUE,
# arguments for ci
ci=TRUE, ci.alpha=0.9, stratified=FALSE,
# arguments for plot
plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
print.auc=TRUE, show.thres=TRUE, main="AUC Lasso Regression: No Interactions")
## Setting levels: control = 0, case = 1
## Warning in roc.default(response, predictors[, 1], ...): Deprecated use a matrix
## as predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
# Adding confidence intervals to the plot
sens.ci_NO <- ci.se(roc_obj_NO)
plot(sens.ci_NO, type="shape", col="lightblue")
## Warning in plot.ci.se(sens.ci_NO, type = "shape", col = "lightblue"): Low
## definition shape.
plot(sens.ci_NO, type="bars")
probs_interaction <- predict(model_cv_lasso_interaction , as.matrix(predict_test%>%select(-Approved)), s=model_cv_lasso_interaction$lambda.min, type = "response")
roc_obj_i <- roc(as.matrix(predict_test%>%select(Approved)) ~ probs_interaction , smoothed = TRUE,
# arguments for ci
ci=TRUE, ci.alpha=0.9, stratified=FALSE,
# arguments for plot
plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
print.auc=TRUE, show.thres=TRUE, main="AUC Lasso Regression: with Interactions")
## Setting levels: control = 0, case = 1
## Warning in roc.default(response, predictors[, 1], ...): Deprecated use a matrix
## as predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
## Adding confidence intervals to the plot
sens.ci_i <- ci.se(roc_obj_i)
plot(sens.ci_i, type="shape", col="lightblue")
## Warning in plot.ci.se(sens.ci_i, type = "shape", col = "lightblue"): Low
## definition shape.
plot(sens.ci_i, type="bars")
Coefficient interpretation is possible for Model 1 (because no intereaction terms) Getting exponential of coefficients (odds ratios)
odds_ratios <- exp(coef(model_cv_lasso_NO, model_cv_lasso_NO$lambda.min))
odds_ratios <- array(odds_ratios, dimnames = list(c("Intercept", colnames(train_subset%>%select(-Approved)))))
t(t(odds_ratios))
## [,1]
## Intercept 0.8110930
## Black 0.5751037
## White 1.5349017
## Hispanic 1.0000000
## Asian 0.8652843
## Indigenous 0.7744054
## CoApplicant 0.9919661
## Female 1.1609520
## log.income 1.4563389
## log.loan_amount 1.5852917
## property_value 0.5989867
odds_ratios_i <- exp(coef(model_cv_lasso_interaction, model_cv_lasso_interaction$lambda.min))
odds_ratios_i <- array(odds_ratios_i, dimnames = list(c("Intercept", colnames(X_matrix))))
t(t(odds_ratios))
## [,1]
## Intercept 0.8110930
## Black 0.5751037
## White 1.5349017
## Hispanic 1.0000000
## Asian 0.8652843
## Indigenous 0.7744054
## CoApplicant 0.9919661
## Female 1.1609520
## log.income 1.4563389
## log.loan_amount 1.5852917
## property_value 0.5989867