Loan Approval Status
We will be working with a dataset of loan approval status information. The task is to develop models to predict loan approval status with the given feature variables. After a preliminary exploratory data analysis, we will fit Linear Discriminant, K-Nearest Neighbors, Decision Trees and Random Forest models to a subset of the data and evaluate performance on a hold-out data set.
To begin, the following code will import the necessary libraries and import the data:
library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)
# import data
url <- 'https://raw.githubusercontent.com/SmilodonCub/DATA622/master/Loan_approval.csv'
df <- read.csv(url)
# convert column names to lowercase
names(df) <- lapply(names(df), tolower)
# quick look at what the data structure looks like
glimpse(df)## Rows: 614
## Columns: 13
## $ loan_id <chr> "LP001002", "LP001003", "LP001005", "LP001006", "LP0~
## $ gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Mal~
## $ married <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "Yes"~
## $ dependents <chr> "0", "1", "0", "0", "0", "2", "0", "3+", "2", "1", "~
## $ education <chr> "Graduate", "Graduate", "Graduate", "Not Graduate", ~
## $ self_employed <chr> "No", "No", "Yes", "No", "No", "Yes", "No", "No", "N~
## $ applicantincome <int> 5849, 4583, 3000, 2583, 6000, 5417, 2333, 3036, 4006~
## $ coapplicantincome <dbl> 0, 1508, 0, 2358, 0, 4196, 1516, 2504, 1526, 10968, ~
## $ loanamount <int> NA, 128, 66, 120, 141, 267, 95, 158, 168, 349, 70, 1~
## $ loan_amount_term <int> 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 36~
## $ credit_history <int> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, NA, ~
## $ property_area <chr> "Urban", "Rural", "Urban", "Urban", "Urban", "Urban"~
## $ loan_status <chr> "Y", "N", "Y", "Y", "Y", "Y", "Y", "N", "Y", "N", "Y~
# summary of each field
summary(df)## loan_id gender married dependents
## Length:614 Length:614 Length:614 Length:614
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## education self_employed applicantincome coapplicantincome
## Length:614 Length:614 Min. : 150 Min. : 0
## Class :character Class :character 1st Qu.: 2878 1st Qu.: 0
## Mode :character Mode :character Median : 3812 Median : 1188
## Mean : 5403 Mean : 1621
## 3rd Qu.: 5795 3rd Qu.: 2297
## Max. :81000 Max. :41667
##
## loanamount loan_amount_term credit_history property_area
## Min. : 9.0 Min. : 12 Min. :0.0000 Length:614
## 1st Qu.:100.0 1st Qu.:360 1st Qu.:1.0000 Class :character
## Median :128.0 Median :360 Median :1.0000 Mode :character
## Mean :146.4 Mean :342 Mean :0.8422
## 3rd Qu.:168.0 3rd Qu.:360 3rd Qu.:1.0000
## Max. :700.0 Max. :480 Max. :1.0000
## NA's :22 NA's :14 NA's :50
## loan_status
## Length:614
## Class :character
## Mode :character
##
##
##
##
# remove loan ID
df <- df %>%
select(-loan_id)VIM::aggr(df,
col=c('green','red'),
numbers = T,
sortVars = T,
cex.axis = .5,
ylab=c("Proportion of Data", "Combinations and Percentiles")
)credit_history has ~8% missing values.loanamount has ~4% missing values.loan_amount_term has ~2% missing values.Before any analysis and based on common knowledge, it would be a fair guess to expect these three variables to have considerable influence for any loan approval status.
plot_corr_matrix(df, -1)# numeric distributions
df %>%
select_if(is.numeric) %>%
bind_cols(select(df, loan_status)) %>%
gather(var, val, -loan_status) %>%
ggplot(aes(x = val, fill = loan_status)) +
geom_density(alpha = .3) +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(x = element_blank(),
y = element_blank(),
title = 'Distribution of Numeric Variables by Loan Approval Status'
)Credit history seems to have a clear affect on loan approval status, with credit history meeting fully meeting guidelines showing higher rates of approvals.
# df %>%
# select_if(negate(is.numeric)) %>%
# gather(var, value, -loan_status) %>%
# ggplot(aes(x = value, fill = loan_status)) +
# geom_bar(position = 'dodge') +
# facet_wrap(~var, scales = 'free')
yes_count <- sum(df$loan_status == 'Y')
no_count <- sum(df$loan_status == 'N')
df %>%
select_if(negate(is.numeric)) %>%
gather(var, value, -loan_status) %>%
group_by(var, value, loan_status) %>%
summarize(count = n(),
.groups = 'drop') %>%
mutate(prop = count / ifelse(loan_status == 'Y', yes_count, no_count)) %>%
ggplot(aes(x = value, y = prop, fill = loan_status)) +
geom_col(position = 'dodge') +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(y = 'Frequency Proportion',
x = element_blank(),
title = 'Frequency Distributions For Non-Numeric Variables') +
scale_y_continuous(labels = percent_format(accuracy = 1))# impute NA
preproc <- preProcess(df, 'bagImpute')
df2 <- predict(preproc, df)
# train test split
set.seed(101)
trainIndex <- createDataPartition(df2$loan_status,
p = 0.75,
list = F)
train <- df2[trainIndex,]
test <- df2[-trainIndex,]
# cross validation train control
ctrl <- trainControl(method = 'cv', number = 10)lda <- train(loan_status ~ .,
data = train,
method = 'lda',
trControl = ctrl
)
lda## Linear Discriminant Analysis
##
## 461 samples
## 11 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 416, 415, 415, 415, 415, 414, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8136139 0.4947024
knn <- train(loan_status ~ .,
data = train,
method = 'knn',
trControl = ctrl
)
knn## k-Nearest Neighbors
##
## 461 samples
## 11 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 415, 415, 414, 416, 414, 415, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6075085 -0.034867070
## 7 0.6334125 0.001357728
## 9 0.6680101 0.062579260
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
cart <- train(loan_status ~ .,
data = train,
method = 'rpart',
trControl = ctrl
)
cart## CART
##
## 461 samples
## 11 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 414, 414, 416, 415, 415, 416, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.000000000 0.7487584 0.3850676
## 0.007638889 0.7683256 0.4138454
## 0.423611111 0.7177963 0.1224685
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.007638889.
rf <- train(loan_status ~ .,
data = train,
method = 'rf',
trControl = ctrl
)
rf## Random Forest
##
## 461 samples
## 11 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 416, 414, 415, 414, 416, 414, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8047250 0.4695397
## 10 0.7875609 0.4631458
## 18 0.7810854 0.4448609
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# calculate jpredictions
test$lda <- predict(lda, test)
test$knn <- predict(knn, test)
test$cart <- predict(cart, test)
test$rf <- predict(rf, test)
table(test$loan_status, test$lda, dnn = c('approval status','LDA predictions'))## LDA predictions
## approval status N Y
## N 25 23
## Y 1 104
table(test$loan_status, test$knn, dnn = c('approval status','KNN predictions'))## KNN predictions
## approval status N Y
## N 8 40
## Y 13 92
table(test$loan_status, test$cart, dnn = c('approval status','CART predictions'))## CART predictions
## approval status N Y
## N 28 20
## Y 1 104
table(test$loan_status, test$rf, dnn = c('approval status','RF predictions'))## RF predictions
## approval status N Y
## N 28 20
## Y 1 104
# model accuracy table
data.frame(accuracy = rbind(
sum(test$loan_status == test$lda) / nrow(test),
sum(test$loan_status == test$knn) / nrow(test),
sum(test$loan_status == test$cart) / nrow(test),
sum(test$loan_status == test$rf) / nrow(test)
),
row.names = c('LDA',
'KNN',
'CART',
'Random Forest'
)
)## accuracy
## LDA 0.8431373
## KNN 0.6535948
## CART 0.8627451
## Random Forest 0.8627451