Homework 3 Data 622 Section 2 Group 5

Loan Approval Status

We will be working with a dataset of loan approval status information. The task is to develop models to predict loan approval status with the given feature variables. After a preliminary exploratory data analysis, we will fit Linear Discriminant, K-Nearest Neighbors, Decision Trees and Random Forest models to a subset of the data and evaluate performance on a hold-out data set.

Import Data

To begin, the following code will import the necessary libraries and import the data:

library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)

# import data
url <- 'https://raw.githubusercontent.com/SmilodonCub/DATA622/master/Loan_approval.csv'
df <- read.csv(url)

# convert column names to lowercase
names(df) <- lapply(names(df), tolower)

# quick look at what the data structure looks like
glimpse(df)

## Rows: 614
## Columns: 13
## $ loan_id           <chr> "LP001002", "LP001003", "LP001005", "LP001006", "LP0~
## $ gender            <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Mal~
## $ married           <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "Yes"~
## $ dependents        <chr> "0", "1", "0", "0", "0", "2", "0", "3+", "2", "1", "~
## $ education         <chr> "Graduate", "Graduate", "Graduate", "Not Graduate", ~
## $ self_employed     <chr> "No", "No", "Yes", "No", "No", "Yes", "No", "No", "N~
## $ applicantincome   <int> 5849, 4583, 3000, 2583, 6000, 5417, 2333, 3036, 4006~
## $ coapplicantincome <dbl> 0, 1508, 0, 2358, 0, 4196, 1516, 2504, 1526, 10968, ~
## $ loanamount        <int> NA, 128, 66, 120, 141, 267, 95, 158, 168, 349, 70, 1~
## $ loan_amount_term  <int> 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 36~
## $ credit_history    <int> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, NA, ~
## $ property_area     <chr> "Urban", "Rural", "Urban", "Urban", "Urban", "Urban"~
## $ loan_status       <chr> "Y", "N", "Y", "Y", "Y", "Y", "Y", "N", "Y", "N", "Y~

# summary of each field
summary(df)

##    loan_id             gender            married           dependents       
##  Length:614         Length:614         Length:614         Length:614        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   education         self_employed      applicantincome coapplicantincome
##  Length:614         Length:614         Min.   :  150   Min.   :    0    
##  Class :character   Class :character   1st Qu.: 2878   1st Qu.:    0    
##  Mode  :character   Mode  :character   Median : 3812   Median : 1188    
##                                        Mean   : 5403   Mean   : 1621    
##                                        3rd Qu.: 5795   3rd Qu.: 2297    
##                                        Max.   :81000   Max.   :41667    
##                                                                         
##    loanamount    loan_amount_term credit_history   property_area     
##  Min.   :  9.0   Min.   : 12      Min.   :0.0000   Length:614        
##  1st Qu.:100.0   1st Qu.:360      1st Qu.:1.0000   Class :character  
##  Median :128.0   Median :360      Median :1.0000   Mode  :character  
##  Mean   :146.4   Mean   :342      Mean   :0.8422                     
##  3rd Qu.:168.0   3rd Qu.:360      3rd Qu.:1.0000                     
##  Max.   :700.0   Max.   :480      Max.   :1.0000                     
##  NA's   :22      NA's   :14       NA's   :50                         
##  loan_status       
##  Length:614        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# remove loan ID
df <- df %>%
  select(-loan_id)

Missing Values

VIM::aggr(df, 
          col=c('green','red'), 
          numbers = T, 
          sortVars = T,
          cex.axis = .5,
          ylab=c("Proportion of Data", "Combinations and Percentiles")
          )

credit_history has ~8% missing values.
loanamount has ~4% missing values.
loan_amount_term has ~2% missing values.

Before any analysis and based on common knowledge, it would be a fair guess to expect these three variables to have considerable influence for any loan approval status.

Correlation Plot

plot_corr_matrix(df, -1)

Distributions of Numeric Variables

# numeric distributions
df %>%
  select_if(is.numeric) %>%
  bind_cols(select(df, loan_status)) %>%
  gather(var, val, -loan_status) %>%
  ggplot(aes(x = val, fill = loan_status)) +
  geom_density(alpha = .3) +
  facet_wrap(~var, scales = 'free') +
  theme_bw() + 
  labs(x = element_blank(),
       y = element_blank(),
       title = 'Distribution of Numeric Variables by Loan Approval Status'
       )

Credit history seems to have a clear affect on loan approval status, with credit history meeting fully meeting guidelines showing higher rates of approvals.

Distributions of Categorical Variables

# df %>%
#   select_if(negate(is.numeric)) %>%
#   gather(var, value, -loan_status) %>%
#   ggplot(aes(x = value, fill = loan_status)) +
#   geom_bar(position = 'dodge') +
#   facet_wrap(~var, scales = 'free')

yes_count <- sum(df$loan_status == 'Y')
no_count <- sum(df$loan_status == 'N')
  
df %>%
  select_if(negate(is.numeric)) %>%
  gather(var, value, -loan_status) %>%
  group_by(var, value, loan_status) %>%
  summarize(count = n(),
            .groups = 'drop') %>%
  mutate(prop = count / ifelse(loan_status == 'Y', yes_count, no_count)) %>%
  ggplot(aes(x = value, y = prop, fill = loan_status)) +
  geom_col(position = 'dodge') +
  facet_wrap(~var, scales = 'free') +
  theme_bw() +
  labs(y = 'Frequency Proportion',
       x = element_blank(),
       title = 'Frequency Distributions For Non-Numeric Variables') +
  scale_y_continuous(labels = percent_format(accuracy = 1))

Data Prep

# impute NA
preproc <- preProcess(df, 'bagImpute')
df2 <- predict(preproc, df)

# train test split
set.seed(101)
trainIndex <- createDataPartition(df2$loan_status,
                                  p = 0.75,
                                  list = F)

train <- df2[trainIndex,]
test <- df2[-trainIndex,]

# cross validation train control
ctrl <- trainControl(method = 'cv', number = 10)

LDA

lda <- train(loan_status ~ .,
             data = train,
             method = 'lda',
             trControl = ctrl
             )

lda

## Linear Discriminant Analysis 
## 
## 461 samples
##  11 predictor
##   2 classes: 'N', 'Y' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 416, 415, 415, 415, 415, 414, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8136139  0.4947024

KNN

knn <- train(loan_status ~ .,
             data = train,
             method = 'knn',
             trControl = ctrl
             )

knn

## k-Nearest Neighbors 
## 
## 461 samples
##  11 predictor
##   2 classes: 'N', 'Y' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 415, 415, 414, 416, 414, 415, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa       
##   5  0.6075085  -0.034867070
##   7  0.6334125   0.001357728
##   9  0.6680101   0.062579260
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Decision Tree

cart <- train(loan_status ~ .,
              data = train,
              method = 'rpart',
              trControl = ctrl
              )

cart

## CART 
## 
## 461 samples
##  11 predictor
##   2 classes: 'N', 'Y' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 414, 414, 416, 415, 415, 416, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.000000000  0.7487584  0.3850676
##   0.007638889  0.7683256  0.4138454
##   0.423611111  0.7177963  0.1224685
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.007638889.

Random Forest

rf <- train(loan_status ~ .,
            data = train,
            method = 'rf',
            trControl = ctrl
            )

rf

## Random Forest 
## 
## 461 samples
##  11 predictor
##   2 classes: 'N', 'Y' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 416, 414, 415, 414, 416, 414, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8047250  0.4695397
##   10    0.7875609  0.4631458
##   18    0.7810854  0.4448609
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Confusion Matrix

# calculate jpredictions
test$lda <- predict(lda, test)
test$knn <- predict(knn, test)
test$cart <- predict(cart, test)
test$rf <- predict(rf, test)

table(test$loan_status, test$lda, dnn = c('approval status','LDA predictions'))

##                LDA predictions
## approval status   N   Y
##               N  25  23
##               Y   1 104

table(test$loan_status, test$knn, dnn = c('approval status','KNN predictions'))

##                KNN predictions
## approval status  N  Y
##               N  8 40
##               Y 13 92

table(test$loan_status, test$cart, dnn = c('approval status','CART predictions'))

##                CART predictions
## approval status   N   Y
##               N  28  20
##               Y   1 104

table(test$loan_status, test$rf, dnn = c('approval status','RF predictions'))

##                RF predictions
## approval status   N   Y
##               N  28  20
##               Y   1 104

Comparison of Model Accuracy

# model accuracy table
data.frame(accuracy = rbind(
  sum(test$loan_status == test$lda) / nrow(test),
  sum(test$loan_status == test$knn) / nrow(test),
  sum(test$loan_status == test$cart) / nrow(test),
  sum(test$loan_status == test$rf) / nrow(test)
),
  row.names = c('LDA',
                'KNN',
                'CART', 
                'Random Forest'
                )
)

##                accuracy
## LDA           0.8431373
## KNN           0.6535948
## CART          0.8627451
## Random Forest 0.8627451