load the data set

EDA

## number of rows, then number of columns
c(nrow(loan), ncol(loan))
## [1] 45000    14
## first ten rows of the data
loan %>% slice_head(n = 10)
## # A tibble: 10 × 14
##    person_age person_gender person_education person_income person_emp_exp
##         <dbl> <fct>         <fct>                    <dbl>          <dbl>
##  1         22 female        Master                   71948              0
##  2         21 female        High School              12282              0
##  3         25 female        High School              12438              3
##  4         23 female        Bachelor                 79753              0
##  5         24 male          Master                   66135              1
##  6         21 female        High School              12951              0
##  7         26 female        Bachelor                 93471              1
##  8         24 female        High School              95550              5
##  9         24 female        Associate               100684              3
## 10         21 female        High School              12739              0
## # ℹ 9 more variables: person_home_ownership <fct>, loan_amnt <dbl>,
## #   loan_intent <fct>, loan_int_rate <dbl>, loan_percent_income <dbl>,
## #   cb_person_cred_hist_length <dbl>, credit_score <dbl>,
## #   previous_loan_defaults_on_file <fct>, loan_status <fct>
str(loan)   ## structure of the data: class, dimensions, and the type and first values of every column
## spc_tbl_ [45,000 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ person_age                    : num [1:45000] 22 21 25 23 24 21 26 24 24 21 ...
##  $ person_gender                 : Factor w/ 2 levels "female","male": 1 1 1 1 2 1 1 1 1 1 ...
##  $ person_education              : Factor w/ 5 levels "Master","High School",..: 1 2 2 3 1 2 3 2 4 2 ...
##  $ person_income                 : num [1:45000] 71948 12282 12438 79753 66135 ...
##  $ person_emp_exp                : num [1:45000] 0 0 3 0 1 0 1 5 3 0 ...
##  $ person_home_ownership         : Factor w/ 4 levels "RENT","OWN","MORTGAGE",..: 1 2 3 1 1 2 1 1 1 2 ...
##  $ loan_amnt                     : num [1:45000] 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
##  $ loan_intent                   : Factor w/ 6 levels "PERSONAL","EDUCATION",..: 1 2 3 3 3 4 2 3 1 4 ...
##  $ loan_int_rate                 : num [1:45000] 16 11.1 12.9 15.2 14.3 ...
##  $ loan_percent_income           : num [1:45000] 0.49 0.08 0.44 0.44 0.53 0.19 0.37 0.37 0.35 0.13 ...
##  $ cb_person_cred_hist_length    : num [1:45000] 3 2 3 2 4 2 3 4 2 3 ...
##  $ credit_score                  : num [1:45000] 561 504 635 675 586 532 701 585 544 640 ...
##  $ previous_loan_defaults_on_file: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ loan_status                   : Factor w/ 2 levels "1","0": 1 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   person_age = col_number(),
##   ..   person_gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   person_education = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   person_income = col_number(),
##   ..   person_emp_exp = col_number(),
##   ..   person_home_ownership = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   loan_amnt = col_number(),
##   ..   loan_intent = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   loan_int_rate = col_number(),
##   ..   loan_percent_income = col_number(),
##   ..   cb_person_cred_hist_length = col_number(),
##   ..   credit_score = col_number(),
##   ..   previous_loan_defaults_on_file = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   loan_status = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
##   .. )
##  - attr(*, "problems")=<externalptr>
glimpse(loan)   ### transposed preview: one line per column with its type and first values
## Rows: 45,000
## Columns: 14
## $ person_age                     <dbl> 22, 21, 25, 23, 24, 21, 26, 24, 24, 21,…
## $ person_gender                  <fct> female, female, female, female, male, f…
## $ person_education               <fct> Master, High School, High School, Bache…
## $ person_income                  <dbl> 71948, 12282, 12438, 79753, 66135, 1295…
## $ person_emp_exp                 <dbl> 0, 0, 3, 0, 1, 0, 1, 5, 3, 0, 0, 0, 3, …
## $ person_home_ownership          <fct> RENT, OWN, MORTGAGE, RENT, RENT, OWN, R…
## $ loan_amnt                      <dbl> 35000, 1000, 5500, 35000, 35000, 2500, …
## $ loan_intent                    <fct> PERSONAL, EDUCATION, MEDICAL, MEDICAL, …
## $ loan_int_rate                  <dbl> 16.02, 11.14, 12.87, 15.23, 14.27, 7.14…
## $ loan_percent_income            <dbl> 0.49, 0.08, 0.44, 0.44, 0.53, 0.19, 0.3…
## $ cb_person_cred_hist_length     <dbl> 3, 2, 3, 2, 4, 2, 3, 4, 2, 3, 4, 2, 2, …
## $ credit_score                   <dbl> 561, 504, 635, 675, 586, 532, 701, 585,…
## $ previous_loan_defaults_on_file <fct> No, Yes, No, No, No, No, No, No, No, No…
## $ loan_status                    <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
summary(loan)   ### per-column summary: quantiles and mean for numeric columns, level counts for factors
##    person_age     person_gender     person_education person_income    
##  Min.   : 20.00   female:20159   Master     : 6980   Min.   :   8000  
##  1st Qu.: 24.00   male  :24841   High School:11972   1st Qu.:  47204  
##  Median : 26.00                  Bachelor   :13399   Median :  67048  
##  Mean   : 27.76                  Associate  :12028   Mean   :  80319  
##  3rd Qu.: 30.00                  Doctorate  :  621   3rd Qu.:  95789  
##  Max.   :144.00                                      Max.   :7200766  
##  person_emp_exp   person_home_ownership   loan_amnt    
##  Min.   :  0.00   RENT    :23443        Min.   :  500  
##  1st Qu.:  1.00   OWN     : 2951        1st Qu.: 5000  
##  Median :  4.00   MORTGAGE:18489        Median : 8000  
##  Mean   :  5.41   OTHER   :  117        Mean   : 9583  
##  3rd Qu.:  8.00                         3rd Qu.:12237  
##  Max.   :125.00                         Max.   :35000  
##             loan_intent   loan_int_rate   loan_percent_income
##  PERSONAL         :7552   Min.   : 5.42   Min.   :0.0000     
##  EDUCATION        :9153   1st Qu.: 8.59   1st Qu.:0.0700     
##  MEDICAL          :8548   Median :11.01   Median :0.1200     
##  VENTURE          :7819   Mean   :11.01   Mean   :0.1397     
##  HOMEIMPROVEMENT  :4783   3rd Qu.:12.99   3rd Qu.:0.1900     
##  DEBTCONSOLIDATION:7145   Max.   :20.00   Max.   :0.6600     
##  cb_person_cred_hist_length  credit_score   previous_loan_defaults_on_file
##  Min.   : 2.000             Min.   :390.0   No :22142                     
##  1st Qu.: 3.000             1st Qu.:601.0   Yes:22858                     
##  Median : 4.000             Median :640.0                                 
##  Mean   : 5.867             Mean   :632.6                                 
##  3rd Qu.: 8.000             3rd Qu.:670.0                                 
##  Max.   :30.000             Max.   :850.0                                 
##  loan_status
##  1:10000    
##  0:35000    
##             
##             
##             
## 
### summary restricted to the numeric columns
summary(select(loan, where(is.numeric)))
##    person_age     person_income     person_emp_exp     loan_amnt    
##  Min.   : 20.00   Min.   :   8000   Min.   :  0.00   Min.   :  500  
##  1st Qu.: 24.00   1st Qu.:  47204   1st Qu.:  1.00   1st Qu.: 5000  
##  Median : 26.00   Median :  67048   Median :  4.00   Median : 8000  
##  Mean   : 27.76   Mean   :  80319   Mean   :  5.41   Mean   : 9583  
##  3rd Qu.: 30.00   3rd Qu.:  95789   3rd Qu.:  8.00   3rd Qu.:12237  
##  Max.   :144.00   Max.   :7200766   Max.   :125.00   Max.   :35000  
##  loan_int_rate   loan_percent_income cb_person_cred_hist_length  credit_score  
##  Min.   : 5.42   Min.   :0.0000      Min.   : 2.000             Min.   :390.0  
##  1st Qu.: 8.59   1st Qu.:0.0700      1st Qu.: 3.000             1st Qu.:601.0  
##  Median :11.01   Median :0.1200      Median : 4.000             Median :640.0  
##  Mean   :11.01   Mean   :0.1397      Mean   : 5.867             Mean   :632.6  
##  3rd Qu.:12.99   3rd Qu.:0.1900      3rd Qu.: 8.000             3rd Qu.:670.0  
##  Max.   :20.00   Max.   :0.6600      Max.   :30.000             Max.   :850.0
### summary restricted to the factor columns
summary(select(loan, where(is.factor)))
##  person_gender     person_education person_home_ownership
##  female:20159   Master     : 6980   RENT    :23443       
##  male  :24841   High School:11972   OWN     : 2951       
##                 Bachelor   :13399   MORTGAGE:18489       
##                 Associate  :12028   OTHER   :  117       
##                 Doctorate  :  621                        
##                                                          
##             loan_intent   previous_loan_defaults_on_file loan_status
##  PERSONAL         :7552   No :22142                      1:10000    
##  EDUCATION        :9153   Yes:22858                      0:35000    
##  MEDICAL          :8548                                             
##  VENTURE          :7819                                             
##  HOMEIMPROVEMENT  :4783                                             
##  DEBTCONSOLIDATION:7145
### count the missing values in every column
vapply(loan, function(column) sum(is.na(column)), integer(1))
##                     person_age                  person_gender 
##                              0                              0 
##               person_education                  person_income 
##                              0                              0 
##                 person_emp_exp          person_home_ownership 
##                              0                              0 
##                      loan_amnt                    loan_intent 
##                              0                              0 
##                  loan_int_rate            loan_percent_income 
##                              0                              0 
##     cb_person_cred_hist_length                   credit_score 
##                              0                              0 
## previous_loan_defaults_on_file                    loan_status 
##                              0                              0
### total number of missing cells in the whole table
loan %>% is.na() %>% sum()
## [1] 0
### visualization


### Overlaid age histograms, one per education level.
### position = "identity" draws every group from the baseline, so the bars are
### made semi-transparent to keep the groups behind visible; bins = 30 states
### the bin count explicitly instead of relying on the stat_bin() default.
ggplot(loan, aes(x = person_age, fill = person_education, color = person_education)) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 30)

This shows that person_age is skewed to the right: most applicants are in their twenties, with a long tail of older ages.

### one histogram per numeric column
hist.data.frame(keep(loan, is.numeric))

Barchart with person income and person home ownership by person gender

### Barchart
### Mean income per home-ownership category, split by gender.
### Plotting the raw rows with stat = "identity" draws one bar per applicant on
### top of each other inside every dodge slot, so only the tallest (maximum
### income) bar is visible; aggregate first so each bar is a real summary.
loan %>%
  group_by(person_home_ownership, person_gender) %>%
  summarise(person_income = mean(person_income, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(person_home_ownership, person_income, fill = person_gender)) +
  geom_col(position = "dodge") +
  labs(y = "mean person_income")

Plot of person age vs person income

### Age against income, one colour per home-ownership category.
### geom_point() ignores `fill` for its default shape, so the grouping variable
### is mapped to `color` to make the groups distinguishable in the scatter.
loan %>%
  ggplot(aes(person_age, person_income, color = person_home_ownership)) +
  geom_point() +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

### smoothed income-by-age trend, one curve per gender
ggplot(loan, aes(x = person_age, y = person_income, color = person_gender)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Frequency polygon of person age by person education

### age distribution as frequency polygons (10-year bins), one line per education level
ggplot(loan, aes(x = person_age, colour = person_education)) +
  geom_freqpoly(binwidth = 10)

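Replace implausible values with NA (the rows themselves are kept): person age greater than 100 years, person income greater than 7200764, and person employment experience greater than 124 years.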

### Data cleaning
### Blank out implausible values (set them to NA) instead of dropping the rows:
###   - person_age above 100
###   - person_income above 7200764
###   - person_emp_exp above 124
### NOTE(review): the summary() that follows still shows a person_emp_exp
### maximum of 124 -- confirm that "> 124" is the intended cutoff.
loan <- loan %>%
  mutate(
    person_age     = replace(person_age, person_age > 100, NA),
    person_income  = replace(person_income, person_income > 7200764, NA),
    person_emp_exp = replace(person_emp_exp, person_emp_exp > 124, NA)
  )

### re-run the summary to check the cleaning step (NA counts now appear for the edited columns)
summary(loan)
##    person_age    person_gender     person_education person_income    
##  Min.   :20.00   female:20159   Master     : 6980   Min.   :   8000  
##  1st Qu.:24.00   male  :24841   High School:11972   1st Qu.:  47202  
##  Median :26.00                  Bachelor   :13399   Median :  67048  
##  Mean   :27.75                  Associate  :12028   Mean   :  80161  
##  3rd Qu.:30.00                  Doctorate  :  621   3rd Qu.:  95786  
##  Max.   :94.00                                      Max.   :5556399  
##  NA's   :7                                          NA's   :1        
##  person_emp_exp    person_home_ownership   loan_amnt    
##  Min.   :  0.000   RENT    :23443        Min.   :  500  
##  1st Qu.:  1.000   OWN     : 2951        1st Qu.: 5000  
##  Median :  4.000   MORTGAGE:18489        Median : 8000  
##  Mean   :  5.408   OTHER   :  117        Mean   : 9583  
##  3rd Qu.:  8.000                         3rd Qu.:12237  
##  Max.   :124.000                         Max.   :35000  
##  NA's   :1                                              
##             loan_intent   loan_int_rate   loan_percent_income
##  PERSONAL         :7552   Min.   : 5.42   Min.   :0.0000     
##  EDUCATION        :9153   1st Qu.: 8.59   1st Qu.:0.0700     
##  MEDICAL          :8548   Median :11.01   Median :0.1200     
##  VENTURE          :7819   Mean   :11.01   Mean   :0.1397     
##  HOMEIMPROVEMENT  :4783   3rd Qu.:12.99   3rd Qu.:0.1900     
##  DEBTCONSOLIDATION:7145   Max.   :20.00   Max.   :0.6600     
##                                                              
##  cb_person_cred_hist_length  credit_score   previous_loan_defaults_on_file
##  Min.   : 2.000             Min.   :390.0   No :22142                     
##  1st Qu.: 3.000             1st Qu.:601.0   Yes:22858                     
##  Median : 4.000             Median :640.0                                 
##  Mean   : 5.867             Mean   :632.6                                 
##  3rd Qu.: 8.000             3rd Qu.:670.0                                 
##  Max.   :30.000             Max.   :850.0                                 
##                                                                           
##  loan_status
##  1:10000    
##  0:35000    
##             
##             
##             
##             
## 

Data wrangling

### mean income per education level (NAs ignored), highest first
loan %>%
  group_by(person_education) %>%
  summarise(Mean = mean(person_income, na.rm = TRUE)) %>%
  arrange(desc(Mean))
## # A tibble: 5 × 2
##   person_education   Mean
##   <fct>             <dbl>
## 1 Doctorate        87235.
## 2 Master           80492.
## 3 High School      80225.
## 4 Associate        80050.
## 5 Bachelor         79703.
### bar graph of the number of applicants per education level
loan %>%
  count(person_education) %>%
  ggplot(aes(person_education, n, fill = person_education)) +
  geom_col()

### bar chart of loan intent, most frequent intent first
loan %>%
  count(loan_intent) %>%
  ggplot(aes(reorder(loan_intent, -n), n, fill = loan_intent)) +
  geom_col()

### bar chart of loan intent with side-by-side bars per gender

loan %>%
  count(loan_intent, person_gender) %>%
  ggplot(aes(reorder(loan_intent, -n), n, fill = person_gender)) +
  geom_col(position = "dodge")

### cross-tabulate loan intent (rows) against gender (columns)
table(loan$loan_intent, loan$person_gender)
##                    
##                     female male
##   PERSONAL            3394 4158
##   EDUCATION           4079 5074
##   MEDICAL             3885 4663
##   VENTURE             3490 4329
##   HOMEIMPROVEMENT     2140 2643
##   DEBTCONSOLIDATION   3171 3974
### Scatter of income against age, coloured by education level.
### outlier.shape is a geom_boxplot() argument; geom_point() does not know it
### and only warns about an ignored parameter, so it is dropped here.
loan %>%
  ggplot(aes(person_income, person_age)) +
  geom_point(aes(color = person_education), alpha = 0.2)

### income per education level on a log10 axis: boxplots (outliers hidden)
### overlaid with the jittered raw observations
ggplot(loan, aes(x = person_education, y = person_income)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(alpha = 0.4, color = "tomato") +
  scale_y_log10()

### six highest incomes among rows whose age survived the cleaning step
loan %>%
  select(person_age, person_income, person_education) %>%
  filter(!is.na(person_age)) %>%
  arrange(desc(person_income)) %>%
  head()
## # A tibble: 6 × 3
##   person_age person_income person_education
##        <dbl>         <dbl> <fct>           
## 1         42       2448661 High School     
## 2         60       2280980 High School     
## 3         63       2139143 High School     
## 4         42       2012954 High School     
## 5         46       1741243 Associate       
## 6         44       1728974 High School

Splitting the data

### reproducible 75% / 25% split: draw the training row numbers without
### replacement, the remaining rows form the test set
set.seed(1234)
sample_index <- sample(
  x       = nrow(loan),
  size    = round(0.75 * nrow(loan)),
  replace = FALSE
)

train_data <- loan[sample_index, ]
test_data  <- loan[-sample_index, ]

Checking for class imbalance

### share of each loan_status class in the full data
loan %>% select(loan_status) %>% table() %>% prop.table() %>% round(2)
## loan_status
##    1    0 
## 0.22 0.78
### share of each loan_status class in the training set
train_data %>% select(loan_status) %>% table() %>% prop.table() %>% round(2)
## loan_status
##    1    0 
## 0.22 0.78
### share of each loan_status class in the test set
test_data %>% select(loan_status) %>% table() %>% prop.table() %>% round(2)
## loan_status
##    1    0 
## 0.23 0.77

Training the model

library(rpart)
### classification tree for loan_status using every other column as a predictor
loan_mod <- rpart(
  loan_status ~ .,
  method = "class",
  data   = train_data
)

Evaluating the model

library(rpart.plot)
### draw the fitted classification tree
rpart.plot(loan_mod)

Prediction on test data

### predicted class for every held-out row
loan_pred <- predict(loan_mod, newdata = test_data, type = "class")

### actual status (rows) against predicted class (columns)
loan_pred_table <- table(test_data$loan_status, loan_pred)
print(loan_pred_table)
##    loan_pred
##        1    0
##   1 1776  762
##   0  258 8454
### accuracy: fraction of test rows whose predicted class equals the actual one
mean(loan_pred == test_data$loan_status)
## [1] 0.9093333

Confusion Matrix

library(caret)
### full set of classification metrics for the tree on the test set
confusionMatrix(data = loan_pred, reference = test_data$loan_status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 1776  258
##          0  762 8454
##                                                
##                Accuracy : 0.9093               
##                  95% CI : (0.9039, 0.9146)     
##     No Information Rate : 0.7744               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7209               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.6998               
##             Specificity : 0.9704               
##          Pos Pred Value : 0.8732               
##          Neg Pred Value : 0.9173               
##              Prevalence : 0.2256               
##          Detection Rate : 0.1579               
##    Detection Prevalence : 0.1808               
##       Balanced Accuracy : 0.8351               
##                                                
##        'Positive' Class : 1                    
## 

Build the model for randomForest algorithm

### drop every row that contains an NA in any column (the cleaning step
### earlier in this report set some values to NA)
loan <- na.omit(loan)

### confirm the outcome is a factor before fitting the classifier
is.factor(loan$loan_status)
## [1] TRUE

Splitting the data

### redo the reproducible 75% / 25% split on the NA-free data
set.seed(1234)
sample_index <- sample(
  x       = nrow(loan),
  size    = round(0.75 * nrow(loan)),
  replace = FALSE
)

train_data <- loan[sample_index, ]
test_data  <- loan[-sample_index, ]

Building the model

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
### random forest on all predictors: 2001 trees, 4 candidate variables per
### split, with variable importance recorded for later inspection
loan_forest <- randomForest(
  loan_status ~ .,
  data       = train_data,
  mtry       = 4,
  ntree      = 2001,
  importance = TRUE
)

### print the fitted forest: call, OOB error estimate and OOB confusion matrix
loan_forest
## 
## Call:
##  randomForest(formula = loan_status ~ ., data = train_data, mtry = 4,      ntree = 2001, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 2001
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 7.01%
## Confusion matrix:
##      1     0 class.error
## 1 5803  1657   0.2221180
## 0  710 25575   0.0270116

Prediction on train data

### score the forest on the same rows it was fitted on
### NOTE(review): this is a resubstitution estimate -- the matrix below reports
### accuracy 1, whereas the out-of-bag error printed for loan_forest is 7.01%;
### treat the OOB and test-set figures as the honest ones.
Randomforest_pred <- predict(loan_forest, train_data)

confusionMatrix(Randomforest_pred, train_data$loan_status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     1     0
##          1  7460     0
##          0     0 26285
##                                                
##                Accuracy : 1                    
##                  95% CI : (0.9999, 1)          
##     No Information Rate : 0.7789               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 1                    
##                                                
##  Mcnemar's Test P-Value : NA                   
##                                                
##             Sensitivity : 1.0000               
##             Specificity : 1.0000               
##          Pos Pred Value : 1.0000               
##          Neg Pred Value : 1.0000               
##              Prevalence : 0.2211               
##          Detection Rate : 0.2211               
##    Detection Prevalence : 0.2211               
##       Balanced Accuracy : 1.0000               
##                                                
##        'Positive' Class : 1                    
## 

Prediction on test data

### predicted class for every held-out row, then the full set of metrics
Randomforest_pred_test <- predict(loan_forest, newdata = test_data)

confusionMatrix(data = Randomforest_pred_test, reference = test_data$loan_status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 1969  234
##          0  571 8474
##                                                
##                Accuracy : 0.9284               
##                  95% CI : (0.9235, 0.9331)     
##     No Information Rate : 0.7742               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7852               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.7752               
##             Specificity : 0.9731               
##          Pos Pred Value : 0.8938               
##          Neg Pred Value : 0.9369               
##              Prevalence : 0.2258               
##          Detection Rate : 0.1751               
##    Detection Prevalence : 0.1959               
##       Balanced Accuracy : 0.8742               
##                                                
##        'Positive' Class : 1                    
## 

Variable importance

### importance table: per-class scores, MeanDecreaseAccuracy and MeanDecreaseGini for every predictor
importance(loan_forest)
##                                         1          0 MeanDecreaseAccuracy
## person_age                      25.360441  79.430488            88.529711
## person_gender                   -1.575742   4.822740             2.873238
## person_education                 1.282713  -2.024981            -0.852207
## person_income                  115.957079 112.762517           155.711802
## person_emp_exp                  19.134173  66.373238            74.146195
## person_home_ownership          375.812915  89.044169           250.827628
## loan_amnt                       51.852165  65.572205            82.220317
## loan_intent                    200.992054 109.707427           201.944859
## loan_int_rate                  520.388295 206.627402           476.813236
## loan_percent_income            268.535783  81.374757           187.011754
## cb_person_cred_hist_length      29.623032  49.374757            64.314848
## credit_score                     8.042307 138.285851           118.918321
## previous_loan_defaults_on_file 306.463117 476.570506           394.800581
##                                MeanDecreaseGini
## person_age                            319.23829
## person_gender                          67.50246
## person_education                      216.09924
## person_income                        1387.03909
## person_emp_exp                        290.84943
## person_home_ownership                 798.52541
## loan_amnt                             598.62179
## loan_intent                           523.32393
## loan_int_rate                        1862.53986
## loan_percent_income                  1946.16539
## cb_person_cred_hist_length            264.28452
## credit_score                          626.04201
## previous_loan_defaults_on_file       2718.72240
### plot the variable-importance measures of the fitted forest
varImpPlot(loan_forest)

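In conclusion, the random forest algorithm is better suited to this data than the decision tree algorithm: its test-set accuracy is 92.84%, compared with 90.93% for the decision tree. Note that the two models were scored on slightly different test sets, because rows with missing values were dropped and the data was split again before the random forest was fitted.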