load the data set
EDA
dim(loan) ## dimensions of the data: number of rows, then number of columns
## [1] 45000 14
head(loan, 10) ## preview the first 10 rows
## # A tibble: 10 × 14
## person_age person_gender person_education person_income person_emp_exp
## <dbl> <fct> <fct> <dbl> <dbl>
## 1 22 female Master 71948 0
## 2 21 female High School 12282 0
## 3 25 female High School 12438 3
## 4 23 female Bachelor 79753 0
## 5 24 male Master 66135 1
## 6 21 female High School 12951 0
## 7 26 female Bachelor 93471 1
## 8 24 female High School 95550 5
## 9 24 female Associate 100684 3
## 10 21 female High School 12739 0
## # ℹ 9 more variables: person_home_ownership <fct>, loan_amnt <dbl>,
## # loan_intent <fct>, loan_int_rate <dbl>, loan_percent_income <dbl>,
## # cb_person_cred_hist_length <dbl>, credit_score <dbl>,
## # previous_loan_defaults_on_file <fct>, loan_status <fct>
str(loan) ## structure of the data: type of every column and the levels of each factor
## spc_tbl_ [45,000 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ person_age : num [1:45000] 22 21 25 23 24 21 26 24 24 21 ...
## $ person_gender : Factor w/ 2 levels "female","male": 1 1 1 1 2 1 1 1 1 1 ...
## $ person_education : Factor w/ 5 levels "Master","High School",..: 1 2 2 3 1 2 3 2 4 2 ...
## $ person_income : num [1:45000] 71948 12282 12438 79753 66135 ...
## $ person_emp_exp : num [1:45000] 0 0 3 0 1 0 1 5 3 0 ...
## $ person_home_ownership : Factor w/ 4 levels "RENT","OWN","MORTGAGE",..: 1 2 3 1 1 2 1 1 1 2 ...
## $ loan_amnt : num [1:45000] 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
## $ loan_intent : Factor w/ 6 levels "PERSONAL","EDUCATION",..: 1 2 3 3 3 4 2 3 1 4 ...
## $ loan_int_rate : num [1:45000] 16 11.1 12.9 15.2 14.3 ...
## $ loan_percent_income : num [1:45000] 0.49 0.08 0.44 0.44 0.53 0.19 0.37 0.37 0.35 0.13 ...
## $ cb_person_cred_hist_length : num [1:45000] 3 2 3 2 4 2 3 4 2 3 ...
## $ credit_score : num [1:45000] 561 504 635 675 586 532 701 585 544 640 ...
## $ previous_loan_defaults_on_file: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ loan_status : Factor w/ 2 levels "1","0": 1 2 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. person_age = col_number(),
## .. person_gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. person_education = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. person_income = col_number(),
## .. person_emp_exp = col_number(),
## .. person_home_ownership = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. loan_amnt = col_number(),
## .. loan_intent = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. loan_int_rate = col_number(),
## .. loan_percent_income = col_number(),
## .. cb_person_cred_hist_length = col_number(),
## .. credit_score = col_number(),
## .. previous_loan_defaults_on_file = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. loan_status = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
## .. )
## - attr(*, "problems")=<externalptr>
glimpse(loan) ### transposed preview: one line per column with its type and first values
## Rows: 45,000
## Columns: 14
## $ person_age <dbl> 22, 21, 25, 23, 24, 21, 26, 24, 24, 21,…
## $ person_gender <fct> female, female, female, female, male, f…
## $ person_education <fct> Master, High School, High School, Bache…
## $ person_income <dbl> 71948, 12282, 12438, 79753, 66135, 1295…
## $ person_emp_exp <dbl> 0, 0, 3, 0, 1, 0, 1, 5, 3, 0, 0, 0, 3, …
## $ person_home_ownership <fct> RENT, OWN, MORTGAGE, RENT, RENT, OWN, R…
## $ loan_amnt <dbl> 35000, 1000, 5500, 35000, 35000, 2500, …
## $ loan_intent <fct> PERSONAL, EDUCATION, MEDICAL, MEDICAL, …
## $ loan_int_rate <dbl> 16.02, 11.14, 12.87, 15.23, 14.27, 7.14…
## $ loan_percent_income <dbl> 0.49, 0.08, 0.44, 0.44, 0.53, 0.19, 0.3…
## $ cb_person_cred_hist_length <dbl> 3, 2, 3, 2, 4, 2, 3, 4, 2, 3, 4, 2, 2, …
## $ credit_score <dbl> 561, 504, 635, 675, 586, 532, 701, 585,…
## $ previous_loan_defaults_on_file <fct> No, Yes, No, No, No, No, No, No, No, No…
## $ loan_status <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
summary(loan) ### per-column summary: quartiles and mean for numeric columns, level counts for factors
## person_age person_gender person_education person_income
## Min. : 20.00 female:20159 Master : 6980 Min. : 8000
## 1st Qu.: 24.00 male :24841 High School:11972 1st Qu.: 47204
## Median : 26.00 Bachelor :13399 Median : 67048
## Mean : 27.76 Associate :12028 Mean : 80319
## 3rd Qu.: 30.00 Doctorate : 621 3rd Qu.: 95789
## Max. :144.00 Max. :7200766
## person_emp_exp person_home_ownership loan_amnt
## Min. : 0.00 RENT :23443 Min. : 500
## 1st Qu.: 1.00 OWN : 2951 1st Qu.: 5000
## Median : 4.00 MORTGAGE:18489 Median : 8000
## Mean : 5.41 OTHER : 117 Mean : 9583
## 3rd Qu.: 8.00 3rd Qu.:12237
## Max. :125.00 Max. :35000
## loan_intent loan_int_rate loan_percent_income
## PERSONAL :7552 Min. : 5.42 Min. :0.0000
## EDUCATION :9153 1st Qu.: 8.59 1st Qu.:0.0700
## MEDICAL :8548 Median :11.01 Median :0.1200
## VENTURE :7819 Mean :11.01 Mean :0.1397
## HOMEIMPROVEMENT :4783 3rd Qu.:12.99 3rd Qu.:0.1900
## DEBTCONSOLIDATION:7145 Max. :20.00 Max. :0.6600
## cb_person_cred_hist_length credit_score previous_loan_defaults_on_file
## Min. : 2.000 Min. :390.0 No :22142
## 1st Qu.: 3.000 1st Qu.:601.0 Yes:22858
## Median : 4.000 Median :640.0
## Mean : 5.867 Mean :632.6
## 3rd Qu.: 8.000 3rd Qu.:670.0
## Max. :30.000 Max. :850.0
## loan_status
## 1:10000
## 0:35000
##
##
##
##
### summary restricted to the numeric columns
loan %>%
  select(where(is.numeric)) %>%
  summary()
## person_age person_income person_emp_exp loan_amnt
## Min. : 20.00 Min. : 8000 Min. : 0.00 Min. : 500
## 1st Qu.: 24.00 1st Qu.: 47204 1st Qu.: 1.00 1st Qu.: 5000
## Median : 26.00 Median : 67048 Median : 4.00 Median : 8000
## Mean : 27.76 Mean : 80319 Mean : 5.41 Mean : 9583
## 3rd Qu.: 30.00 3rd Qu.: 95789 3rd Qu.: 8.00 3rd Qu.:12237
## Max. :144.00 Max. :7200766 Max. :125.00 Max. :35000
## loan_int_rate loan_percent_income cb_person_cred_hist_length credit_score
## Min. : 5.42 Min. :0.0000 Min. : 2.000 Min. :390.0
## 1st Qu.: 8.59 1st Qu.:0.0700 1st Qu.: 3.000 1st Qu.:601.0
## Median :11.01 Median :0.1200 Median : 4.000 Median :640.0
## Mean :11.01 Mean :0.1397 Mean : 5.867 Mean :632.6
## 3rd Qu.:12.99 3rd Qu.:0.1900 3rd Qu.: 8.000 3rd Qu.:670.0
## Max. :20.00 Max. :0.6600 Max. :30.000 Max. :850.0
### summary restricted to the factor columns (counts per level)
loan %>%
  select(where(is.factor)) %>%
  summary()
## person_gender person_education person_home_ownership
## female:20159 Master : 6980 RENT :23443
## male :24841 High School:11972 OWN : 2951
## Bachelor :13399 MORTGAGE:18489
## Associate :12028 OTHER : 117
## Doctorate : 621
##
## loan_intent previous_loan_defaults_on_file loan_status
## PERSONAL :7552 No :22142 1:10000
## EDUCATION :9153 Yes:22858 0:35000
## MEDICAL :8548
## VENTURE :7819
## HOMEIMPROVEMENT :4783
## DEBTCONSOLIDATION:7145
### count the missing values in each column
### vapply() pins the result to exactly one integer per column, whereas the
### return type of sapply() depends on its input
vapply(loan, function(x) sum(is.na(x)), integer(1))
## person_age person_gender
## 0 0
## person_education person_income
## 0 0
## person_emp_exp person_home_ownership
## 0 0
## loan_amnt loan_intent
## 0 0
## loan_int_rate loan_percent_income
## 0 0
## cb_person_cred_hist_length credit_score
## 0 0
## previous_loan_defaults_on_file loan_status
## 0 0
sum(is.na(loan)) ## total number of missing cells in the whole table
## [1] 0
### visualization: overlaid age histograms, one per education level
loan %>%
  ggplot(aes(x = person_age, fill = person_education, colour = person_education)) +
  geom_histogram(position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

This shows that the age distribution is skewed to the right: most applicants are young, with a long tail of older ages.
### one histogram per numeric column
loan %>%
  select(where(is.numeric)) %>%
  hist.data.frame()

Bar chart of person income by home ownership, grouped by
gender
### Bar chart: mean income per home-ownership category, split by gender.
### With stat = "identity" on the raw rows, one bar per applicant is drawn and
### they are all overplotted, so only the largest income in each group is
### visible; summarising to the group mean gives one meaningful bar per group.
loan %>%
  ggplot(aes(person_home_ownership, person_income, fill = person_gender)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge")

Plot of person age vs person income
### Age vs income with a smoothed trend per home-ownership category.
### The default point shape of geom_point() has no fill, so the grouping is
### mapped to colour to make it visible on the points as well.
loan %>%
  ggplot(aes(person_age, person_income, colour = person_home_ownership)) +
  geom_point() +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

### smoothed income trend over age, one curve per gender
ggplot(loan, aes(x = person_age, y = person_income, colour = person_gender)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Frequency polygon of person age by person education
### age counts in 10-year bins, one line per education level
ggplot(loan, aes(x = person_age, colour = person_education)) +
  geom_freqpoly(binwidth = 10)

Replace implausible values with NA: ages greater than 100 years, person income greater than
7200764, and employment experience greater than 124 years
### Data cleaning: blank out implausible values (set them to NA, keep the rows)
loan <- loan %>%
  mutate(
    person_age = replace(person_age, person_age > 100, NA),
    person_income = replace(person_income, person_income > 7200764, NA),
    person_emp_exp = replace(person_emp_exp, person_emp_exp > 124, NA)
  )
### re-check the ranges and the number of NAs introduced
summary(loan)
## person_age person_gender person_education person_income
## Min. :20.00 female:20159 Master : 6980 Min. : 8000
## 1st Qu.:24.00 male :24841 High School:11972 1st Qu.: 47202
## Median :26.00 Bachelor :13399 Median : 67048
## Mean :27.75 Associate :12028 Mean : 80161
## 3rd Qu.:30.00 Doctorate : 621 3rd Qu.: 95786
## Max. :94.00 Max. :5556399
## NA's :7 NA's :1
## person_emp_exp person_home_ownership loan_amnt
## Min. : 0.000 RENT :23443 Min. : 500
## 1st Qu.: 1.000 OWN : 2951 1st Qu.: 5000
## Median : 4.000 MORTGAGE:18489 Median : 8000
## Mean : 5.408 OTHER : 117 Mean : 9583
## 3rd Qu.: 8.000 3rd Qu.:12237
## Max. :124.000 Max. :35000
## NA's :1
## loan_intent loan_int_rate loan_percent_income
## PERSONAL :7552 Min. : 5.42 Min. :0.0000
## EDUCATION :9153 1st Qu.: 8.59 1st Qu.:0.0700
## MEDICAL :8548 Median :11.01 Median :0.1200
## VENTURE :7819 Mean :11.01 Mean :0.1397
## HOMEIMPROVEMENT :4783 3rd Qu.:12.99 3rd Qu.:0.1900
## DEBTCONSOLIDATION:7145 Max. :20.00 Max. :0.6600
##
## cb_person_cred_hist_length credit_score previous_loan_defaults_on_file
## Min. : 2.000 Min. :390.0 No :22142
## 1st Qu.: 3.000 1st Qu.:601.0 Yes:22858
## Median : 4.000 Median :640.0
## Mean : 5.867 Mean :632.6
## 3rd Qu.: 8.000 3rd Qu.:670.0
## Max. :30.000 Max. :850.0
##
## loan_status
## 1:10000
## 0:35000
##
##
##
##
##
Data wrangling
### mean income per education level (NAs ignored), highest first
loan %>%
  group_by(person_education) %>%
  summarise(Mean = mean(person_income, na.rm = TRUE)) %>%
  arrange(desc(Mean))
## # A tibble: 5 × 2
## person_education Mean
## <fct> <dbl>
## 1 Doctorate 87235.
## 2 Master 80492.
## 3 High School 80225.
## 4 Associate 80050.
## 5 Bachelor 79703.
### bar graph showing the number of applicants per education level
loan %>%
  count(person_education) %>%
  ggplot(aes(x = person_education, y = n, fill = person_education)) +
  geom_col()

### bar chart for loan intent, bars ordered by decreasing count
loan %>%
  count(loan_intent) %>%
  ggplot(aes(x = reorder(loan_intent, -n), y = n, fill = loan_intent)) +
  geom_col()

### loan intent counts split by gender, bars side by side
loan %>%
  count(loan_intent, person_gender) %>%
  ggplot(aes(x = reorder(loan_intent, -n), y = n, fill = person_gender)) +
  geom_col(position = "dodge")

table(loan$loan_intent, loan$person_gender) ## cross-tabulation: loan intent (rows) by gender (columns)
##
## female male
## PERSONAL 3394 4158
## EDUCATION 4079 5074
## MEDICAL 3885 4663
## VENTURE 3490 4329
## HOMEIMPROVEMENT 2140 2643
## DEBTCONSOLIDATION 3171 3974
### Income vs age, coloured by education level.
### outlier.shape is a geom_boxplot() argument; geom_point() does not know it
### and only warns about an unknown parameter, so it is not passed here.
loan %>%
  ggplot(aes(person_income, person_age)) +
  geom_point(aes(colour = person_education), alpha = 0.2)

### income per education level on a log10 axis; the boxplot's own outlier
### markers are hidden and the jittered raw points are drawn on top
ggplot(loan, aes(x = person_education, y = person_income)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(alpha = 0.4, colour = "tomato") +
  scale_y_log10()

### six highest incomes among the rows that still have a recorded age
loan %>%
  filter(!is.na(person_age)) %>%
  select(person_age, person_income, person_education) %>%
  arrange(desc(person_income)) %>%
  head(n = 6)
## # A tibble: 6 × 3
## person_age person_income person_education
## <dbl> <dbl> <fct>
## 1 42 2448661 High School
## 2 60 2280980 High School
## 3 63 2139143 High School
## 4 42 2012954 High School
## 5 46 1741243 Associate
## 6 44 1728974 High School
Splitting the data
set.seed(1234)
### draw 75% of the row numbers, without replacement, for training
sample_index <- sample.int(nrow(loan), size = round(0.75 * nrow(loan)))
train_data <- loan[sample_index, ]
### every row that was not drawn goes to the test set
test_data <- loan[-sample_index, ]
Checking for class imbalance
### share of each loan_status class in the full data
loan %>% select(loan_status) %>% table() %>% prop.table() %>% round(2)
## loan_status
## 1 0
## 0.22 0.78
### share of each loan_status class in the training set
train_data %>% select(loan_status) %>% table() %>% prop.table() %>% round(2)
## loan_status
## 1 0
## 0.22 0.78
### share of each loan_status class in the test set
test_data %>% select(loan_status) %>% table() %>% prop.table() %>% round(2)
## loan_status
## 1 0
## 0.23 0.77
Training the model
library(rpart)
### classification tree for loan_status using every other column as predictor
loan_mod <- rpart(formula = loan_status ~ ., data = train_data, method = "class")
Evaluating the model
library(rpart.plot)
rpart.plot(loan_mod) ## draw the fitted decision tree

Prediction on test data
### predicted class for every test row
loan_pred <- predict(loan_mod, newdata = test_data, type = "class")
### cross-tabulate the actual status (rows) against the predicted status (columns)
loan_pred_table <- table(test_data$loan_status, loan_pred)
print(loan_pred_table)
## loan_pred
## 1 0
## 1 1776 762
## 0 258 8454
### accuracy: number of test rows predicted correctly over all test rows
sum(loan_pred == test_data$loan_status) / nrow(test_data)
## [1] 0.9093333
Confusion Matrix
library(caret)
### full set of classification metrics for the tree on the test set
confusionMatrix(data = loan_pred, reference = test_data$loan_status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 1776 258
## 0 762 8454
##
## Accuracy : 0.9093
## 95% CI : (0.9039, 0.9146)
## No Information Rate : 0.7744
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7209
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.6998
## Specificity : 0.9704
## Pos Pred Value : 0.8732
## Neg Pred Value : 0.9173
## Prevalence : 0.2256
## Detection Rate : 0.1579
## Detection Prevalence : 0.1808
## Balanced Accuracy : 0.8351
##
## 'Positive' Class : 1
##
Build the model for randomForest algorithm
### drop every row that contains an NA (introduced by the data cleaning step)
### NOTE(review): this changes nrow(loan), so the split drawn below is not the
### one used for the decision tree; the two models end up being scored on
### different test sets
loan <- na.omit(loan)
### confirm that the response is a factor
is.factor(loan$loan_status)
## [1] TRUE
Splitting the data
set.seed(1234)
### draw 75% of the remaining row numbers, without replacement, for training
sample_index <- sample.int(nrow(loan), size = round(0.75 * nrow(loan)))
train_data <- loan[sample_index, ]
### every row that was not drawn goes to the test set
test_data <- loan[-sample_index, ]
Building the model
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
### grow a classification forest on all predictors: 2001 trees, 4 candidate
### variables tried at each split; importance = TRUE asks for the variable
### importance measures that importance() and varImpPlot() report further down
loan_forest <- randomForest(loan_status ~ ., data = train_data, mtry = 4, ntree = 2001, importance = TRUE)
### print the fitted model, including the OOB error and OOB confusion matrix
loan_forest
##
## Call:
## randomForest(formula = loan_status ~ ., data = train_data, mtry = 4, ntree = 2001, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2001
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 7.01%
## Confusion matrix:
## 1 0 class.error
## 1 5803 1657 0.2221180
## 0 710 25575 0.0270116
Prediction on train data
### NOTE(review): predicting on the same rows the forest was grown on gives the
### resubstitution accuracy, which is optimistic; the OOB error printed with
### the model is the fairer estimate of performance on the training data
Randomforest_pred <- predict(loan_forest, train_data)
confusionMatrix(Randomforest_pred, train_data$loan_status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 7460 0
## 0 0 26285
##
## Accuracy : 1
## 95% CI : (0.9999, 1)
## No Information Rate : 0.7789
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.2211
## Detection Rate : 0.2211
## Detection Prevalence : 0.2211
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
Prediction on test data
### score the held-out rows with the forest and compare with the true status
Randomforest_pred_test <- predict(loan_forest, newdata = test_data)
confusionMatrix(data = Randomforest_pred_test, reference = test_data$loan_status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 1969 234
## 0 571 8474
##
## Accuracy : 0.9284
## 95% CI : (0.9235, 0.9331)
## No Information Rate : 0.7742
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7852
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.7752
## Specificity : 0.9731
## Pos Pred Value : 0.8938
## Neg Pred Value : 0.9369
## Prevalence : 0.2258
## Detection Rate : 0.1751
## Detection Prevalence : 0.1959
## Balanced Accuracy : 0.8742
##
## 'Positive' Class : 1
##
Important variable
importance(loan_forest) ## per-variable importance: per-class, mean decrease in accuracy, mean decrease in Gini
## 1 0 MeanDecreaseAccuracy
## person_age 25.360441 79.430488 88.529711
## person_gender -1.575742 4.822740 2.873238
## person_education 1.282713 -2.024981 -0.852207
## person_income 115.957079 112.762517 155.711802
## person_emp_exp 19.134173 66.373238 74.146195
## person_home_ownership 375.812915 89.044169 250.827628
## loan_amnt 51.852165 65.572205 82.220317
## loan_intent 200.992054 109.707427 201.944859
## loan_int_rate 520.388295 206.627402 476.813236
## loan_percent_income 268.535783 81.374757 187.011754
## cb_person_cred_hist_length 29.623032 49.374757 64.314848
## credit_score 8.042307 138.285851 118.918321
## previous_loan_defaults_on_file 306.463117 476.570506 394.800581
## MeanDecreaseGini
## person_age 319.23829
## person_gender 67.50246
## person_education 216.09924
## person_income 1387.03909
## person_emp_exp 290.84943
## person_home_ownership 798.52541
## loan_amnt 598.62179
## loan_intent 523.32393
## loan_int_rate 1862.53986
## loan_percent_income 1946.16539
## cb_person_cred_hist_length 264.28452
## credit_score 626.04201
## previous_loan_defaults_on_file 2718.72240
varImpPlot(loan_forest) ## plot the variable importance measures of the forest

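In conclusion, the random forest algorithm is better suited to this data:
its test accuracy of 92.84% (sensitivity 77.52%) is higher than that of the
decision tree algorithm, which is 90.93% (sensitivity 69.98%). Note that the
two models were scored on slightly different test splits, because rows with
NAs were dropped and the data re-split before the forest was fitted.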