Goals of the study
This will require you to report a model to characterize the relationship between risk factors to the chance of a loan being defaulted.
The set of available predictors is not limited to the raw variables in the data set. You may engineer any factors using the data that you think will improve your model’s quality.
## loan_amnt funded_amnt funded_amnt_inv term
## Min. : 500 Min. : 500 Min. : 0 36_months:28301
## 1st Qu.: 5500 1st Qu.: 5500 1st Qu.: 5000 60_months:10670
## Median :10000 Median : 9800 Median : 9000
## Mean :11267 Mean :10991 Mean :10567
## 3rd Qu.:15000 3rd Qu.:15000 3rd Qu.:14500
## Max. :35000 Max. :35000 Max. :35000
##
## int_rate installment grade sub_grade
## Min. :0.0542 Min. : 16.08 A: 9937 B3 : 2886
## 1st Qu.:0.0925 1st Qu.: 167.42 B:11810 A4 : 2856
## Median :0.1186 Median : 280.91 C: 7889 A5 : 2706
## Mean :0.1204 Mean : 325.10 D: 5185 B5 : 2665
## 3rd Qu.:0.1461 3rd Qu.: 431.37 E: 2805 B4 : 2465
## Max. :0.2459 Max. :1305.19 F: 1033 C1 : 2086
## G: 312 (Other):23307
## emp_title emp_length home_ownership
## : 2406 10+ years: 8770 MORTGAGE:17427
## US Army : 131 < 1 year : 4404 OTHER : 96
## Bank of America : 108 2 years : 4299 OWN : 2992
## IBM : 66 3 years : 4026 RENT :18456
## AT&T : 60 4 years : 3385
## Kaiser Permanente: 56 5 years : 3239
## (Other) :36144 (Other) :10848
## annual_inc verification_status issue_d
## Min. : 4000 Not Verified :16165 Dec 2011: 2266
## 1st Qu.: 40800 Source Verified: 9992 Nov 2011: 2226
## Median : 59386 Verified :12814 Oct 2011: 2113
## Mean : 69038 Sep 2011: 2061
## 3rd Qu.: 82500 Aug 2011: 1933
## Max. :6000000 Jul 2011: 1866
## (Other) :26506
## loan_status purpose zip_code
## Charged Off: 5468 debt_consolidation:18345 100xx : 570
## Fully Paid :33503 credit_card : 5034 945xx : 540
## other : 3875 112xx : 505
## home_improvement : 2929 606xx : 498
## major_purchase : 2158 070xx : 458
## small_business : 1768 900xx : 441
## (Other) : 4862 (Other):35959
## addr_state dti delinq_2yrs earliest_cr_line
## CA : 6995 Min. : 0.00 Min. : 0.0000 Nov 1998: 366
## NY : 3721 1st Qu.: 8.25 1st Qu.: 0.0000 Oct 1999: 363
## FL : 2816 Median :13.46 Median : 0.0000 Dec 1998: 343
## TX : 2676 Mean :13.37 Mean : 0.1454 Oct 2000: 339
## NJ : 1814 3rd Qu.:18.63 3rd Qu.: 0.0000 Dec 1997: 321
## IL : 1510 Max. :29.99 Max. :11.0000 Nov 1999: 316
## (Other):19439 (Other) :36923
## inq_last_6mths open_acc pub_rec revol_bal
## Min. :0.0000 Min. : 2.000 Min. :0.00000 Min. : 0
## 1st Qu.:0.0000 1st Qu.: 6.000 1st Qu.:0.00000 1st Qu.: 3746
## Median :1.0000 Median : 9.000 Median :0.00000 Median : 8897
## Mean :0.8658 Mean : 9.302 Mean :0.05553 Mean : 13414
## 3rd Qu.:1.0000 3rd Qu.:12.000 3rd Qu.:0.00000 3rd Qu.: 17095
## Max. :8.0000 Max. :44.000 Max. :4.00000 Max. :149588
##
## revol_util total_acc total_pymnt total_pymnt_inv
## Min. :0.0000 Min. : 2.00 Min. : 33.97 Min. : 0
## 1st Qu.:0.2560 1st Qu.:14.00 1st Qu.: 5604.59 1st Qu.: 5338
## Median :0.4950 Median :21.00 Median : 9991.49 Median : 9520
## Mean :0.4898 Mean :22.17 Mean :12297.41 Mean :11836
## 3rd Qu.:0.7250 3rd Qu.:29.00 3rd Qu.:16689.24 3rd Qu.:16096
## Max. :0.9990 Max. :90.00 Max. :58886.47 Max. :58564
##
## total_rec_prncp total_rec_int total_rec_late_fee recoveries
## Min. : 0 Min. : 6.22 Min. : 0.000 Min. : 0.0
## 1st Qu.: 4727 1st Qu.: 668.45 1st Qu.: 0.000 1st Qu.: 0.0
## Median : 8000 Median : 1360.48 Median : 0.000 Median : 0.0
## Mean : 9904 Mean : 2296.40 Mean : 1.353 Mean : 95.9
## 3rd Qu.:14000 3rd Qu.: 2874.91 3rd Qu.: 0.000 3rd Qu.: 0.0
## Max. :35000 Max. :23886.47 Max. :180.200 Max. :29623.3
##
## collection_recovery_fee last_pymnt_d last_pymnt_amnt
## Min. : 0.00 Mar 2013: 1024 Min. : 0.01
## 1st Qu.: 0.00 Dec 2014: 944 1st Qu.: 220.95
## Median : 0.00 May 2013: 907 Median : 558.08
## Mean : 12.37 Feb 2013: 868 Mean : 2706.59
## 3rd Qu.: 0.00 Apr 2013: 851 3rd Qu.: 3349.28
## Max. :7002.19 Mar 2012: 843 Max. :36115.20
## (Other) :33534
## last_credit_pull_d pub_rec_bankruptcies
## Apr 2017: 9312 Min. :0.00000
## Oct 2016: 4238 1st Qu.:0.00000
## Feb 2017: 1101 Median :0.00000
## Mar 2017: 870 Mean :0.04324
## Jan 2017: 658 3rd Qu.:0.00000
## Dec 2016: 636 Max. :2.00000
## (Other) :22156
#Looking at potential explanatory variable. We can see that default rates increase from A-G
default_filtered <- loan %>%
group_by(grade,loan_status) %>%
summarise(n =n()) %>%
mutate(proportion = n *100/sum(n)) %>%
filter(loan_status == "Charged Off")
by_grade_aggregated =group_by(default_filtered,grade)
default_perc <-summarize(by_grade_aggregated,defaultperc =sum(proportion,na.rm = TRUE))
ggplot(default_perc,aes(x = grade, y= defaultperc)) +geom_bar(stat = "identity", colour="blue") +labs(title = "Loan Status by Loan Grade", x = "Loan Grade", y = "Loan Status")#From the summary above, we also see there are potential outliers in annual income, so we are cleaning out the outliers
## [1] 150000.0 150000.0 225000.0 168000.0 150000.0 155000.0
## [7] 175000.0 208000.0 276000.0 195000.0 231252.0 163000.0
## [13] 160000.0 156000.0 150500.0 235000.0 150000.0 150000.0
## [19] 235000.0 175000.0 160000.0 180000.0 200000.0 165000.0
## [25] 175000.0 165000.0 174000.0 243600.0 152000.0 170000.0
## [31] 264000.0 210000.0 200000.0 170000.0 160000.0 525000.0
## [37] 180000.0 160000.0 150000.0 150000.0 158000.0 220000.0
## [43] 280000.0 300000.0 220000.0 220000.0 180000.0 168000.0
## [49] 250000.0 365000.0 550000.0 325000.0 150000.0 200000.0
## [55] 150000.0 180000.0 200000.0 170000.0 189996.0 200000.0
## [61] 180000.0 175000.0 150000.0 171600.0 170000.0 350000.0
## [67] 165000.0 170000.0 152004.0 150000.0 200000.0 158000.0
## [73] 248000.0 160000.0 180000.0 300000.0 158000.0 225000.0
## [79] 240000.0 185000.0 159500.0 200000.0 250000.0 161000.0
## [85] 157000.0 165000.0 281000.0 180000.0 277000.0 150000.0
## [91] 220000.0 408000.0 160000.0 150000.0 162480.0 150000.0
## [97] 162000.0 146000.0 150000.0 149000.0 173156.0 150000.0
## [103] 192000.0 150000.0 191000.0 153000.0 150000.0 150000.0
## [109] 400000.0 180396.0 280000.0 172000.0 160000.0 267000.0
## [115] 150000.0 211000.0 156000.0 160000.0 180000.0 270000.0
## [121] 185000.0 180000.0 234000.0 204000.0 250000.0 179000.0
## [127] 300000.0 162000.0 190000.0 200000.0 195000.0 300000.0
## [133] 300000.0 180000.0 180000.0 290000.0 160000.0 200000.0
## [139] 150000.0 210000.0 155000.0 180000.0 272000.0 200000.0
## [145] 250000.0 150000.0 180000.0 185000.0 210000.0 150000.0
## [151] 200000.0 184000.0 200000.0 200000.0 175000.0 148112.0
## [157] 240000.0 165000.0 400000.0 270000.0 156000.0 480000.0
## [163] 210000.0 187000.0 170000.0 300000.0 180000.0 200000.0
## [169] 180000.0 192000.0 156000.0 150000.0 175000.0 228000.0
## [175] 190000.0 185000.0 150000.0 200000.0 155000.0 148080.0
## [181] 150000.0 150000.0 200000.0 150000.0 148350.0 160000.0
## [187] 175000.0 320000.0 180000.0 160000.0 175000.0 173467.0
## [193] 206000.0 165000.0 150000.0 243000.0 178000.0 180000.0
## [199] 294000.0 180000.0 185000.0 200000.0 171996.0 150000.0
## [205] 158000.0 150000.0 179000.0 240000.0 180000.0 174996.0
## [211] 185232.0 158367.0 150000.0 147000.0 175000.0 250000.0
## [217] 148616.0 151000.0 190000.0 290000.0 155142.0 192000.0
## [223] 163232.0 200000.0 175000.0 150000.0 200000.0 285000.0
## [229] 183000.0 240000.0 725000.0 245000.0 158000.0 215004.0
## [235] 150000.0 160000.0 156000.0 500000.0 300000.0 200000.0
## [241] 165000.0 150000.0 357000.0 180000.0 195800.0 160000.0
## [247] 175000.0 171500.0 150000.0 190000.0 159000.0 180000.0
## [253] 170000.0 210000.0 240000.0 170000.0 225000.0 155000.0
## [259] 146885.0 385000.0 160000.0 400000.0 270000.0 400000.0
## [265] 187000.0 215000.0 170004.0 175000.0 160000.0 174000.0
## [271] 250000.0 216000.0 360000.0 225000.0 180000.0 400000.0
## [277] 175000.0 207000.0 158000.0 200000.0 250000.0 414996.0
## [283] 300000.0 185000.0 150000.0 340000.0 160000.0 160000.0
## [289] 260000.0 250000.0 250000.0 230000.0 149400.0 156000.0
## [295] 200000.0 170000.0 200000.0 368000.0 175000.0 150000.0
## [301] 150000.0 155000.0 150000.0 240000.0 180000.0 300000.0
## [307] 165000.0 187000.0 162000.0 175000.0 200000.0 174792.0
## [313] 150000.0 250000.0 165000.0 305000.0 175000.0 150000.0
## [319] 160000.0 175000.0 192000.0 150000.0 170000.0 160000.0
## [325] 275000.0 170000.0 156000.0 291996.0 164000.0 208000.0
## [331] 174996.0 150792.0 206000.0 350000.0 180000.0 154000.0
## [337] 157500.0 180000.0 151000.0 153660.0 250000.0 150000.0
## [343] 200000.0 275000.0 190000.0 195000.0 182952.0 150000.0
## [349] 395000.0 201000.0 167000.0 200000.0 175000.0 195000.0
## [355] 1782000.0 150000.0 218000.0 200100.0 270000.0 165000.0
## [361] 200000.0 150000.0 150000.0 160000.0 199992.0 204000.0
## [367] 153936.0 165000.0 166000.0 150000.0 165000.0 154008.0
## [373] 262000.0 175000.0 150000.0 543000.0 900000.0 286000.0
## [379] 750000.0 150000.0 182004.0 180000.0 350000.0 220000.0
## [385] 220000.0 300000.0 240000.0 154800.0 220000.0 185000.0
## [391] 155004.0 160000.0 185000.0 150000.0 165000.0 305000.0
## [397] 147000.0 209004.0 160000.0 150000.0 159996.0 400000.0
## [403] 300000.0 150000.0 155000.0 170000.0 170000.0 150000.0
## [409] 160000.0 146760.0 162000.0 160000.0 180000.0 156000.0
## [415] 200000.0 155000.0 159996.0 150000.0 182500.0 180000.0
## [421] 150000.0 158000.0 170000.0 190000.0 200000.0 165000.0
## [427] 192000.0 162000.0 165000.0 150000.0 198000.0 150000.0
## [433] 150000.0 175000.0 252000.0 160000.0 178650.0 170000.0
## [439] 275000.0 175000.0 350000.0 165000.0 160000.0 160000.0
## [445] 165000.0 249996.0 300000.0 200000.0 149000.0 320000.0
## [451] 150000.0 156000.0 300000.0 185000.0 200000.0 148500.0
## [457] 150000.0 155004.0 150000.0 150000.0 215000.0 148000.0
## [463] 148000.0 150000.0 150000.0 160000.0 198204.0 200000.0
## [469] 175000.0 480000.0 150000.0 200000.0 285000.0 165500.0
## [475] 190300.0 160549.0 187000.0 170000.0 200000.0 186000.0
## [481] 150000.0 192000.0 157544.0 370008.0 250000.0 157000.0
## [487] 258000.0 225000.0 163008.0 300000.0 225000.0 315600.0
## [493] 275000.0 150000.0 200000.0 280000.0 200000.0 200000.0
## [499] 202000.0 164000.0 150000.0 200000.0 150000.0 185000.0
## [505] 157000.0 150000.0 315000.0 158000.0 150000.0 250000.0
## [511] 150000.0 215000.0 190000.0 180000.0 151000.0 240000.0
## [517] 230000.0 230000.0 350000.0 225000.0 150000.0 160000.0
## [523] 210000.0 150000.0 162500.0 193600.0 200000.0 155000.0
## [529] 152000.0 190000.0 159996.0 190000.0 150000.0 146000.0
## [535] 155000.0 180000.0 205000.0 170000.0 168000.0 159000.0
## [541] 240000.0 193086.0 254616.0 200000.0 275000.0 336000.0
## [547] 150000.0 150000.0 163000.0 150060.0 160000.0 170000.0
## [553] 170000.0 200000.0 160000.0 1250000.0 214200.0 150000.0
## [559] 175000.0 175000.0 240000.0 150000.0 165000.0 160680.0
## [565] 199000.0 153600.0 165000.0 225000.0 160000.0 150000.0
## [571] 155000.0 195000.0 150000.0 175000.0 175000.0 153000.0
## [577] 160000.0 250000.0 200000.0 162500.0 150000.0 183000.0
## [583] 190000.0 180000.0 250000.0 195200.0 150000.0 180000.0
## [589] 147884.8 200000.0 160000.0 225000.0 152000.0 200000.0
## [595] 226000.0 175000.0 170000.0 300000.0 150000.0 184000.0
## [601] 150000.0 186600.0 372000.0 150000.0 170000.0 185000.0
## [607] 146000.0 155000.0 178548.0 158899.0 214200.0 650000.0
## [613] 150000.0 310000.0 201000.0 175400.0 150000.0 160000.0
## [619] 200000.0 340000.0 300000.0 205000.0 180000.0 200000.0
## [625] 340000.0 260000.0 300000.0 165996.0 210000.0 157124.0
## [631] 159960.0 192000.0 250000.0 290000.0 180000.0 172000.0
## [637] 160000.0 230000.0 165000.0 180000.0 199680.0 182004.0
## [643] 325000.0 150000.0 220000.0 160000.0 175000.0 150000.0
## [649] 150000.0 150000.0 185000.0 150000.0 150000.0 750000.0
## [655] 399996.0 150000.0 154000.0 175000.0 156000.0 215000.0
## [661] 160000.0 153000.0 250000.0 150750.0 157000.0 225000.0
## [667] 150000.0 180000.0 150000.0 157124.0 185000.0 180000.0
## [673] 300000.0 215662.0 200000.0 300000.0 381450.0 155000.0
## [679] 370000.0 180000.0 350000.0 230000.0 150000.0 200000.0
## [685] 180000.0 150000.0 177632.0 160800.0 225000.0 159996.0
## [691] 320000.0 200000.0 167000.0 155000.0 170000.0 190000.0
## [697] 487000.0 200000.0 196000.0 324996.0 160000.0 173000.0
## [703] 230004.0 175000.0 162500.0 175000.0 170000.0 170000.0
## [709] 250000.0 165000.0 175000.0 165000.0 255000.0 150000.0
## [715] 225000.0 195700.0 145986.6 200000.0 200000.0 159996.0
## [721] 160000.0 207996.0 201000.0 160000.0 300000.0 180000.0
## [727] 170000.0 150000.0 163400.0 150000.0 159996.0 230000.0
## [733] 210000.0 220000.0 205000.0 648000.0 168000.0 230000.0
## [739] 160000.0 160000.0 390000.0 200000.0 156996.0 160000.0
## [745] 225000.0 150075.0 250000.0 174708.0 156000.0 146000.0
## [751] 183000.0 225000.0 252361.0 158000.0 225000.0 175000.0
## [757] 150000.0 200004.0 300000.0 250000.0 249996.0 183000.0
## [763] 350000.0 180000.0 470000.0 191508.0 240000.0 156000.0
## [769] 146990.0 350000.0 210000.0 285000.0 300000.0 150000.0
## [775] 179004.0 180000.0 220000.0 204000.0 192000.0 168000.0
## [781] 147750.0 230000.0 156000.0 204996.0 150000.0 160000.0
## [787] 250000.0 190000.0 190000.0 165000.0 185000.0 280000.0
## [793] 182004.0 150000.0 158000.0 274992.0 200000.0 195054.8
## [799] 180000.0 168000.0 250000.0 204000.0 275000.0 300000.0
## [805] 170004.0 360000.0 201000.0 160000.0 249996.0 160000.0
## [811] 178000.0 180000.0 192000.0 150000.0 222000.0 150000.0
## [817] 249996.0 150600.0 220000.0 195000.0 150000.0 156600.0
## [823] 210000.0 165000.0 160000.0 175436.0 180000.0 170000.0
## [829] 270000.0 180000.0 150000.0 242400.0 180000.0 180000.0
## [835] 208000.0 170000.0 194808.0 185000.0 163500.0 170000.0
## [841] 410000.0 180000.0 150000.0 305000.0 155004.0 210000.0
## [847] 325000.0 170000.0 165000.0 302000.0 175000.0 175000.0
## [853] 150000.0 195000.0 193000.0 170000.0 360000.0 168300.0
## [859] 214992.0 150000.0 160000.0 175000.0 180000.0 170004.0
## [865] 175000.0 200000.0 225000.0 195000.0 154000.0 275000.0
## [871] 150000.0 149400.0 200000.0 268000.0 171312.0 170004.0
## [877] 174996.0 170000.0 193000.0 204000.0 162000.0 172000.0
## [883] 172008.0 183456.0 281988.0 450000.0 150000.0 185000.0
## [889] 191000.0 204000.0 165000.0 240000.0 148000.0 248004.0
## [895] 205000.0 180000.0 180000.0 185000.0 300000.0 150000.0
## [901] 166361.0 200000.0 150000.0 150000.0 195000.0 165000.0
## [907] 175000.0 300000.0 150000.0 185000.0 170000.0 160000.0
## [913] 185000.0 160000.0 300000.0 225000.0 275000.0 155000.0
## [919] 150000.0 160000.0 153000.0 154000.0 275000.0 250000.0
## [925] 175000.0 240000.0 190000.0 400000.0 146775.0 185000.0
## [931] 185000.0 175000.0 145596.0 146000.0 148000.0 260000.0
## [937] 166300.0 160000.0 160000.0 220000.0 275000.0 300000.0
## [943] 225000.0 216000.0 185000.0 177160.0 150000.0 250000.0
## [949] 147000.0 250000.0 219600.0 176004.0 200000.0 156000.0
## [955] 180000.0 175000.0 300000.0 175000.0 160000.0 150000.0
## [961] 155000.0 165000.0 150000.0 190000.0 163556.2 198172.0
## [967] 170000.0 160000.0 180000.0 154000.0 205000.0 336000.0
## [973] 275004.0 216000.0 234000.0 157656.0 168000.0 150000.0
## [979] 195000.0 170000.0 185000.0 200000.0 154000.0 151200.0
## [985] 210000.0 201000.0 150000.0 150000.0 158000.0 170000.0
## [991] 300000.0 155000.0 260000.0 150000.0 552000.0 360000.0
## [997] 180000.0 250000.0 180000.0 154000.0 150000.0 300000.0
## [1003] 180000.0 168000.0 200000.0 290000.0 180000.0 200000.0
## [1009] 150000.0 170000.0 150000.0 174996.0 232500.0 430000.0
## [1015] 150000.0 172000.0 334000.0 150000.0 165000.0 168000.0
## [1021] 480000.0 160000.0 170000.0 180000.0 225000.0 150000.0
## [1027] 150000.0 250000.0 180000.0 154000.0 150000.0 155004.0
## [1033] 330000.0 165000.0 165000.0 500000.0 151000.0 300000.0
## [1039] 200000.0 155000.0 170000.0 165000.0 150000.0 160000.0
## [1045] 234000.0 190000.0 146000.0 201400.0 200000.0 150000.0
## [1051] 780000.0 150000.0 150000.0 175000.0 168000.0 500000.0
## [1057] 948000.0 165000.0 156000.0 1200000.0 180000.0 170000.0
## [1063] 200000.0 325000.0 170000.0 200004.0 300000.0 200000.0
## [1069] 190000.0 185000.0 160000.0 165000.0 195000.0 181408.5
## [1075] 200000.0 408000.0 189000.0 200000.0 160000.0 175000.0
## [1081] 290000.0 156000.0 160000.0 285000.0 180000.0 172000.0
## [1087] 250000.0 154000.0 150000.0 176600.0 150000.0 304800.0
## [1093] 147000.0 192000.0 192000.0 167000.0 350000.0 177000.0
## [1099] 174996.0 250000.0 160000.0 445000.0 150000.0 160000.0
## [1105] 300000.0 170004.0 180000.0 300000.0 200000.0 210000.0
## [1111] 150000.0 170000.0 266000.0 325000.0 150000.0 828000.0
## [1117] 151200.0 150000.0 150000.0 150000.0 179434.0 330000.0
## [1123] 151000.0 168000.0 149981.0 202040.0 219996.0 155000.0
## [1129] 197004.0 275000.0 150996.0 170000.0 250000.0 184992.0
## [1135] 185004.0 200000.0 210000.0 1900000.0 250000.0 158000.0
## [1141] 150000.0 150000.0 175000.0 200000.0 200000.0 150000.0
## [1147] 185000.0 146000.0 165000.0 150086.0 160000.0 175000.0
## [1153] 170000.0 267000.0 200000.0 225000.0 150000.0 210000.0
## [1159] 152004.0 1200000.0 159996.0 250000.0 171996.0 360000.0
## [1165] 175000.0 186996.0 250000.0 180000.0 190000.0 158400.0
## [1171] 150000.0 550000.0 150000.0 158000.0 180415.4 170000.0
## [1177] 590000.0 175000.0 175000.0 576000.0 150000.0 200004.0
## [1183] 152004.0 200004.0 154000.0 180000.0 667680.0 155000.0
## [1189] 756000.0 175000.0 166000.0 218000.0 159996.0 240000.0
## [1195] 185000.0 170000.0 564000.0 148000.0 152004.0 200000.0
## [1201] 648000.0 171194.0 225000.0 150000.0 329000.0 165000.0
## [1207] 889000.0 285000.0 550000.0 150000.0 175000.0 162000.0
## [1213] 180000.0 147500.0 175000.0 172200.0 185000.0 180000.0
## [1219] 230000.0 235000.0 225000.0 203000.0 177240.0 175000.0
## [1225] 250000.0 153000.0 209000.0 780000.0 159996.0 150000.0
## [1231] 150000.0 160000.0 275004.0 1200000.0 220000.0 153000.0
## [1237] 185000.0 162000.0 900000.0 233000.0 180000.0 168000.0
## [1243] 200000.0 400000.0 168000.0 220000.0 170000.0 156268.0
## [1249] 160396.0 165000.0 304000.0 190000.0 290000.0 155000.0
## [1255] 168288.0 170000.0 152004.0 180000.0 190000.0 164000.0
## [1261] 150000.0 155000.0 233000.0 160000.0 150000.0 150000.0
## [1267] 159000.0 150000.0 150000.0 780000.0 150000.0 175000.0
## [1273] 160000.0 528000.0 170000.0 200000.0 200000.0 700053.8
## [1279] 350000.0 150000.0 150000.0 160000.0 195000.0 206000.0
## [1285] 200000.0 155000.0 201000.0 210000.0 244000.0 200000.0
## [1291] 157500.0 150000.0 148716.0 150000.0 157000.0 172000.0
## [1297] 189000.0 170000.0 168000.0 150000.0 366000.0 186000.0
## [1303] 204000.0 160000.0 168000.0 156000.0 151000.0 150000.0
## [1309] 1176000.0 180000.0 170004.0 185000.0 155000.0 224000.0
## [1315] 236004.0 244000.0 205000.0 300000.0 204000.0 228000.0
## [1321] 300000.0 201996.0 170000.0 180000.0 170000.0 155000.0
## [1327] 200000.0 162000.0 175000.0 204000.0 150000.0 200000.0
## [1333] 170000.0 150000.0 300000.0 155000.0 720000.0 160000.0
## [1339] 200000.0 148000.0 160000.0 159996.0 273655.0 175000.0
## [1345] 180000.0 182000.0 150000.0 200000.0 200004.0 250000.0
## [1351] 182496.0 150000.0 300000.0 480000.0 1080000.0 156996.0
## [1357] 540000.0 155000.0 300000.0 230000.0 170004.0 200000.0
## [1363] 160000.0 225000.0 240734.0 150000.0 148000.0 200000.0
## [1369] 166000.0 150000.0 146000.0 171000.0 240000.0 230004.0
## [1375] 157000.0 182004.0 3900000.0 242004.0 351700.0 358680.0
## [1381] 235000.0 741600.0 156000.0 168000.0 234996.0 147000.0
## [1387] 150000.0 155000.0 228000.0 156000.0 840000.0 170004.0
## [1393] 150000.0 6000000.0 175000.0 170000.0 200000.0 660000.0
## [1399] 200004.0 186480.0 176000.0 275004.0 316800.0 153000.0
## [1405] 160000.0 235000.0 165000.0 306000.0 156600.0 200000.0
## [1411] 415000.0 174000.0 648000.0 215000.0 275000.0 612000.0
## [1417] 155796.0 190000.0 200000.0 175000.0 200000.0 150000.0
## [1423] 155000.0 170000.0 220000.0 163000.0 165000.0 210000.0
## [1429] 180000.0 250000.0 190000.0 334000.0 273000.0 504000.0
## [1435] 179000.0 150000.0 185000.0 228000.0 250000.0 198000.0
## [1441] 186000.0 150000.0 150000.0 250000.0 250000.0 220000.0
## [1447] 320000.0 306000.0 150000.0 165000.0 305000.0 150000.0
## [1453] 280000.0 155000.0 200000.0 284400.0 500000.0 425000.0
## [1459] 195000.0 200000.0 225000.0 148000.0 200000.0 800000.0
## [1465] 265000.0 157000.0 180000.0 230000.0 159600.0 207182.0
## [1471] 300000.0 168000.0 340000.0 160669.8 150000.0 250000.0
## [1477] 185000.0 170000.0 260000.0 150000.0 166000.0 215000.0
## [1483] 175000.0 250000.0 200000.0 160000.0 300000.0 600000.0
## [1489] 235000.0 180000.0 181000.0 216000.0 148000.0 167000.0
## [1495] 510000.0 200000.0 216000.0 420000.0 150000.0 160000.0
## [1501] 152229.4 180000.0 154000.0 150000.0 200000.0 150000.0
## [1507] 160000.0 150000.0 180000.0 300000.0 170000.0 200000.0
## [1513] 204000.0 150000.0 287000.0 207000.0 225000.0 165000.0
## [1519] 150000.0 240000.0 150000.0 147000.0 155000.0 325000.0
## [1525] 160000.0 157000.0 384000.0 201000.0 240000.0 240000.0
## [1531] 234000.0 191000.0 600000.0 156000.0 160000.0 636000.0
## [1537] 307000.0 160000.0 200000.0 157500.0 156000.0 160000.0
## [1543] 150000.0 184000.0 150000.0 241875.0 260000.0 294000.0
## [1549] 150000.0 170000.0 190000.0 210000.0 150000.0 150000.0
## [1555] 160000.0 150000.0 180000.0 200000.0 280000.0 150000.0
## [1561] 180000.0 160000.0 150000.0 150000.0 200000.0 189996.0
## [1567] 200000.0 165000.0 200000.0 170000.0 1200000.0 180952.0
## [1573] 260000.0 780000.0 170000.0 169000.0 160000.0 150000.0
## [1579] 225000.0 168000.0 762000.0 708000.0 156000.0 190000.0
## [1585] 160000.0 155000.0 175000.0 189000.0 156000.0 154000.0
## [1591] 150000.0 371000.0 1440000.0 200000.0 250000.0 150000.0
## [1597] 200000.0 160000.0 313000.0 168750.0 200000.0 180488.1
## [1603] 900000.0 160000.0 200000.0 900000.0 1362000.0 204000.0
## [1609] 148000.0 175500.0 348000.0 200000.0 190000.0 200000.0
## [1615] 146000.0 235000.0 175000.0 150000.0 149385.0 720000.0
## [1621] 171200.0 480000.0 235800.0 360000.0 200000.0 780000.0
## [1627] 180000.0 700000.0 153000.0 210000.0 200000.0 182496.0
## [1633] 170000.0 150000.0 200000.0 150000.0 165000.0 160000.0
## [1639] 300000.0 210000.0 154000.0 150000.0 172500.0 186000.0
## [1645] 160000.0 200004.0 277104.0 178008.0 225000.0 200000.0
## [1651] 225000.0 162000.0 174000.0 172800.0 225000.0 158000.0
## [1657] 150000.0 192000.0 249996.0 233000.0 157200.0 180000.0
## [1663] 171996.0 150000.0 200000.0 180000.0 322400.0 180000.0
## [1669] 250000.0 186462.0 188004.0 160000.0 190000.0 245000.0
## [1675] 175000.0 165000.0 176608.0 150000.0 163000.0 154008.0
## [1681] 153000.0 199000.0 285000.0 450000.0 250000.0 145992.0
## [1687] 147996.0 237000.0 225000.0 230000.0 265000.0 220836.0
## [1693] 300000.0 168600.0 300000.0 150000.0 300000.0 189996.0
## [1699] 235000.0 167300.0 220000.0 147000.0 175000.0 300000.0
## [1705] 180000.0 600000.0 151016.1 189000.0 160000.0 700000.0
## [1711] 230000.0 240000.0 176000.0 275000.0 150000.0 155800.0
## [1717] 175000.0 260735.0 165000.0 190000.0 270000.0 174000.0
## [1723] 173088.0 180000.0 202000.0 200004.0 165996.0 300000.0
## [1729] 180000.0 234600.0 210000.0 265000.0 259000.0 160000.0
## [1735] 150000.0 150000.0 210000.0 184000.0 150000.0 160000.0
## [1741] 189996.0 204000.0 150000.0 155867.0 156000.0 249000.0
## [1747] 190000.0 227500.0 156326.0 150000.0 315000.0 159996.0
## [1753] 390000.0 150000.0 175000.0 165000.0 233004.0 150000.0
## [1759] 150000.0 182500.0 150000.0 175000.0 180000.0 178364.0
## [1765] 163900.0 150000.0 175000.0 150000.0 150000.0 150000.0
## [1771] 244000.0 180000.0 150000.0 151800.0 150000.0 250000.0
## [1777] 150000.0 160000.0 203500.0 160000.0 240000.0 350000.0
## [1783] 160000.0 160000.0 160000.0 145900.0 225000.0 230000.0
## [1789] 175000.0 175000.0 160000.0 155000.0 200000.0 155004.0
## [1795] 200000.0 160000.0 500000.0 200000.0 185000.0 160000.0
## [1801] 148000.0 180000.0 200000.0 225000.0 162000.0 150000.0
## [1807] 150000.0 205000.0 151000.0 175000.0 150000.0
outliers <- boxplot(loan$annual_inc, plot=FALSE)$out
loan <- loan[-which(loan$annual_inc %in% outliers),]
boxplot((loan$annual_inc))#Defining the response variable y
loan$loan_status <- factor(loan$loan_status, levels = c("Fully Paid", "Charged Off"), ordered = TRUE)
levels(loan$loan_status)## [1] "Fully Paid" "Charged Off"
#In order to objectively predict the factors that determine the success of the loans, we need to remove post-loan data since these data were not available when making the initial funding decisions. The only thing from post-loan data that we kept was the ‘issue_d’, because it is close enough to borrower’s application date (which we know prior to approving the loan), and we can determine borrowers’ credit history length, which might be an important factor. We are also removing employment title since it does not vary at the row level. We are also removing zip code since there are 810 levels currently in the dataset
loan.pre<- loan %>%
select(-funded_amnt, -funded_amnt_inv, -total_pymnt, -total_pymnt_inv,-total_rec_prncp, -total_rec_int, -total_rec_late_fee, -recoveries, -collection_recovery_fee, -last_pymnt_d, -last_pymnt_amnt, -last_credit_pull_d, -emp_title, -zip_code)#Further data clean-up. We want to clean the date to determine the length of the borrowers’ credit history in years.
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
loan.pre$issue_d <- as.Date(as.yearmon(loan.pre$issue_d))
loan.pre$earliest_cr_line <- as.Date(as.yearmon(loan.pre$earliest_cr_line))
loan.pre <- mutate (loan.pre, credit_hist_length = ((issue_d - earliest_cr_line)/365))
loan.pre$credit_hist_length= as.numeric(loan.pre$credit_hist_length)#Checking if any is data is highly correlated. We see that the loan_amnt and installment is highly correlated (0.929) because installment is just loan_amnt divided by the loan term of 36 months and 60 months. We also see that pub_rec_bankrtupcies and pub_rec are highly correlated (0.845). We are removing pub_rec_bankruptcies since these incidents are already included in pub_rec. The reason we removed it is to avoid misleading coefficient estimates and p-values.
## loan_amnt int_rate installment annual_inc
## loan_amnt 1.000000000 0.301160392 0.928947720 0.405183468
## int_rate 0.301160392 1.000000000 0.276607516 0.061166762
## installment 0.928947720 0.276607516 1.000000000 0.401519697
## annual_inc 0.405183468 0.061166762 0.401519697 1.000000000
## dti 0.093848124 0.109299173 0.081720553 -0.076518906
## delinq_2yrs -0.037173700 0.155646960 -0.025898238 0.032239143
## inq_last_6mths -0.003057999 0.134761640 -0.002397067 0.028630054
## open_acc 0.166729416 -0.002210449 0.161136486 0.262899226
## pub_rec -0.046964331 0.101670073 -0.042211417 -0.013150463
## revol_bal 0.317056594 0.084155809 0.309201900 0.398184991
## revol_util 0.071658865 0.469378775 0.104133633 0.048198980
## total_acc 0.239825383 -0.061254749 0.214332241 0.384705391
## pub_rec_bankruptcies -0.032064231 0.085804986 -0.028823505 -0.009375692
## credit_hist_length 0.191543780 -0.126995871 0.156692979 0.278132704
## dti delinq_2yrs inq_last_6mths open_acc
## loan_amnt 0.093848124 -0.037173700 -0.003057999 0.166729416
## int_rate 0.109299173 0.155646960 0.134761640 -0.002210449
## installment 0.081720553 -0.025898238 -0.002397067 0.161136486
## annual_inc -0.076518906 0.032239143 0.028630054 0.262899226
## dti 1.000000000 -0.032608358 0.006540076 0.300623973
## delinq_2yrs -0.032608358 1.000000000 0.007399324 0.016149123
## inq_last_6mths 0.006540076 0.007399324 1.000000000 0.091092099
## open_acc 0.300623973 0.016149123 0.091092099 1.000000000
## pub_rec -0.007059320 0.007273537 0.027491415 0.005759390
## revol_bal 0.271134452 -0.062388653 -0.028895955 0.285836828
## revol_util 0.274237781 -0.043444514 -0.067178688 -0.097223759
## total_acc 0.245060967 0.069875916 0.108496792 0.682879838
## pub_rec_bankruptcies 0.003785137 0.002748424 0.017601229 0.010327028
## credit_hist_length 0.068730426 0.062971340 -0.001535641 0.220563657
## pub_rec revol_bal revol_util total_acc
## loan_amnt -0.046964331 0.31705659 0.07165886 0.239825383
## int_rate 0.101670073 0.08415581 0.46937878 -0.061254749
## installment -0.042211417 0.30920190 0.10413363 0.214332241
## annual_inc -0.013150463 0.39818499 0.04819898 0.384705391
## dti -0.007059320 0.27113445 0.27423778 0.245060967
## delinq_2yrs 0.007273537 -0.06238865 -0.04344451 0.069875916
## inq_last_6mths 0.027491415 -0.02889596 -0.06717869 0.108496792
## open_acc 0.005759390 0.28583683 -0.09722376 0.682879838
## pub_rec 1.000000000 -0.05915584 0.06085988 -0.018701198
## revol_bal -0.059155845 1.00000000 0.31574186 0.308934894
## revol_util 0.060859884 0.31574186 1.00000000 -0.079500662
## total_acc -0.018701198 0.30893489 -0.07950066 1.000000000
## pub_rec_bankruptcies 0.847661194 -0.04497921 0.06262753 -0.006636702
## credit_hist_length 0.051136017 0.25857168 -0.04559959 0.380706517
## pub_rec_bankruptcies credit_hist_length
## loan_amnt -0.032064231 0.191543780
## int_rate 0.085804986 -0.126995871
## installment -0.028823505 0.156692979
## annual_inc -0.009375692 0.278132704
## dti 0.003785137 0.068730426
## delinq_2yrs 0.002748424 0.062971340
## inq_last_6mths 0.017601229 -0.001535641
## open_acc 0.010327028 0.220563657
## pub_rec 0.847661194 0.051136017
## revol_bal -0.044979211 0.258571677
## revol_util 0.062627525 -0.045599595
## total_acc -0.006636702 0.380706517
## pub_rec_bankruptcies 1.000000000 0.059926681
## credit_hist_length 0.059926681 1.000000000
#We are also removing earliest_cr_line and issue_d since the more relevant factor is already captured in the borrower’s credit history length. This way we are able to simplify the data since earliest credit line has 526 factor levels and issue date has 52 factor levels.
loan.pre2<- loan.pre %>%
select(-installment, -pub_rec_bankruptcies, -earliest_cr_line, -issue_d)
str(loan.pre2)## 'data.frame': 37160 obs. of 21 variables:
## $ loan_amnt : int 5000 2500 2400 10000 3000 5000 7000 3000 5600 5375 ...
## $ term : Factor w/ 2 levels "36_months","60_months": 1 2 1 1 2 1 2 1 2 2 ...
## $ int_rate : num 0.106 0.153 0.16 0.135 0.127 ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 2 3 3 3 2 1 3 5 6 2 ...
## $ sub_grade : Factor w/ 35 levels "A1","A2","A3",..: 7 14 15 11 10 4 15 21 27 10 ...
## $ emp_length : Factor w/ 12 levels "1 year","10+ years",..: 2 11 2 2 1 4 9 10 5 11 ...
## $ home_ownership : Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 4 4 4 3 4 ...
## $ annual_inc : num 24000 30000 12252 49200 80000 ...
## $ verification_status: Factor w/ 3 levels "Not Verified",..: 3 2 1 2 2 2 1 2 2 3 ...
## $ loan_status : Ord.factor w/ 2 levels "Fully Paid"<"Charged Off": 1 2 1 1 1 1 1 1 2 2 ...
## $ purpose : Factor w/ 14 levels "car","credit_card",..: 2 1 12 10 10 14 3 1 12 10 ...
## $ addr_state : Factor w/ 49 levels "AK","AL","AR",..: 4 11 15 5 36 4 27 5 5 42 ...
## $ dti : num 27.65 1 8.72 20 17.94 ...
## $ delinq_2yrs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ inq_last_6mths : int 1 5 2 1 0 3 1 2 2 0 ...
## $ open_acc : int 3 3 2 10 15 9 7 4 11 2 ...
## $ pub_rec : int 0 0 0 0 0 0 0 0 0 0 ...
## $ revol_bal : int 13648 1687 2956 5598 27783 7963 17726 8221 5210 9279 ...
## $ revol_util : num 0.837 0.094 0.985 0.21 0.539 0.283 0.856 0.875 0.326 0.365 ...
## $ total_acc : int 9 4 10 37 38 12 11 4 13 3 ...
## $ credit_hist_length : num 26.9 12.7 10.1 15.8 15.9 ...
#Logistical Regression
set.seed(20)
loan.sample <-sample(nrow(loan.pre2),nrow(loan.pre2)*0.2)
loan.reduced <- loan.pre2[loan.sample, ]
#separate train and test data, upon research, many recommendations to use 70:30 rule
train.data <-sample(nrow(loan.reduced),nrow(loan.reduced)*0.7)
loan.train <- loan.reduced[train.data, ]
loan.test <- loan.reduced[-train.data, ] set.seed(20)
X <- model.matrix(loan_status~., data=loan.train)[, -1]
Y <- loan.train$loan_status
fit.loan.lambda <- cv.glmnet(X, Y, alpha=1, nfolds = 10, family= "binomial")
plot(fit.loan.lambda)## [1] 0.01187452
## [1] 0.03010625
#using lambda.min to determine how many variables to use
coef.loan <- coef(fit.loan.lambda, s="lambda.min")
coef.loan <- coef.loan[which(coef.loan !=0),]
rownames(as.matrix(coef.loan))## [1] "(Intercept)" "term60_months" "int_rate"
## [4] "sub_gradeF5" "emp_lengthn/a" "annual_inc"
## [7] "purposesmall_business" "inq_last_6mths"
#We are then refitting this logistic regression based on the result.
fit.loan.glm <- glm(loan_status~ term + int_rate + sub_grade + emp_length + annual_inc + purpose + inq_last_6mths, data=loan.train, family=binomial (logit))
summary(fit.loan.glm) ##
## Call:
## glm(formula = loan_status ~ term + int_rate + sub_grade + emp_length +
## annual_inc + purpose + inq_last_6mths, family = binomial(logit),
## data = loan.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6291 -0.6101 -0.4692 -0.3052 2.7597
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.295e+00 6.418e-01 -5.135 2.83e-07 ***
## term60_months 4.593e-01 9.873e-02 4.652 3.29e-06 ***
## int_rate 5.922e+00 4.950e+00 1.196 0.2316
## sub_gradeA2 4.663e-02 6.408e-01 0.073 0.9420
## sub_gradeA3 3.745e-01 6.019e-01 0.622 0.5339
## sub_gradeA4 5.036e-01 5.638e-01 0.893 0.3718
## sub_gradeA5 7.075e-01 5.592e-01 1.265 0.2057
## sub_gradeB1 8.480e-01 5.899e-01 1.437 0.1506
## sub_gradeB2 1.311e+00 5.881e-01 2.230 0.0258 *
## sub_gradeB3 1.060e+00 5.910e-01 1.794 0.0729 .
## sub_gradeB4 1.374e+00 5.978e-01 2.299 0.0215 *
## sub_gradeB5 9.464e-01 6.142e-01 1.541 0.1233
## sub_gradeC1 9.754e-01 6.418e-01 1.520 0.1286
## sub_gradeC2 1.342e+00 6.534e-01 2.053 0.0400 *
## sub_gradeC3 1.023e+00 6.711e-01 1.524 0.1276
## sub_gradeC4 1.348e+00 6.857e-01 1.966 0.0493 *
## sub_gradeC5 1.304e+00 6.980e-01 1.868 0.0617 .
## sub_gradeD1 1.451e+00 7.215e-01 2.011 0.0443 *
## sub_gradeD2 1.312e+00 7.242e-01 1.811 0.0701 .
## sub_gradeD3 9.851e-01 7.427e-01 1.326 0.1847
## sub_gradeD4 1.084e+00 7.561e-01 1.433 0.1518
## sub_gradeD5 1.437e+00 7.673e-01 1.873 0.0610 .
## sub_gradeE1 1.314e+00 7.920e-01 1.659 0.0971 .
## sub_gradeE2 1.541e+00 8.041e-01 1.917 0.0553 .
## sub_gradeE3 1.091e+00 8.494e-01 1.284 0.1991
## sub_gradeE4 1.621e+00 8.491e-01 1.909 0.0562 .
## sub_gradeE5 1.375e+00 8.659e-01 1.588 0.1122
## sub_gradeF1 9.202e-01 9.099e-01 1.011 0.3118
## sub_gradeF2 7.440e-01 9.415e-01 0.790 0.4294
## sub_gradeF3 1.169e+00 9.739e-01 1.200 0.2301
## sub_gradeF4 1.247e+00 1.035e+00 1.205 0.2281
## sub_gradeF5 2.395e+00 1.067e+00 2.245 0.0247 *
## sub_gradeG1 2.465e+00 1.174e+00 2.100 0.0358 *
## sub_gradeG2 7.818e-01 1.192e+00 0.656 0.5120
## sub_gradeG3 1.912e+00 1.276e+00 1.499 0.1340
## sub_gradeG4 -1.105e+01 2.361e+02 -0.047 0.9627
## sub_gradeG5 -1.149e+01 5.354e+02 -0.021 0.9829
## emp_length10+ years -1.577e-02 1.684e-01 -0.094 0.9254
## emp_length2 years -1.207e-01 1.872e-01 -0.645 0.5190
## emp_length3 years -2.409e-01 1.972e-01 -1.222 0.2218
## emp_length4 years -1.960e-01 2.048e-01 -0.957 0.3385
## emp_length5 years -6.411e-03 1.940e-01 -0.033 0.9736
## emp_length6 years 7.500e-02 2.134e-01 0.351 0.7253
## emp_length7 years 2.523e-02 2.316e-01 0.109 0.9133
## emp_length8 years -9.977e-02 2.722e-01 -0.366 0.7140
## emp_length9 years 3.984e-02 2.611e-01 0.153 0.8787
## emp_length< 1 year 7.486e-02 1.798e-01 0.416 0.6771
## emp_lengthn/a 4.623e-01 2.646e-01 1.747 0.0807 .
## annual_inc -1.083e-05 1.628e-06 -6.652 2.88e-11 ***
## purposecredit_card -3.699e-02 2.604e-01 -0.142 0.8871
## purposedebt_consolidation 5.211e-02 2.352e-01 0.222 0.8246
## purposeeducational 2.501e-01 4.633e-01 0.540 0.5894
## purposehome_improvement -2.308e-01 2.894e-01 -0.798 0.4251
## purposehouse -2.515e-01 5.139e-01 -0.489 0.6246
## purposemajor_purchase 1.343e-02 2.917e-01 0.046 0.9633
## purposemedical 6.272e-01 3.634e-01 1.726 0.0844 .
## purposemoving 4.431e-01 3.745e-01 1.183 0.2367
## purposeother 2.017e-01 2.590e-01 0.779 0.4362
## purposerenewable_energy -7.099e-01 1.108e+00 -0.641 0.5216
## purposesmall_business 4.592e-01 2.794e-01 1.643 0.1003
## purposevacation 5.133e-01 4.139e-01 1.240 0.2149
## purposewedding -6.292e-02 3.791e-01 -0.166 0.8682
## inq_last_6mths 1.441e-01 3.702e-02 3.893 9.89e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4309.1 on 5201 degrees of freedom
## Residual deviance: 3974.3 on 5139 degrees of freedom
## AIC: 4100.3
##
## Number of Fisher Scoring iterations: 12
## Analysis of Deviance Table (Type II tests)
##
## Response: loan_status
## LR Chisq Df Pr(>Chisq)
## term 21.302 1 3.923e-06 ***
## int_rate 1.446 1 0.2291849
## sub_grade 47.496 34 0.0620067 .
## emp_length 10.243 11 0.5086771
## annual_inc 47.553 1 5.355e-12 ***
## purpose 18.399 13 0.1429525
## inq_last_6mths 14.705 1 0.0001257 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dropping sub_grade and emp_length as it looks to be the most statistically insignificant
fit.loan.final <- glm(loan_status~ term + int_rate + annual_inc + purpose + inq_last_6mths, data=loan.train, family=binomial (logit))
summary(fit.loan.final) ##
## Call:
## glm(formula = loan_status ~ term + int_rate + annual_inc + purpose +
## inq_last_6mths, family = binomial(logit), data = loan.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2166 -0.5945 -0.4681 -0.3501 2.5774
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.943e+00 2.697e-01 -10.912 < 2e-16 ***
## term60_months 4.391e-01 9.571e-02 4.588 4.48e-06 ***
## int_rate 1.179e+01 1.265e+00 9.325 < 2e-16 ***
## annual_inc -1.151e-05 1.579e-06 -7.288 3.14e-13 ***
## purposecredit_card -5.696e-02 2.574e-01 -0.221 0.8249
## purposedebt_consolidation 3.472e-02 2.323e-01 0.149 0.8812
## purposeeducational 2.520e-01 4.583e-01 0.550 0.5824
## purposehome_improvement -2.362e-01 2.860e-01 -0.826 0.4088
## purposehouse -2.299e-01 5.037e-01 -0.456 0.6481
## purposemajor_purchase -8.051e-03 2.877e-01 -0.028 0.9777
## purposemedical 6.296e-01 3.555e-01 1.771 0.0766 .
## purposemoving 4.447e-01 3.692e-01 1.205 0.2284
## purposeother 1.978e-01 2.563e-01 0.772 0.4402
## purposerenewable_energy -7.184e-01 1.098e+00 -0.654 0.5129
## purposesmall_business 4.665e-01 2.757e-01 1.692 0.0907 .
## purposevacation 4.678e-01 4.084e-01 1.145 0.2521
## purposewedding -7.384e-02 3.747e-01 -0.197 0.8438
## inq_last_6mths 1.476e-01 3.624e-02 4.072 4.66e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4309.1 on 5201 degrees of freedom
## Residual deviance: 4032.5 on 5184 degrees of freedom
## AIC: 4068.5
##
## Number of Fisher Scoring iterations: 5
## Analysis of Deviance Table (Type II tests)
##
## Response: loan_status
## LR Chisq Df Pr(>Chisq)
## term 20.720 1 5.316e-06 ***
## int_rate 89.251 1 < 2.2e-16 ***
## annual_inc 57.307 1 3.729e-14 ***
## purpose 19.766 13 0.1012
## inq_last_6mths 16.033 1 6.226e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#In the final Anova, all variables look to be significant at the 0.1 level.
fit.predict <-ifelse(fit.predict > 0.5, 1, 0)
confmatrix_loan_status <-table(loan.test$loan_status,fit.predict)
confmatrix_loan_status## fit.predict
## 0 1
## Fully Paid 1904 3
## Charged Off 322 1
## [1] 0.8542601
#Checking the accuracy of the model, 86.36% of the time, the model predicts default / non-default accurately.
## Setting levels: control = Fully Paid, case = Charged Off
## Setting direction: controls < cases
## Area under the curve: 0.6966
#Area under the curve is 0.7047
par(mfrow=c(1,2), mar=c(5,2,4,2), mgp=c(3,0.5,0))
plot(fit.loan.final, 1, pch=16) # residual plot
abline(h=0, col="blue", lwd=2)
plot(fit.loan.final, 2) # qqplot#We are also trying Random Forrest to see if it will give a better fit
#Predicting using random forest model
#Confusion Matrix
##
## fit.rf.pred Fully Paid Charged Off
## Fully Paid 1898 316
## Charged Off 9 7
## [1] 0.8542601
#86.27% of the time, the model from random forest is able to correctly predict loan status
#Drawing POC curve
fit.rf.prob <- predict (fit.rf, loan.test, type = "prob")
fit2.roc <- roc(loan.test$loan_status, fit.rf.prob[,2], plot=T) ## Setting levels: control = Fully Paid, case = Charged Off
## Setting direction: controls < cases
## Area under the curve: 0.6661
#Area under the curve is 0.6752
investor_return <- (loan$total_pymnt_inv/loan$funded_amnt_inv)-1
loan.return <- cbind (loan, investor_return)
loan.return<- loan.return[complete.cases(loan.return)]loan.return<- loan.return %>%
select(-funded_amnt, -funded_amnt_inv, -total_pymnt, -total_pymnt_inv,-total_rec_prncp, -total_rec_int, -total_rec_late_fee, -recoveries, -collection_recovery_fee, -last_pymnt_d, -last_pymnt_amnt, -last_credit_pull_d, -emp_title, -zip_code, -installment, -pub_rec_bankruptcies, -earliest_cr_line, -issue_d, -loan_status)## loan_amnt int_rate annual_inc dti
## loan_amnt 1.000000000 0.300716072 0.40503019 0.093370598
## int_rate 0.300716072 1.000000000 0.06074385 0.108372686
## annual_inc 0.405030189 0.060743854 1.00000000 -0.077413807
## dti 0.093370598 0.108372686 -0.07741381 1.000000000
## delinq_2yrs -0.037146002 0.155729802 0.03220888 -0.033082383
## inq_last_6mths -0.002399842 0.135666072 0.02911016 0.006771507
## open_acc 0.166959183 -0.002705557 0.26268127 0.300394177
## pub_rec -0.047206486 0.101789155 -0.01318882 -0.006994701
## revol_bal 0.317016809 0.083927447 0.39785153 0.271043398
## revol_util 0.071103012 0.469359716 0.04768365 0.273872747
## total_acc 0.239827010 -0.061831128 0.38435316 0.244579479
## investor_return 0.056417579 0.100103566 0.07888738 0.014714769
## delinq_2yrs inq_last_6mths open_acc pub_rec
## loan_amnt -0.037146002 -0.002399842 0.166959183 -0.047206486
## int_rate 0.155729802 0.135666072 -0.002705557 0.101789155
## annual_inc 0.032208876 0.029110162 0.262681266 -0.013188824
## dti -0.033082383 0.006771507 0.300394177 -0.006994701
## delinq_2yrs 1.000000000 0.008058183 0.015351676 0.007580796
## inq_last_6mths 0.008058183 1.000000000 0.090862056 0.027043515
## open_acc 0.015351676 0.090862056 1.000000000 0.005846617
## pub_rec 0.007580796 0.027043515 0.005846617 1.000000000
## revol_bal -0.062552968 -0.028551402 0.285903720 -0.059357832
## revol_util -0.043078206 -0.066215373 -0.097187258 0.061201117
## total_acc 0.069586006 0.108888400 0.682754129 -0.018619626
## investor_return 0.017378112 -0.050633029 0.008329904 -0.017283787
## revol_bal revol_util total_acc investor_return
## loan_amnt 0.31701681 0.07110301 0.239827010 0.056417579
## int_rate 0.08392745 0.46935972 -0.061831128 0.100103566
## annual_inc 0.39785153 0.04768365 0.384353161 0.078887379
## dti 0.27104340 0.27387275 0.244579479 0.014714769
## delinq_2yrs -0.06255297 -0.04307821 0.069586006 0.017378112
## inq_last_6mths -0.02855140 -0.06621537 0.108888400 -0.050633029
## open_acc 0.28590372 -0.09718726 0.682754129 0.008329904
## pub_rec -0.05935783 0.06120112 -0.018619626 -0.017283787
## revol_bal 1.00000000 0.31541200 0.308640967 0.020847811
## revol_util 0.31541200 1.00000000 -0.080004466 0.042807792
## total_acc 0.30864097 -0.08000447 1.000000000 -0.004620665
## investor_return 0.02084781 0.04280779 -0.004620665 1.000000000
## Classes 'data.table' and 'data.frame': 37037 obs. of 20 variables:
## $ loan_amnt : int 5000 2500 2400 10000 3000 5000 7000 3000 5600 5375 ...
## $ term : Factor w/ 2 levels "36_months","60_months": 1 2 1 1 2 1 2 1 2 2 ...
## $ int_rate : num 0.106 0.153 0.16 0.135 0.127 ...
## $ grade : Factor w/ 7 levels "A","B","C","D",..: 2 3 3 3 2 1 3 5 6 2 ...
## $ sub_grade : Factor w/ 35 levels "A1","A2","A3",..: 7 14 15 11 10 4 15 21 27 10 ...
## $ emp_length : Factor w/ 12 levels "1 year","10+ years",..: 2 11 2 2 1 4 9 10 5 11 ...
## $ home_ownership : Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 4 4 4 3 4 ...
## $ annual_inc : num 24000 30000 12252 49200 80000 ...
## $ verification_status: Factor w/ 3 levels "Not Verified",..: 3 2 1 2 2 2 1 2 2 3 ...
## $ purpose : Factor w/ 14 levels "car","credit_card",..: 2 1 12 10 10 14 3 1 12 10 ...
## $ addr_state : Factor w/ 49 levels "AK","AL","AR",..: 4 11 15 5 36 4 27 5 5 42 ...
## $ dti : num 27.65 1 8.72 20 17.94 ...
## $ delinq_2yrs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ inq_last_6mths : int 1 5 2 1 0 3 1 2 2 0 ...
## $ open_acc : int 3 3 2 10 15 9 7 4 11 2 ...
## $ pub_rec : int 0 0 0 0 0 0 0 0 0 0 ...
## $ revol_bal : int 13648 1687 2956 5598 27783 7963 17726 8221 5210 9279 ...
## $ revol_util : num 0.837 0.094 0.985 0.21 0.539 0.283 0.856 0.875 0.326 0.365 ...
## $ total_acc : int 9 4 10 37 38 12 11 4 13 3 ...
## $ investor_return : num 0.173 -0.594 0.252 0.223 0.356 ...
## - attr(*, ".internal.selfref")=<externalptr>
#Checking correlation to see if there is anything that we should remove. There was nothing that is too highly correlated.
#Logistical Regression
set.seed(9)
loan.sample2 <-sample(nrow(loan.return),nrow(loan.return)*0.1)
loan.reduced2 <- loan.return[loan.sample2, ]
#separate train and test data, upon research, many recommendations to use 70:30 rule
train.data2 <-sample(nrow(loan.reduced2),nrow(loan.reduced2)*0.7)
loan.train2 <- loan.reduced2[train.data2, ]
loan.test2 <- loan.reduced2[-train.data2, ] X2 <- model.matrix(investor_return~., data=loan.train2)[, -1]
Y2 <- loan.train2$investor_return
Y2clean<- Y2[complete.cases(Y2)]
summary (Y2clean)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.97591 0.08184 0.14602 0.09602 0.21508 0.66087
## [1] 0.007436409
## [1] 0.02270972
coef.return <- coef(fit.return.lambda, s="lambda.min")
coef.return <- coef.return[which(coef.return !=0),]
rownames(as.matrix(coef.return))## [1] "(Intercept)" "term60_months"
## [3] "int_rate" "gradeF"
## [5] "gradeG" "sub_gradeC2"
## [7] "sub_gradeC3" "sub_gradeC5"
## [9] "sub_gradeE2" "sub_gradeE5"
## [11] "sub_gradeF3" "sub_gradeG3"
## [13] "emp_length10+ years" "emp_length6 years"
## [15] "emp_length7 years" "home_ownershipOTHER"
## [17] "home_ownershipRENT" "annual_inc"
## [19] "purposehome_improvement" "purposeother"
## [21] "purposerenewable_energy" "addr_stateKS"
## [23] "addr_stateMA" "addr_stateNJ"
## [25] "addr_stateSC" "addr_stateSD"
## [27] "inq_last_6mths" "pub_rec"
## [29] "revol_bal" "total_acc"
We are then refitting this logistic regression based on the result.
fit.return.glm <- glm(investor_return~ loan_amnt+ term + int_rate+ grade + sub_grade + addr_state + inq_last_6mths, data=loan.train2)
Anova(fit.return.glm) ## Analysis of Deviance Table (Type II tests)
##
## Response: investor_return
## LR Chisq Df Pr(>Chisq)
## loan_amnt 0.593 1 0.4412675
## term 10.253 1 0.0013643 **
## int_rate 0.358 1 0.5493995
## grade 0
## sub_grade 25.039 27 0.5722910
## addr_state 44.906 42 0.3510458
## inq_last_6mths 13.256 1 0.0002717 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dropping variables, addr_state, int_rate and sub_grade that are not statistically significant
fit.return.final <- glm(investor_return~ loan_amnt+ term + grade + inq_last_6mths, data=loan.train2)
Anova(fit.return.final) ## Analysis of Deviance Table (Type II tests)
##
## Response: investor_return
## LR Chisq Df Pr(>Chisq)
## loan_amnt 0.1362 1 0.712052
## term 9.3416 1 0.002240 **
## grade 18.6202 6 0.004856 **
## inq_last_6mths 10.2606 1 0.001359 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1