# Importing the data into R
loandata = read.csv("loan_data_PD.csv")
# Set seed
set.seed(0)
# Look at structure and summary of our data
str(loandata)
## 'data.frame': 133889 obs. of 145 variables:
## $ id : chr "" "" "" "" ...
## $ member_id : logi NA NA NA NA NA NA ...
## $ loan_amnt : int 10000 35000 20000 17475 8000 14400 18000 5800 12500 3000 ...
## $ funded_amnt : int 10000 35000 20000 17475 8000 14400 18000 5800 12500 3000 ...
## $ funded_amnt_inv : num 10000 35000 20000 17475 8000 ...
## $ term : chr " 60 months" " 60 months" " 60 months" " 60 months" ...
## $ int_rate : chr " 19.53%" " 20.75%" " 9.16%" " 11.47%" ...
## $ installment : num 262 942 417 384 255 ...
## $ grade : chr "D" "E" "B" "B" ...
## $ sub_grade : chr "D5" "E2" "B2" "B5" ...
## $ emp_title : chr "lpn/charge nurse" "Coiler" "Reliability Engineer" "" ...
## $ emp_length : chr "4 years" "3 years" "1 year" "n/a" ...
## $ home_ownership : chr "OWN" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
## $ annual_inc : num 52000 85000 77000 41682 72000 ...
## $ verification_status : chr "Source Verified" "Source Verified" "Not Verified" "Verified" ...
## $ issue_d : chr "Mar-2016" "Mar-2016" "Mar-2016" "Mar-2016" ...
## $ loan_status : chr "Fully Paid" "Fully Paid" "Current" "Current" ...
## $ pymnt_plan : chr "n" "n" "n" "n" ...
## $ url : logi NA NA NA NA NA NA ...
## $ desc : chr "" "" "" "" ...
## $ purpose : chr "other" "debt_consolidation" "home_improvement" "debt_consolidation" ...
## $ title : chr "Other" "Debt consolidation" "Home improvement" "Debt consolidation" ...
## $ zip_code : chr "317xx" "144xx" "606xx" "796xx" ...
## $ addr_state : chr "GA" "NY" "IL" "TX" ...
## $ dti : num 15 25 13.8 30.1 22.6 ...
## $ delinq_2yrs : int 0 0 0 0 0 0 1 1 0 1 ...
## $ earliest_cr_line : chr "Oct-2000" "Dec-2008" "Mar-2005" "Feb-1976" ...
## $ inq_last_6mths : int 1 1 1 0 0 0 4 0 1 0 ...
## $ mths_since_last_delinq : int 44 NA NA NA NA 72 18 2 NA 9 ...
## $ mths_since_last_record : int 52 NA NA NA NA NA NA NA NA 71 ...
## $ open_acc : int 4 9 14 10 4 16 21 5 8 7 ...
## $ pub_rec : int 2 0 0 0 0 0 0 0 0 1 ...
## $ revol_bal : int 1077 10167 4204 24431 19735 37582 29047 4416 9215 2099 ...
## $ revol_util : chr "35.9%" "46.6%" "13.9%" "27.6%" ...
## $ total_acc : int 12 19 19 18 12 33 40 10 15 25 ...
## $ initial_list_status : chr "w" "w" "w" "w" ...
## $ out_prncp : num 0 0 13700 12189 3373 ...
## $ out_prncp_inv : num 0 0 13700 12189 3373 ...
## $ total_pymnt : num 11128 37226 9148 8433 5602 ...
## $ total_pymnt_inv : num 11128 37226 9148 8433 5602 ...
## $ total_rec_prncp : num 10000 35000 6300 5286 4627 ...
## $ total_rec_int : num 1128 2226 2848 3147 975 ...
## $ total_rec_late_fee : num 0 0 0 0 0 0 0 0 0 0 ...
## $ recoveries : num 0 0 0 0 0 0 0 0 0 0 ...
## $ collection_recovery_fee : num 0 0 0 0 0 0 0 0 0 0 ...
## $ last_pymnt_d : chr "Nov-2016" "Aug-2016" "Feb-2018" "Feb-2018" ...
## $ last_pymnt_amnt : num 9575 509 417 384 255 ...
## $ next_pymnt_d : chr "" "" "Mar-2018" "Mar-2018" ...
## $ last_credit_pull_d : chr "Dec-2017" "Feb-2018" "Feb-2018" "Feb-2018" ...
## $ collections_12_mths_ex_med : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mths_since_last_major_derog : int 44 NA NA NA NA NA NA 2 NA NA ...
## $ policy_code : int 1 1 1 1 1 1 1 1 1 1 ...
## $ application_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ annual_inc_joint : num NA NA NA NA NA NA NA NA NA NA ...
## $ dti_joint : num NA NA NA NA NA NA NA NA NA NA ...
## $ verification_status_joint : chr "" "" "" "" ...
## $ acc_now_delinq : int 0 0 0 0 0 0 0 1 0 0 ...
## $ tot_coll_amt : int 622 0 0 0 0 0 0 0 0 0 ...
## $ tot_cur_bal : int 15886 161643 25031 118388 59568 190166 299784 10934 175815 153995 ...
## $ open_acc_6m : int 2 0 0 0 0 0 3 0 1 0 ...
## $ open_act_il : int 1 3 8 0 3 3 1 1 3 0 ...
## $ open_il_12m : int 4 0 0 0 1 1 1 0 2 1 ...
## $ open_il_24m : int 5 3 0 0 3 1 1 0 2 2 ...
## $ mths_since_rcnt_il : int 2 14 46 37 9 12 4 73 11 7 ...
## $ total_bal_il : int 14809 73863 20827 0 39833 22387 29197 1972 30319 0 ...
## $ il_util : int 99 83 38 NA 75 47 92 147 81 NA ...
## $ open_rv_12m : int 0 0 2 0 0 0 4 1 2 0 ...
## $ open_rv_24m : int 0 1 3 1 0 0 8 2 4 0 ...
## $ max_bal_bc : int 1007 5109 650 10239 0 12211 7172 3957 7318 1065 ...
## $ all_util : int 88 71 30 28 81 66 49 85 81 13 ...
## $ total_rev_hi_lim : int 3000 21800 30200 62500 20000 43800 58950 7000 11400 15900 ...
## $ inq_fi : int 3 5 1 0 2 0 0 1 1 1 ...
## $ total_cu_tl : int 0 1 0 0 6 0 1 0 0 4 ...
## $ inq_last_12m : int 2 2 2 0 1 0 5 0 4 0 ...
## $ acc_open_past_24mths : int 5 4 3 1 3 1 9 2 7 2 ...
## $ avg_cur_bal : int 3972 17960 1788 11839 14892 11885 16654 2187 21976 30799 ...
## $ bc_open_to_buy : int 1623 4833 21491 17072 NA 3393 5072 605 3500 13801 ...
## $ bc_util : num 39.9 67.8 3.6 20.4 NA 87.4 59 87.9 85 13.2 ...
## $ chargeoff_within_12_mths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ delinq_amnt : int 0 0 0 0 0 0 0 4199 0 0 ...
## $ mo_sin_old_il_acct : int 101 87 127 140 57 145 102 88 143 141 ...
## $ mo_sin_old_rev_tl_op : int 185 65 132 481 100 193 167 170 163 144 ...
## $ mo_sin_rcnt_rev_tl_op : int 25 22 9 22 33 26 0 11 3 39 ...
## $ mo_sin_rcnt_tl : int 2 14 9 22 9 12 0 11 3 7 ...
## $ mort_acc : int 0 1 0 1 0 6 1 0 2 6 ...
## $ mths_since_recent_bc : int 25 22 10 22 NA 32 9 23 3 39 ...
## $ mths_since_recent_bc_dlq : int 49 NA NA NA NA NA NA NA NA NA ...
## $ mths_since_recent_inq : int 4 5 3 NA 9 20 0 18 3 22 ...
## $ mths_since_recent_revol_delinq : int 49 NA NA NA NA 72 18 NA NA NA ...
## $ num_accts_ever_120_pd : int 2 0 0 0 0 0 0 1 0 0 ...
## $ num_actv_bc_tl : int 2 3 2 5 0 5 9 2 3 4 ...
## $ num_actv_rev_tl : int 2 3 4 7 1 11 16 3 4 4 ...
## $ num_bc_sats : int 2 3 4 5 0 5 16 2 4 6 ...
## $ num_bc_tl : int 3 4 6 7 3 9 16 4 4 11 ...
## $ num_il_tl : int 7 9 9 4 8 7 6 3 7 8 ...
## $ num_op_rev_tl : int 3 5 6 9 1 11 16 3 4 6 ...
## $ num_rev_accts : int 5 9 10 13 4 20 33 6 6 11 ...
## $ num_rev_tl_bal_gt_0 : int 2 3 4 7 1 11 11 3 4 4 ...
## $ num_sats : int 4 9 14 10 4 16 18 4 8 7 ...
## [list output truncated]
summary(loandata)
## id member_id loan_amnt funded_amnt
## Length:133889 Mode:logical Min. : 1000 Min. : 1000
## Class :character NA's:133889 1st Qu.: 8400 1st Qu.: 8400
## Mode :character Median :14400 Median :14400
## Mean :15589 Mean :15589
## 3rd Qu.:21000 3rd Qu.:21000
## Max. :40000 Max. :40000
## NA's :2 NA's :2
## funded_amnt_inv term int_rate installment
## Min. : 775 Length:133889 Length:133889 Min. : 30.12
## 1st Qu.: 8400 Class :character Class :character 1st Qu.: 265.68
## Median :14400 Mode :character Mode :character Median : 398.52
## Mean :15582 Mean : 460.86
## 3rd Qu.:21000 3rd Qu.: 613.91
## Max. :40000 Max. :1536.95
## NA's :2 NA's :2
## grade sub_grade emp_title emp_length
## Length:133889 Length:133889 Length:133889 Length:133889
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## home_ownership annual_inc verification_status issue_d
## Length:133889 Min. : 0 Length:133889 Length:133889
## Class :character 1st Qu.: 49500 Class :character Class :character
## Mode :character Median : 68000 Mode :character Mode :character
## Mean : 80464
## 3rd Qu.: 95008
## Max. :9550000
## NA's :2
## loan_status pymnt_plan url desc
## Length:133889 Length:133889 Mode:logical Length:133889
## Class :character Class :character NA's:133889 Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## purpose title zip_code addr_state
## Length:133889 Length:133889 Length:133889 Length:133889
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## dti delinq_2yrs earliest_cr_line inq_last_6mths
## Min. : -1.00 Min. : 0.0000 Length:133889 Min. :0.0000
## 1st Qu.: 12.64 1st Qu.: 0.0000 Class :character 1st Qu.:0.0000
## Median : 18.76 Median : 0.0000 Mode :character Median :0.0000
## Mean : 19.55 Mean : 0.3435 Mean :0.5765
## 3rd Qu.: 25.77 3rd Qu.: 0.0000 3rd Qu.:1.0000
## Max. :999.00 Max. :22.0000 Max. :5.0000
## NA's :19 NA's :2 NA's :3
## mths_since_last_delinq mths_since_last_record open_acc
## Min. : 0.00 Min. : 0.00 Min. : 1.00
## 1st Qu.: 15.00 1st Qu.: 51.00 1st Qu.: 8.00
## Median : 30.00 Median : 68.00 Median :11.00
## Mean : 33.81 Mean : 66.36 Mean :12.12
## 3rd Qu.: 49.00 3rd Qu.: 82.00 3rd Qu.:15.00
## Max. :192.00 Max. :121.00 Max. :74.00
## NA's :64567 NA's :110218 NA's :2
## pub_rec revol_bal revol_util total_acc
## Min. : 0.0000 Min. : 0 Length:133889 Min. : 2.00
## 1st Qu.: 0.0000 1st Qu.: 6578 Class :character 1st Qu.: 17.00
## Median : 0.0000 Median : 12278 Mode :character Median : 24.00
## Mean : 0.2387 Mean : 18616 Mean : 25.34
## 3rd Qu.: 0.0000 3rd Qu.: 22011 3rd Qu.: 32.00
## Max. :46.0000 Max. :1023940 Max. :176.00
## NA's :2 NA's :2 NA's :2
## initial_list_status out_prncp out_prncp_inv total_pymnt
## Length:133889 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 0 1st Qu.: 0 1st Qu.: 6164
## Mode :character Median : 2158 Median : 2156 Median :10139
## Mean : 4542 Mean : 4540 Mean :12087
## 3rd Qu.: 7555 3rd Qu.: 7547 3rd Qu.:16073
## Max. :31060 Max. :31060 Max. :53854
## NA's :2 NA's :2 NA's :2
## total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.000
## 1st Qu.: 6159 1st Qu.: 4303 1st Qu.: 890.4 1st Qu.: 0.000
## Median :10134 Median : 7280 Median : 1745.8 Median : 0.000
## Mean :12081 Mean : 9528 Mean : 2449.7 Mean : 1.363
## 3rd Qu.:16065 3rd Qu.:12425 3rd Qu.: 3310.4 3rd Qu.: 0.000
## Max. :53854 Max. :40000 Max. :18111.0 Max. :609.810
## NA's :2 NA's :2 NA's :2 NA's :2
## recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt
## Min. : 0.0 Min. : 0.00 Length:133889 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.00 Class :character 1st Qu.: 318.5
## Median : 0.0 Median : 0.00 Mode :character Median : 564.3
## Mean : 108.1 Mean : 19.32 Mean : 3492.1
## 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.: 3437.6
## Max. :23271.5 Max. :4188.87 Max. :40688.6
## NA's :2 NA's :2 NA's :2
## next_pymnt_d last_credit_pull_d collections_12_mths_ex_med
## Length:133889 Length:133889 Min. : 0.00000
## Class :character Class :character 1st Qu.: 0.00000
## Mode :character Mode :character Median : 0.00000
## Mean : 0.02126
## 3rd Qu.: 0.00000
## Max. :12.00000
## NA's :2
## mths_since_last_major_derog policy_code application_type annual_inc_joint
## Min. : 0.00 Min. :1 Length:133889 Min. : 18072
## 1st Qu.: 27.00 1st Qu.:1 Class :character 1st Qu.: 77000
## Median : 45.00 Median :1 Mode :character Median :102000
## Mean : 45.22 Mean :1 Mean :108618
## 3rd Qu.: 64.00 3rd Qu.:1 3rd Qu.:131500
## Max. :197.00 Max. :1 Max. :480000
## NA's :95815 NA's :2 NA's :131228
## dti_joint verification_status_joint acc_now_delinq
## Min. : 1.11 Length:133889 Min. :0.000000
## 1st Qu.:13.46 Class :character 1st Qu.:0.000000
## Median :18.36 Mode :character Median :0.000000
## Mean :18.60 Mean :0.006969
## 3rd Qu.:23.38 3rd Qu.:0.000000
## Max. :63.66 Max. :3.000000
## NA's :131230 NA's :2
## tot_coll_amt tot_cur_bal open_acc_6m open_act_il
## Min. : 0.0 Min. : 0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 32228 1st Qu.: 0.000 1st Qu.: 1.000
## Median : 0.0 Median : 86204 Median : 1.000 Median : 2.000
## Mean : 259.2 Mean : 147240 Mean : 1.045 Mean : 2.797
## 3rd Qu.: 0.0 3rd Qu.: 217484 3rd Qu.: 2.000 3rd Qu.: 3.000
## Max. :224107.0 Max. :5445012 Max. :16.000 Max. :48.000
## NA's :2 NA's :2 NA's :64 NA's :63
## open_il_12m open_il_24m mths_since_rcnt_il total_bal_il
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Min. : 0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 6.00 1st Qu.: 9633
## Median : 0.0000 Median : 1.000 Median : 13.00 Median : 24144
## Mean : 0.7472 Mean : 1.625 Mean : 21.64 Mean : 35704
## 3rd Qu.: 1.0000 3rd Qu.: 2.000 3rd Qu.: 24.00 3rd Qu.: 46562
## Max. :20.0000 Max. :30.000 Max. :446.00 Max. :796104
## NA's :63 NA's :63 NA's :3703 NA's :63
## il_util open_rv_12m open_rv_24m max_bal_bc
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0
## 1st Qu.: 58.00 1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.: 2525
## Median : 74.00 Median : 1.000 Median : 2.00 Median : 4664
## Mean : 70.74 Mean : 1.374 Mean : 2.93 Mean : 6184
## 3rd Qu.: 87.00 3rd Qu.: 2.000 3rd Qu.: 4.00 3rd Qu.: 8020
## Max. :558.00 Max. :28.000 Max. :44.00 Max. :776843
## NA's :17881 NA's :63 NA's :63 NA's :63
## all_util total_rev_hi_lim inq_fi total_cu_tl
## Min. : 0.00 Min. : 0 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 48.00 1st Qu.: 15300 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 62.00 Median : 26400 Median : 0.0000 Median : 0.000
## Mean : 60.65 Mean : 36362 Mean : 0.9529 Mean : 1.544
## 3rd Qu.: 75.00 3rd Qu.: 45000 3rd Qu.: 1.0000 3rd Qu.: 2.000
## Max. :198.00 Max. :1184500 Max. :28.0000 Max. :79.000
## NA's :69 NA's :2 NA's :63 NA's :64
## inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy
## Min. : 0.000 Min. : 0.000 Min. : 0 Min. : 0
## 1st Qu.: 0.000 1st Qu.: 2.000 1st Qu.: 3294 1st Qu.: 1699
## Median : 2.000 Median : 4.000 Median : 7685 Median : 5284
## Mean : 2.187 Mean : 4.799 Mean : 13510 Mean : 10857
## 3rd Qu.: 3.000 3rd Qu.: 7.000 3rd Qu.: 18552 3rd Qu.: 13357
## Max. :40.000 Max. :46.000 Max. :463698 Max. :282091
## NA's :64 NA's :2 NA's :2 NA's :1113
## bc_util chargeoff_within_12_mths delinq_amnt
## Min. : 0.00 Min. :0.00000 Min. : 0.00
## 1st Qu.: 38.30 1st Qu.:0.00000 1st Qu.: 0.00
## Median : 62.40 Median :0.00000 Median : 0.00
## Mean : 59.76 Mean :0.01012 Mean : 21.34
## 3rd Qu.: 84.40 3rd Qu.:0.00000 3rd Qu.: 0.00
## Max. :189.80 Max. :7.00000 Max. :112524.00
## NA's :1177 NA's :2 NA's :2
## mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## Min. : 0.0 Min. : 5.0 Min. : 0.00 Min. : 0.0
## 1st Qu.:102.0 1st Qu.:121.0 1st Qu.: 3.00 1st Qu.: 3.0
## Median :130.0 Median :171.0 Median : 8.00 Median : 5.0
## Mean :127.6 Mean :188.7 Mean : 13.16 Mean : 7.8
## 3rd Qu.:153.0 3rd Qu.:241.0 3rd Qu.: 16.00 3rd Qu.: 10.0
## Max. :534.0 Max. :852.0 Max. :327.00 Max. :283.0
## NA's :3648 NA's :2 NA's :2 NA's :2
## mort_acc mths_since_recent_bc mths_since_recent_bc_dlq
## Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 6.00 1st Qu.: 19.00
## Median : 1.000 Median : 13.00 Median : 36.00
## Mean : 1.671 Mean : 23.85 Mean : 38.84
## 3rd Qu.: 3.000 3rd Qu.: 28.00 3rd Qu.: 58.00
## Max. :51.000 Max. :564.00 Max. :188.00
## NA's :2 NA's :1046 NA's :100133
## mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd
## Min. : 0.000 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 2.000 1st Qu.: 16.00 1st Qu.: 0.0000
## Median : 5.000 Median : 32.00 Median : 0.0000
## Mean : 6.813 Mean : 35.22 Mean : 0.5189
## 3rd Qu.:10.000 3rd Qu.: 51.00 3rd Qu.: 0.0000
## Max. :25.000 Max. :197.00 Max. :38.0000
## NA's :13800 NA's :86106 NA's :2
## num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.00 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 5.000
## Median : 3.00 Median : 5.000 Median : 4.000 Median : 7.000
## Mean : 3.81 Mean : 5.918 Mean : 4.956 Mean : 8.005
## 3rd Qu.: 5.00 3rd Qu.: 8.000 3rd Qu.: 6.000 3rd Qu.:10.000
## Max. :36.00 Max. :47.000 Max. :46.000 Max. :57.000
## NA's :2 NA's :2 NA's :2 NA's :2
## num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0
## Min. : 0.000 Min. : 0.000 Min. : 2.0 Min. : 0.000
## 1st Qu.: 4.000 1st Qu.: 5.000 1st Qu.: 9.0 1st Qu.: 3.000
## Median : 7.000 Median : 8.000 Median : 13.0 Median : 5.000
## Mean : 8.703 Mean : 8.561 Mean : 14.7 Mean : 5.858
## 3rd Qu.: 12.000 3rd Qu.:11.000 3rd Qu.: 19.0 3rd Qu.: 7.000
## Max. :121.000 Max. :72.000 Max. :101.0 Max. :44.000
## NA's :2 NA's :2 NA's :2 NA's :2
## num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m
## Min. : 1.00 Min. :0.000 Min. :0.000000 Min. : 0.00000
## 1st Qu.: 8.00 1st Qu.:0.000 1st Qu.:0.000000 1st Qu.: 0.00000
## Median :11.00 Median :0.000 Median :0.000000 Median : 0.00000
## Mean :12.06 Mean :0.001 Mean :0.004765 Mean : 0.09061
## 3rd Qu.:15.00 3rd Qu.:0.000 3rd Qu.:0.000000 3rd Qu.: 0.00000
## Max. :74.00 Max. :2.000 Max. :3.000000 Max. :22.00000
## NA's :2 NA's :7623 NA's :2 NA's :2
## num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies
## Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.: 91.10 1st Qu.: 10.00 1st Qu.:0.0000
## Median : 2.000 Median : 97.50 Median : 40.00 Median :0.0000
## Mean : 2.249 Mean : 94.06 Mean : 44.56 Mean :0.1305
## 3rd Qu.: 3.000 3rd Qu.:100.00 3rd Qu.: 75.00 3rd Qu.:0.0000
## Max. :31.000 Max. :100.00 Max. :100.00 Max. :9.0000
## NA's :2 NA's :2 NA's :1131 NA's :2
## tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit
## Min. : 0.00000 Min. : 2500 Min. : 0 Min. : 0
## 1st Qu.: 0.00000 1st Qu.: 54220 1st Qu.: 23305 1st Qu.: 8600
## Median : 0.00000 Median : 121201 Median : 41016 Median : 16700
## Mean : 0.07117 Mean : 183894 Mean : 54124 Mean : 23723
## 3rd Qu.: 0.00000 3rd Qu.: 263554 3rd Qu.: 68468 3rd Qu.: 31000
## Max. :45.00000 Max. :9999999 Max. :1034841 Max. :1105500
## NA's :2 NA's :2 NA's :2 NA's :2
## total_il_high_credit_limit revol_bal_joint sec_app_earliest_cr_line
## Min. : 0 Mode:logical Mode:logical
## 1st Qu.: 16585 NA's:133889 NA's:133889
## Median : 34622
## Mean : 45634
## 3rd Qu.: 61258
## Max. :1183946
## NA's :2
## sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util
## Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:133889 NA's:133889 NA's:133889 NA's:133889
##
##
##
##
##
## sec_app_open_act_il sec_app_num_rev_accts sec_app_chargeoff_within_12_mths
## Mode:logical Mode:logical Mode:logical
## NA's:133889 NA's:133889 NA's:133889
##
##
##
##
##
## sec_app_collections_12_mths_ex_med sec_app_mths_since_last_major_derog
## Mode:logical Mode:logical
## NA's:133889 NA's:133889
##
##
##
##
##
## hardship_flag hardship_type hardship_reason hardship_status
## Length:133889 Length:133889 Length:133889 Length:133889
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## deferral_term hardship_amount hardship_start_date hardship_end_date
## Min. :3 Min. : 7.64 Length:133889 Length:133889
## 1st Qu.:3 1st Qu.: 65.27 Class :character Class :character
## Median :3 Median :121.74 Mode :character Mode :character
## Mean :3 Mean :152.71
## 3rd Qu.:3 3rd Qu.:211.15
## Max. :3 Max. :650.09
## NA's :132968 NA's :132968
## payment_plan_start_date hardship_length hardship_dpd hardship_loan_status
## Length:133889 Min. :3 Min. : 0.00 Length:133889
## Class :character 1st Qu.:3 1st Qu.: 0.00 Class :character
## Mode :character Median :3 Median :13.00 Mode :character
## Mean :3 Mean :12.93
## 3rd Qu.:3 3rd Qu.:22.00
## Max. :3 Max. :32.00
## NA's :132968 NA's :132968
## orig_projected_additional_accrued_interest hardship_payoff_balance_amount
## Min. : 22.92 Min. : 628
## 1st Qu.: 183.38 1st Qu.: 6499
## Median : 347.55 Median :10764
## Mean : 442.08 Mean :11798
## 3rd Qu.: 602.55 3rd Qu.:16006
## Max. :1950.27 Max. :32348
## NA's :133159 NA's :132968
## hardship_last_payment_amount disbursement_method debt_settlement_flag
## Min. : 0.01 Length:133889 Length:133889
## 1st Qu.: 35.50 Class :character Class :character
## Median : 120.15 Mode :character Mode :character
## Mean : 176.68
## 3rd Qu.: 263.63
## Max. :1407.86
## NA's :132968
## debt_settlement_flag_date settlement_status settlement_date
## Length:133889 Length:133889 Length:133889
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## settlement_amount settlement_percentage settlement_term
## Min. : 329.2 Min. :20.00 Min. : 0.00
## 1st Qu.: 3127.3 1st Qu.:45.00 1st Qu.:11.00
## Median : 5464.5 Median :50.00 Median :16.00
## Mean : 6250.6 Mean :49.43 Mean :14.64
## 3rd Qu.: 8346.8 3rd Qu.:50.01 3rd Qu.:18.00
## Max. :23895.9 Max. :90.00 Max. :65.00
## NA's :132209 NA's :132209 NA's :132209
# Looking at the unique values in our dependent variable i.e. loan_status
unique(loandata$loan_status)
## [1] "Fully Paid" "Current" "Late (31-120 days)"
## [4] "Charged Off" "Late (16-30 days)" "In Grace Period"
## [7] "Default" ""
# Removing the values we do not need for loanstatuses Current, In Grace Period, Late (16-30 days), and empty ''
loandata <- subset(loandata, loan_status != 'In Grace Period')
loandata <- subset(loandata, loan_status != 'Late (16-30 days)')
loandata <- subset(loandata, loan_status != 'Current')
loandata <- subset(loandata, loan_status != '')
# Install and load dplyr package
install.packages("dplyr", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Looking at how many loans are there for each status using group_by function
loandata %>% group_by(loan_status) %>% summarise(count=n())
## # A tibble: 4 × 2
## loan_status count
## <chr> <int>
## 1 Charged Off 15698
## 2 Default 9
## 3 Fully Paid 41833
## 4 Late (31-120 days) 2329
# Install and load stringr package
install.packages("stringr", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(stringr)
# Combining Charged off, Default, and Late(31-120 days) into single category: Default
# By using the str_detect function from stringr
# the final result would be two loan statuses : Paid and Default
loandata$loan_status = ifelse(str_detect(loandata$loan_status, "Paid"), loandata$loan_status, "Default")
# Making sure our loan statuses data is correct
loandata %>% group_by(loan_status) %>% summarise(count=n())
## # A tibble: 2 × 2
## loan_status count
## <chr> <int>
## 1 Default 18036
## 2 Fully Paid 41833
# Plot loan status
install.packages("ggplot2", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(ggplot2)
gplot <- ggplot(loandata, aes(x=loan_status, fill=loan_status))
gplot + geom_bar()

# Default rate for each type of Loan grade i.e. which are A to G in our loandata
# Use of dplyr package to find the number of defaults for each grade
grade1 = loandata %>% filter(loan_status == "Default") %>% group_by(grade) %>% summarise(default_count = n())
grade1
## # A tibble: 7 × 2
## grade default_count
## <chr> <int>
## 1 A 975
## 2 B 3487
## 3 C 5534
## 4 D 3697
## 5 E 2704
## 6 F 1260
## 7 G 379
# Calculating the default rate in each grade
grade2 = loandata %>% group_by(grade) %>% summarise(count = n())
grade3 <- grade2 %>% left_join(grade1) %>%
mutate(default_rate = default_count/count*100) %>%
select(grade, count, default_count, default_rate)
## Joining with `by = join_by(grade)`
grade3
## # A tibble: 7 × 4
## grade count default_count default_rate
## <chr> <int> <int> <dbl>
## 1 A 9956 975 9.79
## 2 B 16649 3487 20.9
## 3 C 16815 5534 32.9
## 4 D 8480 3697 43.6
## 5 E 5261 2704 51.4
## 6 F 2109 1260 59.7
## 7 G 599 379 63.3
# Plot the default rate
ggplot(grade3, aes(x=grade, y=default_rate, fill=grade)) + geom_bar(stat = "identity")

# Loan Grade vs. Interest Rate
# Firstly, converting interest rate attribute to numeric
loandata$int_rate = (as.numeric(gsub(pattern = "%", replacement = "", x = loandata$int_rate)))
# Grouping the data by grade and their mean interest rates
x1 = loandata %>% filter(loan_status == "Default") %>% group_by(grade) %>% summarise(int_rate = mean(int_rate))
x1
## # A tibble: 7 × 2
## grade int_rate
## <chr> <dbl>
## 1 A 7.06
## 2 B 10.2
## 3 C 13.6
## 4 D 17.7
## 5 E 20.8
## 6 F 24.4
## 7 G 28.0
# Plot the data
ggplot(x1, aes(x=grade, y=int_rate, fill=grade)) + geom_bar(stat = "identity", position = "dodge")

# Training and Test datasets
# Using 30% of data for testing and 70% for training
# Sample Indexes
indexes = sample(1:nrow(loandata), size = 0.3*nrow(loandata))
# Split data
data_test = loandata[indexes,]
data_train = loandata[-indexes,]
dim(data_test)
## [1] 17960 145
dim(data_train)
## [1] 41909 145
colnames(data_train)
## [1] "id"
## [2] "member_id"
## [3] "loan_amnt"
## [4] "funded_amnt"
## [5] "funded_amnt_inv"
## [6] "term"
## [7] "int_rate"
## [8] "installment"
## [9] "grade"
## [10] "sub_grade"
## [11] "emp_title"
## [12] "emp_length"
## [13] "home_ownership"
## [14] "annual_inc"
## [15] "verification_status"
## [16] "issue_d"
## [17] "loan_status"
## [18] "pymnt_plan"
## [19] "url"
## [20] "desc"
## [21] "purpose"
## [22] "title"
## [23] "zip_code"
## [24] "addr_state"
## [25] "dti"
## [26] "delinq_2yrs"
## [27] "earliest_cr_line"
## [28] "inq_last_6mths"
## [29] "mths_since_last_delinq"
## [30] "mths_since_last_record"
## [31] "open_acc"
## [32] "pub_rec"
## [33] "revol_bal"
## [34] "revol_util"
## [35] "total_acc"
## [36] "initial_list_status"
## [37] "out_prncp"
## [38] "out_prncp_inv"
## [39] "total_pymnt"
## [40] "total_pymnt_inv"
## [41] "total_rec_prncp"
## [42] "total_rec_int"
## [43] "total_rec_late_fee"
## [44] "recoveries"
## [45] "collection_recovery_fee"
## [46] "last_pymnt_d"
## [47] "last_pymnt_amnt"
## [48] "next_pymnt_d"
## [49] "last_credit_pull_d"
## [50] "collections_12_mths_ex_med"
## [51] "mths_since_last_major_derog"
## [52] "policy_code"
## [53] "application_type"
## [54] "annual_inc_joint"
## [55] "dti_joint"
## [56] "verification_status_joint"
## [57] "acc_now_delinq"
## [58] "tot_coll_amt"
## [59] "tot_cur_bal"
## [60] "open_acc_6m"
## [61] "open_act_il"
## [62] "open_il_12m"
## [63] "open_il_24m"
## [64] "mths_since_rcnt_il"
## [65] "total_bal_il"
## [66] "il_util"
## [67] "open_rv_12m"
## [68] "open_rv_24m"
## [69] "max_bal_bc"
## [70] "all_util"
## [71] "total_rev_hi_lim"
## [72] "inq_fi"
## [73] "total_cu_tl"
## [74] "inq_last_12m"
## [75] "acc_open_past_24mths"
## [76] "avg_cur_bal"
## [77] "bc_open_to_buy"
## [78] "bc_util"
## [79] "chargeoff_within_12_mths"
## [80] "delinq_amnt"
## [81] "mo_sin_old_il_acct"
## [82] "mo_sin_old_rev_tl_op"
## [83] "mo_sin_rcnt_rev_tl_op"
## [84] "mo_sin_rcnt_tl"
## [85] "mort_acc"
## [86] "mths_since_recent_bc"
## [87] "mths_since_recent_bc_dlq"
## [88] "mths_since_recent_inq"
## [89] "mths_since_recent_revol_delinq"
## [90] "num_accts_ever_120_pd"
## [91] "num_actv_bc_tl"
## [92] "num_actv_rev_tl"
## [93] "num_bc_sats"
## [94] "num_bc_tl"
## [95] "num_il_tl"
## [96] "num_op_rev_tl"
## [97] "num_rev_accts"
## [98] "num_rev_tl_bal_gt_0"
## [99] "num_sats"
## [100] "num_tl_120dpd_2m"
## [101] "num_tl_30dpd"
## [102] "num_tl_90g_dpd_24m"
## [103] "num_tl_op_past_12m"
## [104] "pct_tl_nvr_dlq"
## [105] "percent_bc_gt_75"
## [106] "pub_rec_bankruptcies"
## [107] "tax_liens"
## [108] "tot_hi_cred_lim"
## [109] "total_bal_ex_mort"
## [110] "total_bc_limit"
## [111] "total_il_high_credit_limit"
## [112] "revol_bal_joint"
## [113] "sec_app_earliest_cr_line"
## [114] "sec_app_inq_last_6mths"
## [115] "sec_app_mort_acc"
## [116] "sec_app_open_acc"
## [117] "sec_app_revol_util"
## [118] "sec_app_open_act_il"
## [119] "sec_app_num_rev_accts"
## [120] "sec_app_chargeoff_within_12_mths"
## [121] "sec_app_collections_12_mths_ex_med"
## [122] "sec_app_mths_since_last_major_derog"
## [123] "hardship_flag"
## [124] "hardship_type"
## [125] "hardship_reason"
## [126] "hardship_status"
## [127] "deferral_term"
## [128] "hardship_amount"
## [129] "hardship_start_date"
## [130] "hardship_end_date"
## [131] "payment_plan_start_date"
## [132] "hardship_length"
## [133] "hardship_dpd"
## [134] "hardship_loan_status"
## [135] "orig_projected_additional_accrued_interest"
## [136] "hardship_payoff_balance_amount"
## [137] "hardship_last_payment_amount"
## [138] "disbursement_method"
## [139] "debt_settlement_flag"
## [140] "debt_settlement_flag_date"
## [141] "settlement_status"
## [142] "settlement_date"
## [143] "settlement_amount"
## [144] "settlement_percentage"
## [145] "settlement_term"
# Discard attributes that are not needed
discard_column = c("collection_recovery_fee", "emp_title",
"funded_amnt_inv", "id", "installment",
"last_credit_pull_d", "last_pymnt_d",
"last_pymnt_amnt", "loan_amnt",
"member_id", "next_pymnt_d",
"num_tl_120dpd_2m", "num_tl_30dpd",
"out_prncp", "out_prncp_inv",
"recoveries", "total_pymnt",
"total_pymnt_inv", "total_rec_int",
"total_rec_late_fee", "total_rec_prncp",
"url", "zip_code"
)
data_train = (data_train[,!(names(data_train) %in% discard_column)])
dim(data_train)
## [1] 41909 122
# Discarding grade attribute as well because sub_grade attribute is present
data_train$grade = NULL
# Discard columns with too many Na's - where percentage is greater than 50%
data_train <- data_train[, -which(colMeans(is.na(data_train)) > 0.5)]
# Discarding further attributes that are not needed
discard_column = c("hardship_flag","hardship_type","hardship_reason",
"hardship_status","hardship_start_date","hardship_end_date",
"payment_plan_start_date","hardship_loan_status","disbursement_method",
"debt_settlement_flag","debt_settlement_flag_date","settlement_status",
"settlement_date"
)
data_train = (data_train[,!(names(data_train) %in% discard_column)])
# Converting revol_util attribute to numeric datatype
data_train$revol_util = (as.numeric(gsub(pattern = "%", replacement = "", x = data_train$revol_util)))
install.packages("lubridate", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Transforming earliest_cr_line attribute to the no. of days before the loan is issued
data_train$earliest_cr_line = parse_date_time(str_c("01", data_train$issue_d), "dmy") - parse_date_time(str_c("01", data_train$earliest_cr_line), "dmy")
data_train$earliest_cr_line = as.numeric(data_train$earliest_cr_line, units="days")
# Analyzing default rate by issued month
data_train$issue_m = sapply(data_train$issue_d, function(x){str_split(x,"-")[[1]][1]})
tmp = data_train %>% filter(loan_status=="Default") %>% group_by(issue_m) %>% summarise(default_count = n())
tmp1 = data_train %>% group_by(issue_m) %>% summarise(count=n())
tmp1 %>% left_join(tmp) %>% mutate(default_rate = default_count/count)
## Joining with `by = join_by(issue_m)`
## # A tibble: 3 × 4
## issue_m count default_count default_rate
## <chr> <int> <int> <dbl>
## 1 Feb 12375 3746 0.303
## 2 Jan 10776 3065 0.284
## 3 Mar 18758 5783 0.308
str(data_train$issue_m)
## chr [1:41909] "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" ...
# As seen the default rate does not vary much by the month it is issued, so we will proceed dropping the issue_d and issue_m attributes
data_train$issue_m = NULL
data_train$issue_d = NULL
# Removing tmp and tmp1
rm(tmp,tmp1)
# Attributes with Zero Variance
# In our dataset, we will look for predictors with zero variance and will proceed removing them
# Let's define some generic functions that we will use later
# Returns the Numeric columns only from a dataset
getNumericColumns <- function(t) {
tn = sapply(t, function(x){is.numeric(x)})
return(names(tn)[which(tn)])
}
# Returns the character columns only from a dataset
getCharColumns <- function(t) {
tn = sapply(t, function(x){is.character(x)})
return(names(tn)[which(tn)])
}
# Returns the factor columns only in a datset
getFactorColumns <- function(t) {
tn = sapply(t, function(x){is.factor(x)})
return(names(tn)[which(tn)])
}
# Returns index of columns along with column names
getIndexofColumns <- function(t, column_names) {
return(match(column_names, colnames(t)))
}
# Find character columns having same value and numeric columns having zero variance
tmp = apply(data_train[getCharColumns(data_train)],2,function(x){length(unique(x))})
tmp = tmp[tmp==1]
tmp2 = apply(data_train[getNumericColumns(data_train)],2,function(x){(sd(x))})
tmp2 = tmp2[tmp2==0]
discard_column = c(names(tmp), names(tmp2))
discard_column
## [1] NA NA NA NA "policy_code"
## [6] NA NA NA NA NA
## [11] NA NA NA NA NA
## [16] NA NA NA NA NA
## [21] NA NA NA NA NA
# There is only one predictor that meets this criteria
# Dropping the zero-variance feature we have found
data_train = data_train[,!names(data_train) %in% discard_column]
# Dropping further attributes that are not needed
# Let's look at attributes 'title' and 'purpose'
table(data_train$title)
##
## Business Car financing
## 3348 404 390
## Credit card refinancing Debt consolidation Green loan
## 8246 22704 21
## Home buying Home improvement Major purchase
## 207 2604 862
## Medical expenses Moving and relocation Other
## 429 253 2207
## Vacation
## 234
table(data_train$purpose)
##
## car credit_card debt_consolidation home_improvement
## 410 9089 24729 2782
## house major_purchase medical moving
## 224 919 460 270
## other renewable_energy small_business vacation
## 2309 25 438 254
# Since both attributes have same information, we will proceed dropping title
data_train$title = NULL
# Looking at 'desc' column
str(data_train$desc)
## chr [1:41909] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ...
# Let's drop desc column as well since it contains mostly empty values
data_train$desc = NULL
# Default by States
# Filtering out the states that have too small no. of loans(less than 1000)
tmp = data_train %>% filter(loan_status=="Default") %>% group_by(addr_state) %>% summarise(default_count = n())
tmp2 = data_train %>% group_by(addr_state) %>% summarise(count = n())
tmp3 = tmp2 %>% left_join(tmp) %>% mutate(default_rate = default_count/count)
## Joining with `by = join_by(addr_state)`
tmp3
## # A tibble: 50 × 4
## addr_state count default_count default_rate
## <chr> <int> <int> <dbl>
## 1 AK 114 36 0.316
## 2 AL 543 198 0.365
## 3 AR 302 121 0.401
## 4 AZ 1090 305 0.280
## 5 CA 6166 1846 0.299
## 6 CO 1005 223 0.222
## 7 CT 550 145 0.264
## 8 DC 95 18 0.189
## 9 DE 108 28 0.259
## 10 FL 3025 954 0.315
## # ℹ 40 more rows
# Order States by Default Rate to identify highest and lowest default rates
# order by highest default rate
high_default = (tmp3 %>% filter(count>1000) %>% arrange(desc(default_rate))) [1:10, "addr_state"]$addr_state
high_default
## [1] "NJ" "NY" "PA" "FL" "OH" "TX" "NC" "IL" "CA" "VA"
# order by lowest default rate
low_default = (tmp3 %>% filter(count>1000) %>% arrange((default_rate))) [1:10, "addr_state"]$addr_state
low_default
## [1] "CO" "GA" "AZ" "MI" "VA" "CA" "IL" "NC" "TX" "OH"
# Creating Binary variables for 5 highest states and 5 lowest states, discarding the rest
data_train$is_nj = ifelse(data_train$addr_state=='NJ',1,0)
data_train$is_ny = ifelse(data_train$addr_state=='NY',1,0)
data_train$is_pa = ifelse(data_train$addr_state=='PA',1,0)
data_train$is_fl = ifelse(data_train$addr_state=='FL',1,0)
data_train$is_oh = ifelse(data_train$addr_state=='OH',1,0)
data_train$is_co = ifelse(data_train$addr_state=='CO',1,0)
data_train$is_ga = ifelse(data_train$addr_state=='GA',1,0)
data_train$is_az = ifelse(data_train$addr_state=='AZ',1,0)
data_train$is_mi = ifelse(data_train$addr_state=='MI',1,0)
data_train$is_va = ifelse(data_train$addr_state=='VA',1,0)
# Let's proceed removing addr_state attribute and other temporary objects
data_train$addr_state = NULL
rm(tmp, tmp2, tmp3, high_default, low_default)
# Remove Correlated features
# Investigating if there are any correlation among features
install.packages("corrplot", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(data_train[getNumericColumns(data_train)], use="na.or.complete"))

# Numeric features
# Looking at all the numeric features
str(data_train[getNumericColumns(data_train)])
## 'data.frame': 41909 obs. of 75 variables:
## $ funded_amnt : int 35000 14400 18000 7250 10000 10000 5200 8400 10000 6950 ...
## $ int_rate : num 20.75 10.75 11.99 13.67 5.32 ...
## $ annual_inc : num 85000 85000 100000 72000 45000 43000 36000 66000 60000 50000 ...
## $ dti : num 24.98 28.11 20.41 23.93 8.03 ...
## $ delinq_2yrs : int 0 0 1 1 0 0 0 0 0 0 ...
## $ earliest_cr_line : num 2647 5873 5083 11382 9436 ...
## $ inq_last_6mths : int 1 0 4 0 0 1 0 1 0 0 ...
## $ mths_since_last_delinq : int NA 72 18 20 NA NA NA 60 35 55 ...
## $ open_acc : int 9 16 21 12 7 9 11 7 18 5 ...
## $ pub_rec : int 0 0 0 1 1 0 0 0 0 4 ...
## $ revol_bal : int 10167 37582 29047 12220 471 10139 4970 11059 16291 7096 ...
## $ revol_util : num 46.6 85.8 49 65 7.4 30.7 16.7 78.4 75.4 59.1 ...
## $ total_acc : int 19 33 40 24 15 14 12 22 25 26 ...
## $ collections_12_mths_ex_med: int 0 0 0 0 0 0 0 1 0 0 ...
## $ acc_now_delinq : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tot_coll_amt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tot_cur_bal : int 161643 190166 299784 366483 11970 42767 13483 157051 77604 7096 ...
## $ open_acc_6m : int 0 0 3 1 0 1 1 1 0 0 ...
## $ open_act_il : int 3 3 1 2 1 2 1 1 8 0 ...
## $ open_il_12m : int 0 1 1 1 0 1 1 1 1 0 ...
## $ open_il_24m : int 3 1 1 3 1 1 1 1 5 0 ...
## $ mths_since_rcnt_il : int 14 12 4 3 23 6 1 10 7 91 ...
## $ total_bal_il : int 73863 22387 29197 40343 11499 32628 8513 19231 61313 0 ...
## $ il_util : int 83 47 92 92 72 73 100 91 94 NA ...
## $ open_rv_12m : int 0 0 4 1 1 1 0 2 0 0 ...
## $ open_rv_24m : int 1 0 8 1 2 1 3 2 1 1 ...
## $ max_bal_bc : int 5109 12211 7172 3694 325 5260 2060 3019 7376 5255 ...
## $ all_util : int 71 66 49 84 54 55 35 86 89 59 ...
## $ total_rev_hi_lim : int 21800 43800 58950 18800 6400 33000 29700 14100 21600 12000 ...
## $ inq_fi : int 5 0 0 0 2 2 0 1 3 0 ...
## $ total_cu_tl : int 1 0 1 0 2 0 0 1 3 0 ...
## $ inq_last_12m : int 2 0 5 1 0 3 0 3 6 1 ...
## $ acc_open_past_24mths : int 4 1 9 4 3 2 4 4 6 1 ...
## $ avg_cur_bal : int 17960 11885 16654 30540 1710 4752 1226 22436 4565 1419 ...
## $ bc_open_to_buy : int 4833 3393 5072 997 4329 9861 22954 585 1709 2934 ...
## $ bc_util : num 67.8 87.4 59 90.1 9.8 50.7 17.4 90.6 90.5 69.8 ...
## $ chargeoff_within_12_mths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ delinq_amnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mo_sin_old_il_acct : int 87 145 102 132 135 65 1 129 104 153 ...
## $ mo_sin_old_rev_tl_op : int 65 193 167 374 310 65 159 129 107 205 ...
## $ mo_sin_rcnt_rev_tl_op : int 22 26 0 8 10 10 17 2 22 17 ...
## $ mo_sin_rcnt_tl : int 14 12 0 3 10 6 1 2 7 17 ...
## $ mort_acc : int 1 6 1 4 2 0 0 2 0 8 ...
## $ mths_since_recent_bc : int 22 32 9 59 10 10 17 89 22 31 ...
## $ mths_since_recent_inq : int 5 20 0 9 23 6 NA 2 7 11 ...
## $ num_accts_ever_120_pd : int 0 0 0 3 0 0 0 2 0 1 ...
## $ num_actv_bc_tl : int 3 5 9 3 2 4 7 2 6 3 ...
## $ num_actv_rev_tl : int 3 11 16 5 2 4 8 4 6 4 ...
## $ num_bc_sats : int 3 5 16 3 4 4 8 2 6 3 ...
## $ num_bc_tl : int 4 9 16 4 5 4 9 7 7 7 ...
## $ num_il_tl : int 9 7 6 8 4 4 1 2 13 3 ...
## $ num_op_rev_tl : int 5 11 16 9 6 7 10 5 10 5 ...
## $ num_rev_accts : int 9 20 33 12 9 10 11 12 12 15 ...
## $ num_rev_tl_bal_gt_0 : int 3 11 11 5 2 4 8 4 6 4 ...
## $ num_sats : int 9 16 18 12 7 9 11 7 18 5 ...
## $ num_tl_90g_dpd_24m : int 0 0 0 1 0 0 0 0 0 0 ...
## $ num_tl_op_past_12m : int 0 1 5 2 1 2 1 3 1 0 ...
## $ pct_tl_nvr_dlq : num 100 93.9 98 83.3 100 100 100 86.4 96 91.3 ...
## $ percent_bc_gt_75 : num 33.3 100 22.2 100 0 0 25 100 83.3 33.3 ...
## $ pub_rec_bankruptcies : int 0 0 0 0 1 0 0 0 0 3 ...
## $ tax_liens : int 0 0 0 1 0 0 0 0 0 1 ...
## $ tot_hi_cred_lim : int 193751 232277 350840 374306 22363 77555 38200 166260 86947 12000 ...
## $ total_bal_ex_mort : int 84030 59969 58244 52563 11970 42767 13483 30290 77604 7096 ...
## $ total_bc_limit : int 15000 27000 45650 10100 4800 20000 27800 6200 18000 9700 ...
## $ total_il_high_credit_limit: int 87951 47352 31590 43666 15963 44555 8500 21079 65347 0 ...
## $ is_nj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_ny : num 1 0 0 0 0 0 0 0 0 0 ...
## $ is_pa : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_fl : num 0 0 0 0 1 0 0 0 0 1 ...
## $ is_oh : num 0 0 0 0 0 0 1 0 0 0 ...
## $ is_co : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_ga : num 0 0 0 0 0 0 0 1 0 0 ...
## $ is_az : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_mi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_va : num 0 0 0 0 0 0 0 0 0 0 ...
# Transforming annual_inc, revol_bal, avg_cur_bal, and bc_open_to_buy by dividing them by funded_amnt (amount of loan issued or funded)
data_train$annual_inc = data_train$annual_inc/data_train$funded_amnt
data_train$revol_bal = data_train$revol_bal/data_train$funded_amnt
data_train$avg_cur_bal = data_train$avg_cur_bal/data_train$funded_amnt
data_train$bc_open_to_buy = data_train$bc_open_to_buy/data_train$funded_amnt
# Removing funded_amnt attribute
data_train$funded_amnt = NULL
# Character features
# Looking at all the character features
str(data_train[getCharColumns(data_train)])
## 'data.frame': 41909 obs. of 11 variables:
## $ term : chr " 60 months" " 36 months" " 60 months" " 36 months" ...
## $ sub_grade : chr "E2" "B4" "C1" "C3" ...
## $ emp_length : chr "3 years" "10+ years" "6 years" "3 years" ...
## $ home_ownership : chr "MORTGAGE" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
## $ verification_status : chr "Source Verified" "Not Verified" "Verified" "Source Verified" ...
## $ loan_status : chr "Fully Paid" "Fully Paid" "Fully Paid" "Fully Paid" ...
## $ pymnt_plan : chr "n" "n" "n" "n" ...
## $ purpose : chr "debt_consolidation" "small_business" "debt_consolidation" "debt_consolidation" ...
## $ initial_list_status : chr "w" "w" "w" "w" ...
## $ application_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ verification_status_joint: chr "" "" "" "" ...
# Removing verification_status_joint attribute as it only contains nulls
data_train$verification_status_joint = NULL
# Looking at the home_ownership attribute
table(data_train$home_ownership)
##
## MORTGAGE OWN RENT
## 20984 5127 15798
# Let's now look at all loans or observation with pymnt_plan = "y", that all ended in being default status
data_train %>% filter(pymnt_plan=="y") %>% select(pymnt_plan, loan_status)
## pymnt_plan loan_status
## 1 y Default
## 2 y Default
## 3 y Default
## 4 y Default
## 5 y Default
## 6 y Default
## 7 y Default
## 8 y Default
## 9 y Default
## 10 y Default
## 11 y Default
## 12 y Default
## 13 y Default
## 14 y Default
## 15 y Default
## 16 y Default
## 17 y Default
## 18 y Default
## 19 y Default
## 20 y Default
## 21 y Default
## 22 y Default
## 23 y Default
## 24 y Default
## 25 y Default
## 26 y Default
## 27 y Default
## 28 y Default
## 29 y Default
## 30 y Default
## 31 y Default
## 32 y Default
## 33 y Default
## 34 y Default
## 35 y Default
## 36 y Default
## 37 y Default
## 38 y Default
## 39 y Default
## 40 y Default
## 41 y Default
## 42 y Default
## 43 y Default
## 44 y Default
## 45 y Default
## 46 y Default
## 47 y Default
## 48 y Default
## 49 y Default
## 50 y Default
## 51 y Default
## 52 y Default
## 53 y Default
## 54 y Default
## 55 y Default
## 56 y Default
## 57 y Default
## 58 y Default
## 59 y Default
## 60 y Default
## 61 y Default
## 62 y Default
## 63 y Default
## 64 y Default
## we can see there are total of 64 observations that are defaulted but had payment plan as yes
# In our training data with 41909 observation, 64 is a miniscule number of records
# For our analysis purpose, we will remove the pymnt_plan attribute
data_train$pymnt_plan = NULL
# Logistic regression
# Using the preProcess function from caret package to center and scale (Normalize) the data
## we can try to remove the number of dimensions further by fitting the logistic regression
## and investigate p-value of the coefficients. Our Null hypothesis is that each feature makes
## no contribution to the predictive model (i.e. its coefficient is zero).
## we will then discard each feature that fails to reject the hypothesis.
install.packages("caret", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(caret)
## Loading required package: lattice
install.packages("rms", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
trans.model = preProcess(data_train, method = c("center", "scale"))
data_train = predict(trans.model, data_train)
model = lrm(loan_status ~ ., data_train)
model
## Frequencies of Missing Values Due to Each Variable
## loan_status term
## 0 0
## int_rate sub_grade
## 0 0
## emp_length home_ownership
## 0 0
## annual_inc verification_status
## 0 0
## purpose dti
## 0 4
## delinq_2yrs earliest_cr_line
## 0 0
## inq_last_6mths mths_since_last_delinq
## 1 20133
## open_acc pub_rec
## 0 0
## revol_bal revol_util
## 0 28
## total_acc initial_list_status
## 0 0
## collections_12_mths_ex_med application_type
## 0 0
## acc_now_delinq tot_coll_amt
## 0 0
## tot_cur_bal open_acc_6m
## 0 20
## open_act_il open_il_12m
## 19 19
## open_il_24m mths_since_rcnt_il
## 19 948
## total_bal_il il_util
## 19 5112
## open_rv_12m open_rv_24m
## 19 19
## max_bal_bc all_util
## 19 22
## total_rev_hi_lim inq_fi
## 0 19
## total_cu_tl inq_last_12m
## 20 20
## acc_open_past_24mths avg_cur_bal
## 0 0
## bc_open_to_buy bc_util
## 392 423
## chargeoff_within_12_mths delinq_amnt
## 0 0
## mo_sin_old_il_acct mo_sin_old_rev_tl_op
## 931 0
## mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl
## 0 0
## mort_acc mths_since_recent_bc
## 0 366
## mths_since_recent_inq num_accts_ever_120_pd
## 3339 0
## num_actv_bc_tl num_actv_rev_tl
## 0 0
## num_bc_sats num_bc_tl
## 0 0
## num_il_tl num_op_rev_tl
## 0 0
## num_rev_accts num_rev_tl_bal_gt_0
## 0 0
## num_sats num_tl_90g_dpd_24m
## 0 0
## num_tl_op_past_12m pct_tl_nvr_dlq
## 0 0
## percent_bc_gt_75 pub_rec_bankruptcies
## 399 0
## tax_liens tot_hi_cred_lim
## 0 0
## total_bal_ex_mort total_bc_limit
## 0 0
## total_il_high_credit_limit is_nj
## 0 0
## is_ny is_pa
## 0 0
## is_fl is_oh
## 0 0
## is_co is_ga
## 0 0
## is_az is_mi
## 0 0
## is_va
## 0
##
## Logistic Regression Model
##
## lrm(formula = loan_status ~ ., data = data_train)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 18046 LR chi2 2593.14 R2 0.187 C 0.725
## Default 5779 d.f. 137 R2(137,18046)0.127 Dxy 0.450
## Fully Paid 12267 Pr(> chi2) <0.0001 R2(137,11785)0.188 gamma 0.450
## max |deriv| 1e-06 Brier 0.188 tau-a 0.196
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 3.1104 0.5021 6.20 <0.0001
## term= 60 months -0.3924 0.0478 -8.21 <0.0001
## int_rate 0.2680 0.2606 1.03 0.3038
## sub_grade=A2 -0.4924 0.2583 -1.91 0.0566
## sub_grade=A3 -0.3362 0.2698 -1.25 0.2128
## sub_grade=A4 -0.6098 0.2568 -2.37 0.0176
## sub_grade=A5 -0.8196 0.2529 -3.24 0.0012
## sub_grade=B1 -1.0137 0.2586 -3.92 <0.0001
## sub_grade=B2 -0.9146 0.2824 -3.24 0.0012
## sub_grade=B3 -1.0339 0.3034 -3.41 0.0007
## sub_grade=B4 -1.4030 0.3427 -4.09 <0.0001
## sub_grade=B5 -1.2283 0.3741 -3.28 0.0010
## sub_grade=C1 -1.4535 0.3966 -3.66 0.0002
## sub_grade=C2 -1.6519 0.4413 -3.74 0.0002
## sub_grade=C3 -1.6109 0.4722 -3.41 0.0006
## sub_grade=C4 -1.7167 0.5065 -3.39 0.0007
## sub_grade=C5 -1.9442 0.5473 -3.55 0.0004
## sub_grade=D1 -1.9237 0.5967 -3.22 0.0013
## sub_grade=D2 -2.1504 0.6434 -3.34 0.0008
## sub_grade=D3 -2.3580 0.6894 -3.42 0.0006
## sub_grade=D4 -2.3300 0.7242 -3.22 0.0013
## sub_grade=D5 -2.5266 0.7516 -3.36 0.0008
## sub_grade=E1 -2.4304 0.7765 -3.13 0.0017
## sub_grade=E2 -2.4651 0.8099 -3.04 0.0023
## sub_grade=E3 -2.4197 0.8330 -2.90 0.0037
## sub_grade=E4 -2.4510 0.8707 -2.81 0.0049
## sub_grade=E5 -2.6498 0.9040 -2.93 0.0034
## sub_grade=F1 -2.5280 0.9415 -2.69 0.0073
## sub_grade=F2 -2.7730 0.9927 -2.79 0.0052
## sub_grade=F3 -3.1869 1.0470 -3.04 0.0023
## sub_grade=F4 -3.3709 1.0946 -3.08 0.0021
## sub_grade=F5 -3.0529 1.1309 -2.70 0.0069
## sub_grade=G1 -2.9341 1.1875 -2.47 0.0135
## sub_grade=G2 -3.1018 1.2344 -2.51 0.0120
## sub_grade=G3 -2.8577 1.2551 -2.28 0.0228
## sub_grade=G4 -3.5188 1.2945 -2.72 0.0066
## sub_grade=G5 -3.2211 1.3048 -2.47 0.0136
## emp_length=1 year 0.1216 0.0917 1.33 0.1849
## emp_length=10+ years 0.1559 0.0708 2.20 0.0277
## emp_length=2 years 0.0215 0.0847 0.25 0.7999
## emp_length=3 years 0.0676 0.0871 0.78 0.4380
## emp_length=4 years 0.1874 0.0957 1.96 0.0503
## emp_length=5 years 0.0104 0.0940 0.11 0.9120
## emp_length=6 years 0.0538 0.1056 0.51 0.6103
## emp_length=7 years -0.1075 0.1106 -0.97 0.3310
## emp_length=8 years 0.0033 0.1004 0.03 0.9738
## emp_length=9 years 0.0547 0.1052 0.52 0.6028
## emp_length=n/a -0.4117 0.0972 -4.24 <0.0001
## home_ownership=OWN -0.0958 0.0593 -1.61 0.1065
## home_ownership=RENT -0.3107 0.0475 -6.54 <0.0001
## annual_inc 0.2064 0.0495 4.17 <0.0001
## verification_status=Source Verified -0.0440 0.0442 -0.99 0.3202
## verification_status=Verified 0.0066 0.0485 0.14 0.8911
## purpose=credit_card -0.4770 0.2154 -2.21 0.0268
## purpose=debt_consolidation -0.4900 0.2124 -2.31 0.0210
## purpose=home_improvement -0.5901 0.2216 -2.66 0.0077
## purpose=house -0.5003 0.3052 -1.64 0.1011
## purpose=major_purchase -0.8237 0.2409 -3.42 0.0006
## purpose=medical -0.8457 0.2643 -3.20 0.0014
## purpose=moving -0.7877 0.3002 -2.62 0.0087
## purpose=other -0.5468 0.2236 -2.45 0.0145
## purpose=renewable_energy -1.9616 0.9284 -2.11 0.0346
## purpose=small_business -0.9322 0.2620 -3.56 0.0004
## purpose=vacation -1.1051 0.3141 -3.52 0.0004
## dti -0.1354 0.0239 -5.67 <0.0001
## delinq_2yrs -0.0500 0.0214 -2.33 0.0197
## earliest_cr_line -0.2244 0.0462 -4.85 <0.0001
## inq_last_6mths -0.0299 0.0215 -1.39 0.1651
## mths_since_last_delinq 0.0889 0.0218 4.07 <0.0001
## open_acc 0.1100 0.3539 0.31 0.7559
## pub_rec 0.0473 0.0422 1.12 0.2631
## revol_bal 0.0677 0.0457 1.48 0.1383
## revol_util -0.0579 0.0414 -1.40 0.1618
## total_acc 0.0562 0.3323 0.17 0.8658
## initial_list_status=w 0.1027 0.0467 2.20 0.0279
## collections_12_mths_ex_med -0.0246 0.0145 -1.70 0.0890
## application_type=Joint App 0.3663 0.1307 2.80 0.0051
## acc_now_delinq 0.0021 0.0135 0.15 0.8771
## tot_coll_amt 0.0289 0.0171 1.69 0.0911
## tot_cur_bal 0.0423 0.1363 0.31 0.7564
## open_acc_6m 0.0020 0.0263 0.07 0.9408
## open_act_il 0.0027 0.0813 0.03 0.9732
## open_il_12m 0.0345 0.0751 0.46 0.6458
## open_il_24m -0.0610 0.0903 -0.68 0.4992
## mths_since_rcnt_il -0.0534 0.0335 -1.60 0.1105
## total_bal_il 0.2258 0.1109 2.04 0.0417
## il_util 0.0148 0.0312 0.47 0.6363
## open_rv_12m 0.1682 0.1105 1.52 0.1280
## open_rv_24m -0.1534 0.1369 -1.12 0.2625
## max_bal_bc 0.0305 0.0453 0.67 0.5011
## all_util -0.1037 0.0417 -2.49 0.0128
## total_rev_hi_lim 0.0897 0.0612 1.47 0.1428
## inq_fi -0.0610 0.0195 -3.13 0.0018
## total_cu_tl 0.0387 0.0184 2.10 0.0356
## inq_last_12m 0.0436 0.0236 1.85 0.0644
## acc_open_past_24mths 0.0524 0.1665 0.31 0.7530
## avg_cur_bal 0.0210 0.0362 0.58 0.5625
## bc_open_to_buy 0.0448 0.0464 0.97 0.3344
## bc_util 0.0841 0.0454 1.85 0.0641
## chargeoff_within_12_mths 0.0020 0.0136 0.15 0.8825
## delinq_amnt 0.0276 0.0251 1.10 0.2713
## mo_sin_old_il_acct 0.0347 0.0217 1.60 0.1099
## mo_sin_old_rev_tl_op 0.1823 0.0453 4.02 <0.0001
## mo_sin_rcnt_rev_tl_op -0.0342 0.0296 -1.15 0.2484
## mo_sin_rcnt_tl 0.0950 0.0367 2.59 0.0096
## mort_acc 0.0925 0.0556 1.66 0.0962
## mths_since_recent_bc 0.0744 0.0246 3.02 0.0025
## mths_since_recent_inq 0.0347 0.0230 1.51 0.1303
## num_accts_ever_120_pd 0.0255 0.0180 1.42 0.1560
## num_actv_bc_tl -0.0658 0.0494 -1.33 0.1825
## num_actv_rev_tl -0.1144 0.0841 -1.36 0.1737
## num_bc_sats -0.0754 0.0495 -1.52 0.1280
## num_bc_tl 0.0264 0.0481 0.55 0.5835
## num_il_tl -0.0495 0.2063 -0.24 0.8104
## num_op_rev_tl -0.0403 0.1387 -0.29 0.7716
## num_rev_accts 0.0500 0.2311 0.22 0.8286
## num_rev_tl_bal_gt_0 -0.0062 0.0829 -0.08 0.9401
## num_sats -0.0729 0.3589 -0.20 0.8391
## num_tl_90g_dpd_24m 0.0321 0.0183 1.76 0.0791
## num_tl_op_past_12m -0.1466 0.1373 -1.07 0.2857
## pct_tl_nvr_dlq 0.0261 0.0222 1.18 0.2386
## percent_bc_gt_75 -0.1117 0.0328 -3.41 0.0007
## pub_rec_bankruptcies -0.0439 0.0308 -1.42 0.1546
## tax_liens -0.0364 0.0341 -1.07 0.2862
## tot_hi_cred_lim 0.0288 0.1396 0.21 0.8365
## total_bal_ex_mort -0.2741 0.1009 -2.72 0.0066
## total_bc_limit 0.1166 0.0636 1.83 0.0666
## total_il_high_credit_limit 0.0226 0.0780 0.29 0.7724
## is_nj -0.0413 0.0165 -2.50 0.0123
## is_ny -0.0498 0.0177 -2.81 0.0049
## is_pa -0.0228 0.0168 -1.36 0.1751
## is_fl -0.0004 0.0177 -0.02 0.9839
## is_oh -0.0182 0.0172 -1.06 0.2879
## is_co 0.0453 0.0195 2.33 0.0198
## is_ga 0.0156 0.0176 0.89 0.3732
## is_az 0.0211 0.0174 1.21 0.2259
## is_mi 0.0010 0.0172 0.06 0.9546
## is_va -0.0024 0.0179 -0.13 0.8947