# Importing the data into R
loandata = read.csv("loan_data_PD.csv")

# Set seed
set.seed(0)

# Look at structure and summary of our data
str(loandata)
## 'data.frame':    133889 obs. of  145 variables:
##  $ id                                        : chr  "" "" "" "" ...
##  $ member_id                                 : logi  NA NA NA NA NA NA ...
##  $ loan_amnt                                 : int  10000 35000 20000 17475 8000 14400 18000 5800 12500 3000 ...
##  $ funded_amnt                               : int  10000 35000 20000 17475 8000 14400 18000 5800 12500 3000 ...
##  $ funded_amnt_inv                           : num  10000 35000 20000 17475 8000 ...
##  $ term                                      : chr  " 60 months" " 60 months" " 60 months" " 60 months" ...
##  $ int_rate                                  : chr  " 19.53%" " 20.75%" "  9.16%" " 11.47%" ...
##  $ installment                               : num  262 942 417 384 255 ...
##  $ grade                                     : chr  "D" "E" "B" "B" ...
##  $ sub_grade                                 : chr  "D5" "E2" "B2" "B5" ...
##  $ emp_title                                 : chr  "lpn/charge nurse" "Coiler" "Reliability Engineer" "" ...
##  $ emp_length                                : chr  "4 years" "3 years" "1 year" "n/a" ...
##  $ home_ownership                            : chr  "OWN" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
##  $ annual_inc                                : num  52000 85000 77000 41682 72000 ...
##  $ verification_status                       : chr  "Source Verified" "Source Verified" "Not Verified" "Verified" ...
##  $ issue_d                                   : chr  "Mar-2016" "Mar-2016" "Mar-2016" "Mar-2016" ...
##  $ loan_status                               : chr  "Fully Paid" "Fully Paid" "Current" "Current" ...
##  $ pymnt_plan                                : chr  "n" "n" "n" "n" ...
##  $ url                                       : logi  NA NA NA NA NA NA ...
##  $ desc                                      : chr  "" "" "" "" ...
##  $ purpose                                   : chr  "other" "debt_consolidation" "home_improvement" "debt_consolidation" ...
##  $ title                                     : chr  "Other" "Debt consolidation" "Home improvement" "Debt consolidation" ...
##  $ zip_code                                  : chr  "317xx" "144xx" "606xx" "796xx" ...
##  $ addr_state                                : chr  "GA" "NY" "IL" "TX" ...
##  $ dti                                       : num  15 25 13.8 30.1 22.6 ...
##  $ delinq_2yrs                               : int  0 0 0 0 0 0 1 1 0 1 ...
##  $ earliest_cr_line                          : chr  "Oct-2000" "Dec-2008" "Mar-2005" "Feb-1976" ...
##  $ inq_last_6mths                            : int  1 1 1 0 0 0 4 0 1 0 ...
##  $ mths_since_last_delinq                    : int  44 NA NA NA NA 72 18 2 NA 9 ...
##  $ mths_since_last_record                    : int  52 NA NA NA NA NA NA NA NA 71 ...
##  $ open_acc                                  : int  4 9 14 10 4 16 21 5 8 7 ...
##  $ pub_rec                                   : int  2 0 0 0 0 0 0 0 0 1 ...
##  $ revol_bal                                 : int  1077 10167 4204 24431 19735 37582 29047 4416 9215 2099 ...
##  $ revol_util                                : chr  "35.9%" "46.6%" "13.9%" "27.6%" ...
##  $ total_acc                                 : int  12 19 19 18 12 33 40 10 15 25 ...
##  $ initial_list_status                       : chr  "w" "w" "w" "w" ...
##  $ out_prncp                                 : num  0 0 13700 12189 3373 ...
##  $ out_prncp_inv                             : num  0 0 13700 12189 3373 ...
##  $ total_pymnt                               : num  11128 37226 9148 8433 5602 ...
##  $ total_pymnt_inv                           : num  11128 37226 9148 8433 5602 ...
##  $ total_rec_prncp                           : num  10000 35000 6300 5286 4627 ...
##  $ total_rec_int                             : num  1128 2226 2848 3147 975 ...
##  $ total_rec_late_fee                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ recoveries                                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ collection_recovery_fee                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ last_pymnt_d                              : chr  "Nov-2016" "Aug-2016" "Feb-2018" "Feb-2018" ...
##  $ last_pymnt_amnt                           : num  9575 509 417 384 255 ...
##  $ next_pymnt_d                              : chr  "" "" "Mar-2018" "Mar-2018" ...
##  $ last_credit_pull_d                        : chr  "Dec-2017" "Feb-2018" "Feb-2018" "Feb-2018" ...
##  $ collections_12_mths_ex_med                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mths_since_last_major_derog               : int  44 NA NA NA NA NA NA 2 NA NA ...
##  $ policy_code                               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ application_type                          : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ annual_inc_joint                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ dti_joint                                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ verification_status_joint                 : chr  "" "" "" "" ...
##  $ acc_now_delinq                            : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ tot_coll_amt                              : int  622 0 0 0 0 0 0 0 0 0 ...
##  $ tot_cur_bal                               : int  15886 161643 25031 118388 59568 190166 299784 10934 175815 153995 ...
##  $ open_acc_6m                               : int  2 0 0 0 0 0 3 0 1 0 ...
##  $ open_act_il                               : int  1 3 8 0 3 3 1 1 3 0 ...
##  $ open_il_12m                               : int  4 0 0 0 1 1 1 0 2 1 ...
##  $ open_il_24m                               : int  5 3 0 0 3 1 1 0 2 2 ...
##  $ mths_since_rcnt_il                        : int  2 14 46 37 9 12 4 73 11 7 ...
##  $ total_bal_il                              : int  14809 73863 20827 0 39833 22387 29197 1972 30319 0 ...
##  $ il_util                                   : int  99 83 38 NA 75 47 92 147 81 NA ...
##  $ open_rv_12m                               : int  0 0 2 0 0 0 4 1 2 0 ...
##  $ open_rv_24m                               : int  0 1 3 1 0 0 8 2 4 0 ...
##  $ max_bal_bc                                : int  1007 5109 650 10239 0 12211 7172 3957 7318 1065 ...
##  $ all_util                                  : int  88 71 30 28 81 66 49 85 81 13 ...
##  $ total_rev_hi_lim                          : int  3000 21800 30200 62500 20000 43800 58950 7000 11400 15900 ...
##  $ inq_fi                                    : int  3 5 1 0 2 0 0 1 1 1 ...
##  $ total_cu_tl                               : int  0 1 0 0 6 0 1 0 0 4 ...
##  $ inq_last_12m                              : int  2 2 2 0 1 0 5 0 4 0 ...
##  $ acc_open_past_24mths                      : int  5 4 3 1 3 1 9 2 7 2 ...
##  $ avg_cur_bal                               : int  3972 17960 1788 11839 14892 11885 16654 2187 21976 30799 ...
##  $ bc_open_to_buy                            : int  1623 4833 21491 17072 NA 3393 5072 605 3500 13801 ...
##  $ bc_util                                   : num  39.9 67.8 3.6 20.4 NA 87.4 59 87.9 85 13.2 ...
##  $ chargeoff_within_12_mths                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ delinq_amnt                               : int  0 0 0 0 0 0 0 4199 0 0 ...
##  $ mo_sin_old_il_acct                        : int  101 87 127 140 57 145 102 88 143 141 ...
##  $ mo_sin_old_rev_tl_op                      : int  185 65 132 481 100 193 167 170 163 144 ...
##  $ mo_sin_rcnt_rev_tl_op                     : int  25 22 9 22 33 26 0 11 3 39 ...
##  $ mo_sin_rcnt_tl                            : int  2 14 9 22 9 12 0 11 3 7 ...
##  $ mort_acc                                  : int  0 1 0 1 0 6 1 0 2 6 ...
##  $ mths_since_recent_bc                      : int  25 22 10 22 NA 32 9 23 3 39 ...
##  $ mths_since_recent_bc_dlq                  : int  49 NA NA NA NA NA NA NA NA NA ...
##  $ mths_since_recent_inq                     : int  4 5 3 NA 9 20 0 18 3 22 ...
##  $ mths_since_recent_revol_delinq            : int  49 NA NA NA NA 72 18 NA NA NA ...
##  $ num_accts_ever_120_pd                     : int  2 0 0 0 0 0 0 1 0 0 ...
##  $ num_actv_bc_tl                            : int  2 3 2 5 0 5 9 2 3 4 ...
##  $ num_actv_rev_tl                           : int  2 3 4 7 1 11 16 3 4 4 ...
##  $ num_bc_sats                               : int  2 3 4 5 0 5 16 2 4 6 ...
##  $ num_bc_tl                                 : int  3 4 6 7 3 9 16 4 4 11 ...
##  $ num_il_tl                                 : int  7 9 9 4 8 7 6 3 7 8 ...
##  $ num_op_rev_tl                             : int  3 5 6 9 1 11 16 3 4 6 ...
##  $ num_rev_accts                             : int  5 9 10 13 4 20 33 6 6 11 ...
##  $ num_rev_tl_bal_gt_0                       : int  2 3 4 7 1 11 11 3 4 4 ...
##  $ num_sats                                  : int  4 9 14 10 4 16 18 4 8 7 ...
##   [list output truncated]
summary(loandata)
##       id            member_id        loan_amnt      funded_amnt   
##  Length:133889      Mode:logical   Min.   : 1000   Min.   : 1000  
##  Class :character   NA's:133889    1st Qu.: 8400   1st Qu.: 8400  
##  Mode  :character                  Median :14400   Median :14400  
##                                    Mean   :15589   Mean   :15589  
##                                    3rd Qu.:21000   3rd Qu.:21000  
##                                    Max.   :40000   Max.   :40000  
##                                    NA's   :2       NA's   :2      
##  funded_amnt_inv     term             int_rate          installment     
##  Min.   :  775   Length:133889      Length:133889      Min.   :  30.12  
##  1st Qu.: 8400   Class :character   Class :character   1st Qu.: 265.68  
##  Median :14400   Mode  :character   Mode  :character   Median : 398.52  
##  Mean   :15582                                         Mean   : 460.86  
##  3rd Qu.:21000                                         3rd Qu.: 613.91  
##  Max.   :40000                                         Max.   :1536.95  
##  NA's   :2                                             NA's   :2        
##     grade            sub_grade          emp_title          emp_length       
##  Length:133889      Length:133889      Length:133889      Length:133889     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  home_ownership       annual_inc      verification_status   issue_d         
##  Length:133889      Min.   :      0   Length:133889       Length:133889     
##  Class :character   1st Qu.:  49500   Class :character    Class :character  
##  Mode  :character   Median :  68000   Mode  :character    Mode  :character  
##                     Mean   :  80464                                         
##                     3rd Qu.:  95008                                         
##                     Max.   :9550000                                         
##                     NA's   :2                                               
##  loan_status         pymnt_plan          url              desc          
##  Length:133889      Length:133889      Mode:logical   Length:133889     
##  Class :character   Class :character   NA's:133889    Class :character  
##  Mode  :character   Mode  :character                  Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##    purpose             title             zip_code          addr_state       
##  Length:133889      Length:133889      Length:133889      Length:133889     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##       dti          delinq_2yrs      earliest_cr_line   inq_last_6mths  
##  Min.   : -1.00   Min.   : 0.0000   Length:133889      Min.   :0.0000  
##  1st Qu.: 12.64   1st Qu.: 0.0000   Class :character   1st Qu.:0.0000  
##  Median : 18.76   Median : 0.0000   Mode  :character   Median :0.0000  
##  Mean   : 19.55   Mean   : 0.3435                      Mean   :0.5765  
##  3rd Qu.: 25.77   3rd Qu.: 0.0000                      3rd Qu.:1.0000  
##  Max.   :999.00   Max.   :22.0000                      Max.   :5.0000  
##  NA's   :19       NA's   :2                            NA's   :3       
##  mths_since_last_delinq mths_since_last_record    open_acc    
##  Min.   :  0.00         Min.   :  0.00         Min.   : 1.00  
##  1st Qu.: 15.00         1st Qu.: 51.00         1st Qu.: 8.00  
##  Median : 30.00         Median : 68.00         Median :11.00  
##  Mean   : 33.81         Mean   : 66.36         Mean   :12.12  
##  3rd Qu.: 49.00         3rd Qu.: 82.00         3rd Qu.:15.00  
##  Max.   :192.00         Max.   :121.00         Max.   :74.00  
##  NA's   :64567          NA's   :110218         NA's   :2      
##     pub_rec          revol_bal        revol_util          total_acc     
##  Min.   : 0.0000   Min.   :      0   Length:133889      Min.   :  2.00  
##  1st Qu.: 0.0000   1st Qu.:   6578   Class :character   1st Qu.: 17.00  
##  Median : 0.0000   Median :  12278   Mode  :character   Median : 24.00  
##  Mean   : 0.2387   Mean   :  18616                      Mean   : 25.34  
##  3rd Qu.: 0.0000   3rd Qu.:  22011                      3rd Qu.: 32.00  
##  Max.   :46.0000   Max.   :1023940                      Max.   :176.00  
##  NA's   :2         NA's   :2                            NA's   :2       
##  initial_list_status   out_prncp     out_prncp_inv    total_pymnt   
##  Length:133889       Min.   :    0   Min.   :    0   Min.   :    0  
##  Class :character    1st Qu.:    0   1st Qu.:    0   1st Qu.: 6164  
##  Mode  :character    Median : 2158   Median : 2156   Median :10139  
##                      Mean   : 4542   Mean   : 4540   Mean   :12087  
##                      3rd Qu.: 7555   3rd Qu.: 7547   3rd Qu.:16073  
##                      Max.   :31060   Max.   :31060   Max.   :53854  
##                      NA's   :2       NA's   :2       NA's   :2      
##  total_pymnt_inv total_rec_prncp total_rec_int     total_rec_late_fee
##  Min.   :    0   Min.   :    0   Min.   :    0.0   Min.   :  0.000   
##  1st Qu.: 6159   1st Qu.: 4303   1st Qu.:  890.4   1st Qu.:  0.000   
##  Median :10134   Median : 7280   Median : 1745.8   Median :  0.000   
##  Mean   :12081   Mean   : 9528   Mean   : 2449.7   Mean   :  1.363   
##  3rd Qu.:16065   3rd Qu.:12425   3rd Qu.: 3310.4   3rd Qu.:  0.000   
##  Max.   :53854   Max.   :40000   Max.   :18111.0   Max.   :609.810   
##  NA's   :2       NA's   :2       NA's   :2         NA's   :2         
##    recoveries      collection_recovery_fee last_pymnt_d       last_pymnt_amnt  
##  Min.   :    0.0   Min.   :   0.00         Length:133889      Min.   :    0.0  
##  1st Qu.:    0.0   1st Qu.:   0.00         Class :character   1st Qu.:  318.5  
##  Median :    0.0   Median :   0.00         Mode  :character   Median :  564.3  
##  Mean   :  108.1   Mean   :  19.32                            Mean   : 3492.1  
##  3rd Qu.:    0.0   3rd Qu.:   0.00                            3rd Qu.: 3437.6  
##  Max.   :23271.5   Max.   :4188.87                            Max.   :40688.6  
##  NA's   :2         NA's   :2                                  NA's   :2        
##  next_pymnt_d       last_credit_pull_d collections_12_mths_ex_med
##  Length:133889      Length:133889      Min.   : 0.00000          
##  Class :character   Class :character   1st Qu.: 0.00000          
##  Mode  :character   Mode  :character   Median : 0.00000          
##                                        Mean   : 0.02126          
##                                        3rd Qu.: 0.00000          
##                                        Max.   :12.00000          
##                                        NA's   :2                 
##  mths_since_last_major_derog  policy_code application_type   annual_inc_joint
##  Min.   :  0.00              Min.   :1    Length:133889      Min.   : 18072  
##  1st Qu.: 27.00              1st Qu.:1    Class :character   1st Qu.: 77000  
##  Median : 45.00              Median :1    Mode  :character   Median :102000  
##  Mean   : 45.22              Mean   :1                       Mean   :108618  
##  3rd Qu.: 64.00              3rd Qu.:1                       3rd Qu.:131500  
##  Max.   :197.00              Max.   :1                       Max.   :480000  
##  NA's   :95815               NA's   :2                       NA's   :131228  
##    dti_joint      verification_status_joint acc_now_delinq    
##  Min.   : 1.11    Length:133889             Min.   :0.000000  
##  1st Qu.:13.46    Class :character          1st Qu.:0.000000  
##  Median :18.36    Mode  :character          Median :0.000000  
##  Mean   :18.60                              Mean   :0.006969  
##  3rd Qu.:23.38                              3rd Qu.:0.000000  
##  Max.   :63.66                              Max.   :3.000000  
##  NA's   :131230                             NA's   :2         
##   tot_coll_amt       tot_cur_bal       open_acc_6m      open_act_il    
##  Min.   :     0.0   Min.   :      0   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:     0.0   1st Qu.:  32228   1st Qu.: 0.000   1st Qu.: 1.000  
##  Median :     0.0   Median :  86204   Median : 1.000   Median : 2.000  
##  Mean   :   259.2   Mean   : 147240   Mean   : 1.045   Mean   : 2.797  
##  3rd Qu.:     0.0   3rd Qu.: 217484   3rd Qu.: 2.000   3rd Qu.: 3.000  
##  Max.   :224107.0   Max.   :5445012   Max.   :16.000   Max.   :48.000  
##  NA's   :2          NA's   :2         NA's   :64       NA's   :63      
##   open_il_12m       open_il_24m     mths_since_rcnt_il  total_bal_il   
##  Min.   : 0.0000   Min.   : 0.000   Min.   :  0.00     Min.   :     0  
##  1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.:  6.00     1st Qu.:  9633  
##  Median : 0.0000   Median : 1.000   Median : 13.00     Median : 24144  
##  Mean   : 0.7472   Mean   : 1.625   Mean   : 21.64     Mean   : 35704  
##  3rd Qu.: 1.0000   3rd Qu.: 2.000   3rd Qu.: 24.00     3rd Qu.: 46562  
##  Max.   :20.0000   Max.   :30.000   Max.   :446.00     Max.   :796104  
##  NA's   :63        NA's   :63       NA's   :3703       NA's   :63      
##     il_util        open_rv_12m      open_rv_24m      max_bal_bc    
##  Min.   :  0.00   Min.   : 0.000   Min.   : 0.00   Min.   :     0  
##  1st Qu.: 58.00   1st Qu.: 0.000   1st Qu.: 1.00   1st Qu.:  2525  
##  Median : 74.00   Median : 1.000   Median : 2.00   Median :  4664  
##  Mean   : 70.74   Mean   : 1.374   Mean   : 2.93   Mean   :  6184  
##  3rd Qu.: 87.00   3rd Qu.: 2.000   3rd Qu.: 4.00   3rd Qu.:  8020  
##  Max.   :558.00   Max.   :28.000   Max.   :44.00   Max.   :776843  
##  NA's   :17881    NA's   :63       NA's   :63      NA's   :63      
##     all_util      total_rev_hi_lim      inq_fi         total_cu_tl    
##  Min.   :  0.00   Min.   :      0   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.: 48.00   1st Qu.:  15300   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median : 62.00   Median :  26400   Median : 0.0000   Median : 0.000  
##  Mean   : 60.65   Mean   :  36362   Mean   : 0.9529   Mean   : 1.544  
##  3rd Qu.: 75.00   3rd Qu.:  45000   3rd Qu.: 1.0000   3rd Qu.: 2.000  
##  Max.   :198.00   Max.   :1184500   Max.   :28.0000   Max.   :79.000  
##  NA's   :69       NA's   :2         NA's   :63        NA's   :64      
##   inq_last_12m    acc_open_past_24mths  avg_cur_bal     bc_open_to_buy  
##  Min.   : 0.000   Min.   : 0.000       Min.   :     0   Min.   :     0  
##  1st Qu.: 0.000   1st Qu.: 2.000       1st Qu.:  3294   1st Qu.:  1699  
##  Median : 2.000   Median : 4.000       Median :  7685   Median :  5284  
##  Mean   : 2.187   Mean   : 4.799       Mean   : 13510   Mean   : 10857  
##  3rd Qu.: 3.000   3rd Qu.: 7.000       3rd Qu.: 18552   3rd Qu.: 13357  
##  Max.   :40.000   Max.   :46.000       Max.   :463698   Max.   :282091  
##  NA's   :64       NA's   :2            NA's   :2        NA's   :1113    
##     bc_util       chargeoff_within_12_mths  delinq_amnt       
##  Min.   :  0.00   Min.   :0.00000          Min.   :     0.00  
##  1st Qu.: 38.30   1st Qu.:0.00000          1st Qu.:     0.00  
##  Median : 62.40   Median :0.00000          Median :     0.00  
##  Mean   : 59.76   Mean   :0.01012          Mean   :    21.34  
##  3rd Qu.: 84.40   3rd Qu.:0.00000          3rd Qu.:     0.00  
##  Max.   :189.80   Max.   :7.00000          Max.   :112524.00  
##  NA's   :1177     NA's   :2                NA's   :2          
##  mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl 
##  Min.   :  0.0      Min.   :  5.0        Min.   :  0.00        Min.   :  0.0  
##  1st Qu.:102.0      1st Qu.:121.0        1st Qu.:  3.00        1st Qu.:  3.0  
##  Median :130.0      Median :171.0        Median :  8.00        Median :  5.0  
##  Mean   :127.6      Mean   :188.7        Mean   : 13.16        Mean   :  7.8  
##  3rd Qu.:153.0      3rd Qu.:241.0        3rd Qu.: 16.00        3rd Qu.: 10.0  
##  Max.   :534.0      Max.   :852.0        Max.   :327.00        Max.   :283.0  
##  NA's   :3648       NA's   :2            NA's   :2             NA's   :2      
##     mort_acc      mths_since_recent_bc mths_since_recent_bc_dlq
##  Min.   : 0.000   Min.   :  0.00       Min.   :  0.00          
##  1st Qu.: 0.000   1st Qu.:  6.00       1st Qu.: 19.00          
##  Median : 1.000   Median : 13.00       Median : 36.00          
##  Mean   : 1.671   Mean   : 23.85       Mean   : 38.84          
##  3rd Qu.: 3.000   3rd Qu.: 28.00       3rd Qu.: 58.00          
##  Max.   :51.000   Max.   :564.00       Max.   :188.00          
##  NA's   :2        NA's   :1046         NA's   :100133          
##  mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd
##  Min.   : 0.000        Min.   :  0.00                 Min.   : 0.0000      
##  1st Qu.: 2.000        1st Qu.: 16.00                 1st Qu.: 0.0000      
##  Median : 5.000        Median : 32.00                 Median : 0.0000      
##  Mean   : 6.813        Mean   : 35.22                 Mean   : 0.5189      
##  3rd Qu.:10.000        3rd Qu.: 51.00                 3rd Qu.: 0.0000      
##  Max.   :25.000        Max.   :197.00                 Max.   :38.0000      
##  NA's   :13800         NA's   :86106                  NA's   :2            
##  num_actv_bc_tl  num_actv_rev_tl   num_bc_sats       num_bc_tl     
##  Min.   : 0.00   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 2.00   1st Qu.: 3.000   1st Qu.: 3.000   1st Qu.: 5.000  
##  Median : 3.00   Median : 5.000   Median : 4.000   Median : 7.000  
##  Mean   : 3.81   Mean   : 5.918   Mean   : 4.956   Mean   : 8.005  
##  3rd Qu.: 5.00   3rd Qu.: 8.000   3rd Qu.: 6.000   3rd Qu.:10.000  
##  Max.   :36.00   Max.   :47.000   Max.   :46.000   Max.   :57.000  
##  NA's   :2       NA's   :2        NA's   :2        NA's   :2       
##    num_il_tl       num_op_rev_tl    num_rev_accts   num_rev_tl_bal_gt_0
##  Min.   :  0.000   Min.   : 0.000   Min.   :  2.0   Min.   : 0.000     
##  1st Qu.:  4.000   1st Qu.: 5.000   1st Qu.:  9.0   1st Qu.: 3.000     
##  Median :  7.000   Median : 8.000   Median : 13.0   Median : 5.000     
##  Mean   :  8.703   Mean   : 8.561   Mean   : 14.7   Mean   : 5.858     
##  3rd Qu.: 12.000   3rd Qu.:11.000   3rd Qu.: 19.0   3rd Qu.: 7.000     
##  Max.   :121.000   Max.   :72.000   Max.   :101.0   Max.   :44.000     
##  NA's   :2         NA's   :2        NA's   :2       NA's   :2          
##     num_sats     num_tl_120dpd_2m  num_tl_30dpd      num_tl_90g_dpd_24m
##  Min.   : 1.00   Min.   :0.000    Min.   :0.000000   Min.   : 0.00000  
##  1st Qu.: 8.00   1st Qu.:0.000    1st Qu.:0.000000   1st Qu.: 0.00000  
##  Median :11.00   Median :0.000    Median :0.000000   Median : 0.00000  
##  Mean   :12.06   Mean   :0.001    Mean   :0.004765   Mean   : 0.09061  
##  3rd Qu.:15.00   3rd Qu.:0.000    3rd Qu.:0.000000   3rd Qu.: 0.00000  
##  Max.   :74.00   Max.   :2.000    Max.   :3.000000   Max.   :22.00000  
##  NA's   :2       NA's   :7623     NA's   :2          NA's   :2         
##  num_tl_op_past_12m pct_tl_nvr_dlq   percent_bc_gt_75 pub_rec_bankruptcies
##  Min.   : 0.000     Min.   :  0.00   Min.   :  0.00   Min.   :0.0000      
##  1st Qu.: 1.000     1st Qu.: 91.10   1st Qu.: 10.00   1st Qu.:0.0000      
##  Median : 2.000     Median : 97.50   Median : 40.00   Median :0.0000      
##  Mean   : 2.249     Mean   : 94.06   Mean   : 44.56   Mean   :0.1305      
##  3rd Qu.: 3.000     3rd Qu.:100.00   3rd Qu.: 75.00   3rd Qu.:0.0000      
##  Max.   :31.000     Max.   :100.00   Max.   :100.00   Max.   :9.0000      
##  NA's   :2          NA's   :2        NA's   :1131     NA's   :2           
##    tax_liens        tot_hi_cred_lim   total_bal_ex_mort total_bc_limit   
##  Min.   : 0.00000   Min.   :   2500   Min.   :      0   Min.   :      0  
##  1st Qu.: 0.00000   1st Qu.:  54220   1st Qu.:  23305   1st Qu.:   8600  
##  Median : 0.00000   Median : 121201   Median :  41016   Median :  16700  
##  Mean   : 0.07117   Mean   : 183894   Mean   :  54124   Mean   :  23723  
##  3rd Qu.: 0.00000   3rd Qu.: 263554   3rd Qu.:  68468   3rd Qu.:  31000  
##  Max.   :45.00000   Max.   :9999999   Max.   :1034841   Max.   :1105500  
##  NA's   :2          NA's   :2         NA's   :2         NA's   :2        
##  total_il_high_credit_limit revol_bal_joint sec_app_earliest_cr_line
##  Min.   :      0            Mode:logical    Mode:logical            
##  1st Qu.:  16585            NA's:133889     NA's:133889             
##  Median :  34622                                                    
##  Mean   :  45634                                                    
##  3rd Qu.:  61258                                                    
##  Max.   :1183946                                                    
##  NA's   :2                                                          
##  sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util
##  Mode:logical           Mode:logical     Mode:logical     Mode:logical      
##  NA's:133889            NA's:133889      NA's:133889      NA's:133889       
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  sec_app_open_act_il sec_app_num_rev_accts sec_app_chargeoff_within_12_mths
##  Mode:logical        Mode:logical          Mode:logical                    
##  NA's:133889         NA's:133889           NA's:133889                     
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##  sec_app_collections_12_mths_ex_med sec_app_mths_since_last_major_derog
##  Mode:logical                       Mode:logical                       
##  NA's:133889                        NA's:133889                        
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##  hardship_flag      hardship_type      hardship_reason    hardship_status   
##  Length:133889      Length:133889      Length:133889      Length:133889     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  deferral_term    hardship_amount  hardship_start_date hardship_end_date 
##  Min.   :3        Min.   :  7.64   Length:133889       Length:133889     
##  1st Qu.:3        1st Qu.: 65.27   Class :character    Class :character  
##  Median :3        Median :121.74   Mode  :character    Mode  :character  
##  Mean   :3        Mean   :152.71                                         
##  3rd Qu.:3        3rd Qu.:211.15                                         
##  Max.   :3        Max.   :650.09                                         
##  NA's   :132968   NA's   :132968                                         
##  payment_plan_start_date hardship_length   hardship_dpd    hardship_loan_status
##  Length:133889           Min.   :3        Min.   : 0.00    Length:133889       
##  Class :character        1st Qu.:3        1st Qu.: 0.00    Class :character    
##  Mode  :character        Median :3        Median :13.00    Mode  :character    
##                          Mean   :3        Mean   :12.93                        
##                          3rd Qu.:3        3rd Qu.:22.00                        
##                          Max.   :3        Max.   :32.00                        
##                          NA's   :132968   NA's   :132968                       
##  orig_projected_additional_accrued_interest hardship_payoff_balance_amount
##  Min.   :  22.92                            Min.   :  628                 
##  1st Qu.: 183.38                            1st Qu.: 6499                 
##  Median : 347.55                            Median :10764                 
##  Mean   : 442.08                            Mean   :11798                 
##  3rd Qu.: 602.55                            3rd Qu.:16006                 
##  Max.   :1950.27                            Max.   :32348                 
##  NA's   :133159                             NA's   :132968                
##  hardship_last_payment_amount disbursement_method debt_settlement_flag
##  Min.   :   0.01              Length:133889       Length:133889       
##  1st Qu.:  35.50              Class :character    Class :character    
##  Median : 120.15              Mode  :character    Mode  :character    
##  Mean   : 176.68                                                      
##  3rd Qu.: 263.63                                                      
##  Max.   :1407.86                                                      
##  NA's   :132968                                                       
##  debt_settlement_flag_date settlement_status  settlement_date   
##  Length:133889             Length:133889      Length:133889     
##  Class :character          Class :character   Class :character  
##  Mode  :character          Mode  :character   Mode  :character  
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  settlement_amount settlement_percentage settlement_term 
##  Min.   :  329.2   Min.   :20.00         Min.   : 0.00   
##  1st Qu.: 3127.3   1st Qu.:45.00         1st Qu.:11.00   
##  Median : 5464.5   Median :50.00         Median :16.00   
##  Mean   : 6250.6   Mean   :49.43         Mean   :14.64   
##  3rd Qu.: 8346.8   3rd Qu.:50.01         3rd Qu.:18.00   
##  Max.   :23895.9   Max.   :90.00         Max.   :65.00   
##  NA's   :132209    NA's   :132209        NA's   :132209
# Looking at the unique values in our dependent variable i.e. loan_status
unique(loandata$loan_status)
## [1] "Fully Paid"         "Current"            "Late (31-120 days)"
## [4] "Charged Off"        "Late (16-30 days)"  "In Grace Period"   
## [7] "Default"            ""
# Removing the values we do not need for loanstatuses Current, In Grace Period, Late (16-30 days), and empty ''
loandata <- subset(loandata, loan_status != 'In Grace Period')
loandata <- subset(loandata, loan_status != 'Late (16-30 days)')
loandata <- subset(loandata, loan_status != 'Current')
loandata <- subset(loandata, loan_status != '')


# Install and load dplyr package
install.packages("dplyr", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Looking at how many loans are there for each status using group_by function
loandata %>% group_by(loan_status) %>% summarise(count=n())
## # A tibble: 4 × 2
##   loan_status        count
##   <chr>              <int>
## 1 Charged Off        15698
## 2 Default                9
## 3 Fully Paid         41833
## 4 Late (31-120 days)  2329
# Install and load stringr package
install.packages("stringr", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(stringr)

# Combining Charged off, Default, and Late(31-120 days) into single category: Default
# By using the str_detect function from stringr
# the final result would be two loan statuses : Paid and Default
loandata$loan_status = ifelse(str_detect(loandata$loan_status, "Paid"), loandata$loan_status, "Default")

# Making sure our loan statuses data is correct
loandata %>% group_by(loan_status) %>% summarise(count=n())
## # A tibble: 2 × 2
##   loan_status count
##   <chr>       <int>
## 1 Default     18036
## 2 Fully Paid  41833
# Plot loan status
install.packages("ggplot2", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(ggplot2)

gplot <- ggplot(loandata, aes(x=loan_status, fill=loan_status))
gplot + geom_bar()

# Default rate for each type of Loan grade i.e. which are A to G in our loandata
# Use of dplyr package to find the number of defaults for each grade
grade1 = loandata %>% filter(loan_status == "Default") %>% group_by(grade) %>% summarise(default_count = n())
grade1
## # A tibble: 7 × 2
##   grade default_count
##   <chr>         <int>
## 1 A               975
## 2 B              3487
## 3 C              5534
## 4 D              3697
## 5 E              2704
## 6 F              1260
## 7 G               379
# Calculating the default rate in each grade
grade2 = loandata %>% group_by(grade) %>% summarise(count = n())

grade3 <- grade2 %>% left_join(grade1) %>% 
  mutate(default_rate = default_count/count*100) %>% 
  select(grade, count, default_count, default_rate)
## Joining with `by = join_by(grade)`
grade3
## # A tibble: 7 × 4
##   grade count default_count default_rate
##   <chr> <int>         <int>        <dbl>
## 1 A      9956           975         9.79
## 2 B     16649          3487        20.9 
## 3 C     16815          5534        32.9 
## 4 D      8480          3697        43.6 
## 5 E      5261          2704        51.4 
## 6 F      2109          1260        59.7 
## 7 G       599           379        63.3
# Plot the default rate
ggplot(grade3, aes(x=grade, y=default_rate, fill=grade)) + geom_bar(stat = "identity")

# Loan Grade vs. Interest Rate
# Firstly, converting interest rate attribute to numeric
loandata$int_rate = (as.numeric(gsub(pattern = "%", replacement = "", x = loandata$int_rate)))

# Grouping the data by grade and their mean interest rates
x1 = loandata %>% filter(loan_status == "Default") %>% group_by(grade) %>% summarise(int_rate = mean(int_rate)) 
x1
## # A tibble: 7 × 2
##   grade int_rate
##   <chr>    <dbl>
## 1 A         7.06
## 2 B        10.2 
## 3 C        13.6 
## 4 D        17.7 
## 5 E        20.8 
## 6 F        24.4 
## 7 G        28.0
# Plot the data
ggplot(x1, aes(x=grade, y=int_rate, fill=grade)) + geom_bar(stat = "identity", position = "dodge")

# Training and Test datasets
# Using 30% of data for testing and 70% for training

# Sample Indexes
indexes = sample(1:nrow(loandata), size = 0.3*nrow(loandata))

# Split data
data_test = loandata[indexes,]
data_train = loandata[-indexes,]

dim(data_test)
## [1] 17960   145
dim(data_train)
## [1] 41909   145
colnames(data_train)
##   [1] "id"                                        
##   [2] "member_id"                                 
##   [3] "loan_amnt"                                 
##   [4] "funded_amnt"                               
##   [5] "funded_amnt_inv"                           
##   [6] "term"                                      
##   [7] "int_rate"                                  
##   [8] "installment"                               
##   [9] "grade"                                     
##  [10] "sub_grade"                                 
##  [11] "emp_title"                                 
##  [12] "emp_length"                                
##  [13] "home_ownership"                            
##  [14] "annual_inc"                                
##  [15] "verification_status"                       
##  [16] "issue_d"                                   
##  [17] "loan_status"                               
##  [18] "pymnt_plan"                                
##  [19] "url"                                       
##  [20] "desc"                                      
##  [21] "purpose"                                   
##  [22] "title"                                     
##  [23] "zip_code"                                  
##  [24] "addr_state"                                
##  [25] "dti"                                       
##  [26] "delinq_2yrs"                               
##  [27] "earliest_cr_line"                          
##  [28] "inq_last_6mths"                            
##  [29] "mths_since_last_delinq"                    
##  [30] "mths_since_last_record"                    
##  [31] "open_acc"                                  
##  [32] "pub_rec"                                   
##  [33] "revol_bal"                                 
##  [34] "revol_util"                                
##  [35] "total_acc"                                 
##  [36] "initial_list_status"                       
##  [37] "out_prncp"                                 
##  [38] "out_prncp_inv"                             
##  [39] "total_pymnt"                               
##  [40] "total_pymnt_inv"                           
##  [41] "total_rec_prncp"                           
##  [42] "total_rec_int"                             
##  [43] "total_rec_late_fee"                        
##  [44] "recoveries"                                
##  [45] "collection_recovery_fee"                   
##  [46] "last_pymnt_d"                              
##  [47] "last_pymnt_amnt"                           
##  [48] "next_pymnt_d"                              
##  [49] "last_credit_pull_d"                        
##  [50] "collections_12_mths_ex_med"                
##  [51] "mths_since_last_major_derog"               
##  [52] "policy_code"                               
##  [53] "application_type"                          
##  [54] "annual_inc_joint"                          
##  [55] "dti_joint"                                 
##  [56] "verification_status_joint"                 
##  [57] "acc_now_delinq"                            
##  [58] "tot_coll_amt"                              
##  [59] "tot_cur_bal"                               
##  [60] "open_acc_6m"                               
##  [61] "open_act_il"                               
##  [62] "open_il_12m"                               
##  [63] "open_il_24m"                               
##  [64] "mths_since_rcnt_il"                        
##  [65] "total_bal_il"                              
##  [66] "il_util"                                   
##  [67] "open_rv_12m"                               
##  [68] "open_rv_24m"                               
##  [69] "max_bal_bc"                                
##  [70] "all_util"                                  
##  [71] "total_rev_hi_lim"                          
##  [72] "inq_fi"                                    
##  [73] "total_cu_tl"                               
##  [74] "inq_last_12m"                              
##  [75] "acc_open_past_24mths"                      
##  [76] "avg_cur_bal"                               
##  [77] "bc_open_to_buy"                            
##  [78] "bc_util"                                   
##  [79] "chargeoff_within_12_mths"                  
##  [80] "delinq_amnt"                               
##  [81] "mo_sin_old_il_acct"                        
##  [82] "mo_sin_old_rev_tl_op"                      
##  [83] "mo_sin_rcnt_rev_tl_op"                     
##  [84] "mo_sin_rcnt_tl"                            
##  [85] "mort_acc"                                  
##  [86] "mths_since_recent_bc"                      
##  [87] "mths_since_recent_bc_dlq"                  
##  [88] "mths_since_recent_inq"                     
##  [89] "mths_since_recent_revol_delinq"            
##  [90] "num_accts_ever_120_pd"                     
##  [91] "num_actv_bc_tl"                            
##  [92] "num_actv_rev_tl"                           
##  [93] "num_bc_sats"                               
##  [94] "num_bc_tl"                                 
##  [95] "num_il_tl"                                 
##  [96] "num_op_rev_tl"                             
##  [97] "num_rev_accts"                             
##  [98] "num_rev_tl_bal_gt_0"                       
##  [99] "num_sats"                                  
## [100] "num_tl_120dpd_2m"                          
## [101] "num_tl_30dpd"                              
## [102] "num_tl_90g_dpd_24m"                        
## [103] "num_tl_op_past_12m"                        
## [104] "pct_tl_nvr_dlq"                            
## [105] "percent_bc_gt_75"                          
## [106] "pub_rec_bankruptcies"                      
## [107] "tax_liens"                                 
## [108] "tot_hi_cred_lim"                           
## [109] "total_bal_ex_mort"                         
## [110] "total_bc_limit"                            
## [111] "total_il_high_credit_limit"                
## [112] "revol_bal_joint"                           
## [113] "sec_app_earliest_cr_line"                  
## [114] "sec_app_inq_last_6mths"                    
## [115] "sec_app_mort_acc"                          
## [116] "sec_app_open_acc"                          
## [117] "sec_app_revol_util"                        
## [118] "sec_app_open_act_il"                       
## [119] "sec_app_num_rev_accts"                     
## [120] "sec_app_chargeoff_within_12_mths"          
## [121] "sec_app_collections_12_mths_ex_med"        
## [122] "sec_app_mths_since_last_major_derog"       
## [123] "hardship_flag"                             
## [124] "hardship_type"                             
## [125] "hardship_reason"                           
## [126] "hardship_status"                           
## [127] "deferral_term"                             
## [128] "hardship_amount"                           
## [129] "hardship_start_date"                       
## [130] "hardship_end_date"                         
## [131] "payment_plan_start_date"                   
## [132] "hardship_length"                           
## [133] "hardship_dpd"                              
## [134] "hardship_loan_status"                      
## [135] "orig_projected_additional_accrued_interest"
## [136] "hardship_payoff_balance_amount"            
## [137] "hardship_last_payment_amount"              
## [138] "disbursement_method"                       
## [139] "debt_settlement_flag"                      
## [140] "debt_settlement_flag_date"                 
## [141] "settlement_status"                         
## [142] "settlement_date"                           
## [143] "settlement_amount"                         
## [144] "settlement_percentage"                     
## [145] "settlement_term"
# Discard attributes that are not needed
discard_column = c("collection_recovery_fee", "emp_title",
                   "funded_amnt_inv", "id", "installment",
                   "last_credit_pull_d", "last_pymnt_d",
                   "last_pymnt_amnt", "loan_amnt",
                   "member_id", "next_pymnt_d",
                   "num_tl_120dpd_2m", "num_tl_30dpd",
                   "out_prncp", "out_prncp_inv",
                   "recoveries", "total_pymnt",
                   "total_pymnt_inv", "total_rec_int",
                   "total_rec_late_fee", "total_rec_prncp",
                   "url", "zip_code"
)
data_train = (data_train[,!(names(data_train) %in% discard_column)])

dim(data_train)
## [1] 41909   122
# Discarding grade attribute as well because sub_grade attribute is present
data_train$grade = NULL

# Discard columns with too many Na's - where percentage is greater than 50%
data_train <- data_train[, -which(colMeans(is.na(data_train)) > 0.5)]

# Discarding further attributes that are not needed
discard_column = c("hardship_flag","hardship_type","hardship_reason",
                   "hardship_status","hardship_start_date","hardship_end_date",
                   "payment_plan_start_date","hardship_loan_status","disbursement_method",
                   "debt_settlement_flag","debt_settlement_flag_date","settlement_status",
                   "settlement_date"
)
data_train = (data_train[,!(names(data_train) %in% discard_column)])

# Converting revol_util attribute to numeric datatype
data_train$revol_util = (as.numeric(gsub(pattern = "%", replacement = "", x = data_train$revol_util)))

install.packages("lubridate", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Transforming earliest_cr_line attribute to the no. of days before the loan is issued
data_train$earliest_cr_line = parse_date_time(str_c("01", data_train$issue_d), "dmy") - parse_date_time(str_c("01", data_train$earliest_cr_line), "dmy")
data_train$earliest_cr_line = as.numeric(data_train$earliest_cr_line, units="days")

# Analyzing default rate by issued month
data_train$issue_m = sapply(data_train$issue_d, function(x){str_split(x,"-")[[1]][1]})

tmp = data_train %>% filter(loan_status=="Default") %>% group_by(issue_m) %>% summarise(default_count = n())

tmp1 = data_train %>% group_by(issue_m) %>% summarise(count=n())

tmp1 %>% left_join(tmp) %>% mutate(default_rate = default_count/count)
## Joining with `by = join_by(issue_m)`
## # A tibble: 3 × 4
##   issue_m count default_count default_rate
##   <chr>   <int>         <int>        <dbl>
## 1 Feb     12375          3746        0.303
## 2 Jan     10776          3065        0.284
## 3 Mar     18758          5783        0.308
str(data_train$issue_m)
##  chr [1:41909] "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" "Mar" ...
# As seen the default rate does not vary much by the month it is issued, so we will proceed dropping the issue_d and issue_m attributes
data_train$issue_m = NULL
data_train$issue_d = NULL

# Removing tmp and tmp1
rm(tmp,tmp1)


# Attributes with Zero Variance
# In our dataset, we will look for predictors with zero variance and will proceed removing them
# Let's define some generic functions that we will use later

# Returns the Numeric columns only from a dataset
getNumericColumns <- function(t) {
  tn = sapply(t, function(x){is.numeric(x)})
  return(names(tn)[which(tn)])
}

# Returns the character columns only from a dataset
getCharColumns <- function(t) {
  tn = sapply(t, function(x){is.character(x)})
  return(names(tn)[which(tn)])
}

# Returns the factor columns only in a datset
getFactorColumns <- function(t) {
  tn = sapply(t, function(x){is.factor(x)})
  return(names(tn)[which(tn)])
}

# Returns index of columns along with column names
getIndexofColumns <- function(t, column_names) {
  return(match(column_names, colnames(t)))
}

# Find character columns having same value and numeric columns having zero variance
tmp = apply(data_train[getCharColumns(data_train)],2,function(x){length(unique(x))})

tmp = tmp[tmp==1]

tmp2 = apply(data_train[getNumericColumns(data_train)],2,function(x){(sd(x))})

tmp2 = tmp2[tmp2==0]

discard_column = c(names(tmp), names(tmp2))
discard_column
##  [1] NA            NA            NA            NA            "policy_code"
##  [6] NA            NA            NA            NA            NA           
## [11] NA            NA            NA            NA            NA           
## [16] NA            NA            NA            NA            NA           
## [21] NA            NA            NA            NA            NA
# There is only one predictor that meets this criteria
# Dropping the zero-variance feature we have found
data_train = data_train[,!names(data_train) %in% discard_column]

# Dropping further attributes that are not needed
# Let's look at attributes 'title' and 'purpose'
table(data_train$title)
## 
##                                        Business           Car financing 
##                    3348                     404                     390 
## Credit card refinancing      Debt consolidation              Green loan 
##                    8246                   22704                      21 
##             Home buying        Home improvement          Major purchase 
##                     207                    2604                     862 
##        Medical expenses   Moving and relocation                   Other 
##                     429                     253                    2207 
##                Vacation 
##                     234
table(data_train$purpose)
## 
##                car        credit_card debt_consolidation   home_improvement 
##                410               9089              24729               2782 
##              house     major_purchase            medical             moving 
##                224                919                460                270 
##              other   renewable_energy     small_business           vacation 
##               2309                 25                438                254
# Since both attributes have same information, we will proceed dropping title
data_train$title = NULL

# Looking at 'desc' column
str(data_train$desc)
##  chr [1:41909] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ...
# Let's drop desc column as well since it contains mostly empty values
data_train$desc = NULL

# Default by States
# Filtering out the states that have too small no. of loans(less than 1000)
tmp = data_train %>% filter(loan_status=="Default") %>% group_by(addr_state) %>% summarise(default_count = n())

tmp2 = data_train %>% group_by(addr_state) %>% summarise(count = n())

tmp3 = tmp2 %>% left_join(tmp) %>% mutate(default_rate = default_count/count)
## Joining with `by = join_by(addr_state)`
tmp3
## # A tibble: 50 × 4
##    addr_state count default_count default_rate
##    <chr>      <int>         <int>        <dbl>
##  1 AK           114            36        0.316
##  2 AL           543           198        0.365
##  3 AR           302           121        0.401
##  4 AZ          1090           305        0.280
##  5 CA          6166          1846        0.299
##  6 CO          1005           223        0.222
##  7 CT           550           145        0.264
##  8 DC            95            18        0.189
##  9 DE           108            28        0.259
## 10 FL          3025           954        0.315
## # ℹ 40 more rows
# Order States by Default Rate to identify highest and lowest default rates
# order by highest default rate

high_default = (tmp3 %>% filter(count>1000) %>% arrange(desc(default_rate))) [1:10, "addr_state"]$addr_state
high_default
##  [1] "NJ" "NY" "PA" "FL" "OH" "TX" "NC" "IL" "CA" "VA"
# order by lowest default rate

low_default = (tmp3 %>% filter(count>1000) %>% arrange((default_rate))) [1:10, "addr_state"]$addr_state
low_default
##  [1] "CO" "GA" "AZ" "MI" "VA" "CA" "IL" "NC" "TX" "OH"
# Creating Binary variables for 5 highest states and 5 lowest states, discarding the rest
data_train$is_nj = ifelse(data_train$addr_state=='NJ',1,0)
data_train$is_ny = ifelse(data_train$addr_state=='NY',1,0)
data_train$is_pa = ifelse(data_train$addr_state=='PA',1,0)
data_train$is_fl = ifelse(data_train$addr_state=='FL',1,0)
data_train$is_oh = ifelse(data_train$addr_state=='OH',1,0)

data_train$is_co = ifelse(data_train$addr_state=='CO',1,0)
data_train$is_ga = ifelse(data_train$addr_state=='GA',1,0)
data_train$is_az = ifelse(data_train$addr_state=='AZ',1,0)
data_train$is_mi = ifelse(data_train$addr_state=='MI',1,0)
data_train$is_va = ifelse(data_train$addr_state=='VA',1,0)


# Let's proceed removing addr_state attribute and other temporary objects
data_train$addr_state = NULL
rm(tmp, tmp2, tmp3, high_default, low_default)


# Remove Correlated features

# Investigating if there are any correlation among features
install.packages("corrplot", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(data_train[getNumericColumns(data_train)], use="na.or.complete"))

# Numeric features
# Looking at all the numeric features
str(data_train[getNumericColumns(data_train)])
## 'data.frame':    41909 obs. of  75 variables:
##  $ funded_amnt               : int  35000 14400 18000 7250 10000 10000 5200 8400 10000 6950 ...
##  $ int_rate                  : num  20.75 10.75 11.99 13.67 5.32 ...
##  $ annual_inc                : num  85000 85000 100000 72000 45000 43000 36000 66000 60000 50000 ...
##  $ dti                       : num  24.98 28.11 20.41 23.93 8.03 ...
##  $ delinq_2yrs               : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ earliest_cr_line          : num  2647 5873 5083 11382 9436 ...
##  $ inq_last_6mths            : int  1 0 4 0 0 1 0 1 0 0 ...
##  $ mths_since_last_delinq    : int  NA 72 18 20 NA NA NA 60 35 55 ...
##  $ open_acc                  : int  9 16 21 12 7 9 11 7 18 5 ...
##  $ pub_rec                   : int  0 0 0 1 1 0 0 0 0 4 ...
##  $ revol_bal                 : int  10167 37582 29047 12220 471 10139 4970 11059 16291 7096 ...
##  $ revol_util                : num  46.6 85.8 49 65 7.4 30.7 16.7 78.4 75.4 59.1 ...
##  $ total_acc                 : int  19 33 40 24 15 14 12 22 25 26 ...
##  $ collections_12_mths_ex_med: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ acc_now_delinq            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tot_coll_amt              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tot_cur_bal               : int  161643 190166 299784 366483 11970 42767 13483 157051 77604 7096 ...
##  $ open_acc_6m               : int  0 0 3 1 0 1 1 1 0 0 ...
##  $ open_act_il               : int  3 3 1 2 1 2 1 1 8 0 ...
##  $ open_il_12m               : int  0 1 1 1 0 1 1 1 1 0 ...
##  $ open_il_24m               : int  3 1 1 3 1 1 1 1 5 0 ...
##  $ mths_since_rcnt_il        : int  14 12 4 3 23 6 1 10 7 91 ...
##  $ total_bal_il              : int  73863 22387 29197 40343 11499 32628 8513 19231 61313 0 ...
##  $ il_util                   : int  83 47 92 92 72 73 100 91 94 NA ...
##  $ open_rv_12m               : int  0 0 4 1 1 1 0 2 0 0 ...
##  $ open_rv_24m               : int  1 0 8 1 2 1 3 2 1 1 ...
##  $ max_bal_bc                : int  5109 12211 7172 3694 325 5260 2060 3019 7376 5255 ...
##  $ all_util                  : int  71 66 49 84 54 55 35 86 89 59 ...
##  $ total_rev_hi_lim          : int  21800 43800 58950 18800 6400 33000 29700 14100 21600 12000 ...
##  $ inq_fi                    : int  5 0 0 0 2 2 0 1 3 0 ...
##  $ total_cu_tl               : int  1 0 1 0 2 0 0 1 3 0 ...
##  $ inq_last_12m              : int  2 0 5 1 0 3 0 3 6 1 ...
##  $ acc_open_past_24mths      : int  4 1 9 4 3 2 4 4 6 1 ...
##  $ avg_cur_bal               : int  17960 11885 16654 30540 1710 4752 1226 22436 4565 1419 ...
##  $ bc_open_to_buy            : int  4833 3393 5072 997 4329 9861 22954 585 1709 2934 ...
##  $ bc_util                   : num  67.8 87.4 59 90.1 9.8 50.7 17.4 90.6 90.5 69.8 ...
##  $ chargeoff_within_12_mths  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ delinq_amnt               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mo_sin_old_il_acct        : int  87 145 102 132 135 65 1 129 104 153 ...
##  $ mo_sin_old_rev_tl_op      : int  65 193 167 374 310 65 159 129 107 205 ...
##  $ mo_sin_rcnt_rev_tl_op     : int  22 26 0 8 10 10 17 2 22 17 ...
##  $ mo_sin_rcnt_tl            : int  14 12 0 3 10 6 1 2 7 17 ...
##  $ mort_acc                  : int  1 6 1 4 2 0 0 2 0 8 ...
##  $ mths_since_recent_bc      : int  22 32 9 59 10 10 17 89 22 31 ...
##  $ mths_since_recent_inq     : int  5 20 0 9 23 6 NA 2 7 11 ...
##  $ num_accts_ever_120_pd     : int  0 0 0 3 0 0 0 2 0 1 ...
##  $ num_actv_bc_tl            : int  3 5 9 3 2 4 7 2 6 3 ...
##  $ num_actv_rev_tl           : int  3 11 16 5 2 4 8 4 6 4 ...
##  $ num_bc_sats               : int  3 5 16 3 4 4 8 2 6 3 ...
##  $ num_bc_tl                 : int  4 9 16 4 5 4 9 7 7 7 ...
##  $ num_il_tl                 : int  9 7 6 8 4 4 1 2 13 3 ...
##  $ num_op_rev_tl             : int  5 11 16 9 6 7 10 5 10 5 ...
##  $ num_rev_accts             : int  9 20 33 12 9 10 11 12 12 15 ...
##  $ num_rev_tl_bal_gt_0       : int  3 11 11 5 2 4 8 4 6 4 ...
##  $ num_sats                  : int  9 16 18 12 7 9 11 7 18 5 ...
##  $ num_tl_90g_dpd_24m        : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ num_tl_op_past_12m        : int  0 1 5 2 1 2 1 3 1 0 ...
##  $ pct_tl_nvr_dlq            : num  100 93.9 98 83.3 100 100 100 86.4 96 91.3 ...
##  $ percent_bc_gt_75          : num  33.3 100 22.2 100 0 0 25 100 83.3 33.3 ...
##  $ pub_rec_bankruptcies      : int  0 0 0 0 1 0 0 0 0 3 ...
##  $ tax_liens                 : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ tot_hi_cred_lim           : int  193751 232277 350840 374306 22363 77555 38200 166260 86947 12000 ...
##  $ total_bal_ex_mort         : int  84030 59969 58244 52563 11970 42767 13483 30290 77604 7096 ...
##  $ total_bc_limit            : int  15000 27000 45650 10100 4800 20000 27800 6200 18000 9700 ...
##  $ total_il_high_credit_limit: int  87951 47352 31590 43666 15963 44555 8500 21079 65347 0 ...
##  $ is_nj                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_ny                     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ is_pa                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_fl                     : num  0 0 0 0 1 0 0 0 0 1 ...
##  $ is_oh                     : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ is_co                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_ga                     : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ is_az                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_mi                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_va                     : num  0 0 0 0 0 0 0 0 0 0 ...
# Transforming annual_inc, revol_bal, avg_cur_bal, and bc_open_to_buy by dividing them by funded_amnt (amount of loan issued or funded)

data_train$annual_inc = data_train$annual_inc/data_train$funded_amnt

data_train$revol_bal = data_train$revol_bal/data_train$funded_amnt

data_train$avg_cur_bal = data_train$avg_cur_bal/data_train$funded_amnt

data_train$bc_open_to_buy = data_train$bc_open_to_buy/data_train$funded_amnt

# Removing funded_amnt attribute
data_train$funded_amnt = NULL


# Character features
# Looking at all the character features
str(data_train[getCharColumns(data_train)])
## 'data.frame':    41909 obs. of  11 variables:
##  $ term                     : chr  " 60 months" " 36 months" " 60 months" " 36 months" ...
##  $ sub_grade                : chr  "E2" "B4" "C1" "C3" ...
##  $ emp_length               : chr  "3 years" "10+ years" "6 years" "3 years" ...
##  $ home_ownership           : chr  "MORTGAGE" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
##  $ verification_status      : chr  "Source Verified" "Not Verified" "Verified" "Source Verified" ...
##  $ loan_status              : chr  "Fully Paid" "Fully Paid" "Fully Paid" "Fully Paid" ...
##  $ pymnt_plan               : chr  "n" "n" "n" "n" ...
##  $ purpose                  : chr  "debt_consolidation" "small_business" "debt_consolidation" "debt_consolidation" ...
##  $ initial_list_status      : chr  "w" "w" "w" "w" ...
##  $ application_type         : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ verification_status_joint: chr  "" "" "" "" ...
# Removing verification_status_joint attribute as it only contains nulls
data_train$verification_status_joint = NULL

# Looking at the home_ownership attribute
table(data_train$home_ownership)
## 
## MORTGAGE      OWN     RENT 
##    20984     5127    15798
# Let's now look at all loans or observation with pymnt_plan = "y", that all ended in being default status

data_train %>% filter(pymnt_plan=="y") %>% select(pymnt_plan, loan_status)
##    pymnt_plan loan_status
## 1           y     Default
## 2           y     Default
## 3           y     Default
## 4           y     Default
## 5           y     Default
## 6           y     Default
## 7           y     Default
## 8           y     Default
## 9           y     Default
## 10          y     Default
## 11          y     Default
## 12          y     Default
## 13          y     Default
## 14          y     Default
## 15          y     Default
## 16          y     Default
## 17          y     Default
## 18          y     Default
## 19          y     Default
## 20          y     Default
## 21          y     Default
## 22          y     Default
## 23          y     Default
## 24          y     Default
## 25          y     Default
## 26          y     Default
## 27          y     Default
## 28          y     Default
## 29          y     Default
## 30          y     Default
## 31          y     Default
## 32          y     Default
## 33          y     Default
## 34          y     Default
## 35          y     Default
## 36          y     Default
## 37          y     Default
## 38          y     Default
## 39          y     Default
## 40          y     Default
## 41          y     Default
## 42          y     Default
## 43          y     Default
## 44          y     Default
## 45          y     Default
## 46          y     Default
## 47          y     Default
## 48          y     Default
## 49          y     Default
## 50          y     Default
## 51          y     Default
## 52          y     Default
## 53          y     Default
## 54          y     Default
## 55          y     Default
## 56          y     Default
## 57          y     Default
## 58          y     Default
## 59          y     Default
## 60          y     Default
## 61          y     Default
## 62          y     Default
## 63          y     Default
## 64          y     Default
## we can see there are total of 64 observations that are defaulted but had payment plan as yes

# In our training data with 41909 observation, 64 is a miniscule number of records
# For our analysis purpose, we will remove the pymnt_plan attribute

data_train$pymnt_plan = NULL


# Logistic regression
# Using the preProcess function from caret package to center and scale (Normalize) the data

## we can try to remove the number of dimensions further by fitting the logistic regression
## and investigate p-value of the coefficients. Our Null hypothesis is that each feature makes
## no contribution to the predictive model (i.e. its coefficient is zero).
## we will then discard each feature that fails to reject the hypothesis.


install.packages("caret", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(caret)
## Loading required package: lattice
install.packages("rms", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmpYRTnau/downloaded_packages
library(rms)
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
trans.model = preProcess(data_train, method = c("center", "scale"))

data_train = predict(trans.model, data_train)

model = lrm(loan_status ~ ., data_train)
model
## Frequencies of Missing Values Due to Each Variable
##                loan_status                       term 
##                          0                          0 
##                   int_rate                  sub_grade 
##                          0                          0 
##                 emp_length             home_ownership 
##                          0                          0 
##                 annual_inc        verification_status 
##                          0                          0 
##                    purpose                        dti 
##                          0                          4 
##                delinq_2yrs           earliest_cr_line 
##                          0                          0 
##             inq_last_6mths     mths_since_last_delinq 
##                          1                      20133 
##                   open_acc                    pub_rec 
##                          0                          0 
##                  revol_bal                 revol_util 
##                          0                         28 
##                  total_acc        initial_list_status 
##                          0                          0 
## collections_12_mths_ex_med           application_type 
##                          0                          0 
##             acc_now_delinq               tot_coll_amt 
##                          0                          0 
##                tot_cur_bal                open_acc_6m 
##                          0                         20 
##                open_act_il                open_il_12m 
##                         19                         19 
##                open_il_24m         mths_since_rcnt_il 
##                         19                        948 
##               total_bal_il                    il_util 
##                         19                       5112 
##                open_rv_12m                open_rv_24m 
##                         19                         19 
##                 max_bal_bc                   all_util 
##                         19                         22 
##           total_rev_hi_lim                     inq_fi 
##                          0                         19 
##                total_cu_tl               inq_last_12m 
##                         20                         20 
##       acc_open_past_24mths                avg_cur_bal 
##                          0                          0 
##             bc_open_to_buy                    bc_util 
##                        392                        423 
##   chargeoff_within_12_mths                delinq_amnt 
##                          0                          0 
##         mo_sin_old_il_acct       mo_sin_old_rev_tl_op 
##                        931                          0 
##      mo_sin_rcnt_rev_tl_op             mo_sin_rcnt_tl 
##                          0                          0 
##                   mort_acc       mths_since_recent_bc 
##                          0                        366 
##      mths_since_recent_inq      num_accts_ever_120_pd 
##                       3339                          0 
##             num_actv_bc_tl            num_actv_rev_tl 
##                          0                          0 
##                num_bc_sats                  num_bc_tl 
##                          0                          0 
##                  num_il_tl              num_op_rev_tl 
##                          0                          0 
##              num_rev_accts        num_rev_tl_bal_gt_0 
##                          0                          0 
##                   num_sats         num_tl_90g_dpd_24m 
##                          0                          0 
##         num_tl_op_past_12m             pct_tl_nvr_dlq 
##                          0                          0 
##           percent_bc_gt_75       pub_rec_bankruptcies 
##                        399                          0 
##                  tax_liens            tot_hi_cred_lim 
##                          0                          0 
##          total_bal_ex_mort             total_bc_limit 
##                          0                          0 
## total_il_high_credit_limit                      is_nj 
##                          0                          0 
##                      is_ny                      is_pa 
##                          0                          0 
##                      is_fl                      is_oh 
##                          0                          0 
##                      is_co                      is_ga 
##                          0                          0 
##                      is_az                      is_mi 
##                          0                          0 
##                      is_va 
##                          0 
## 
## Logistic Regression Model
## 
## lrm(formula = loan_status ~ ., data = data_train)
## 
## 
##                        Model Likelihood        Discrimination    Rank Discrim.    
##                              Ratio Test               Indexes          Indexes    
## Obs         18046    LR chi2    2593.14        R2       0.187    C       0.725    
##  Default     5779    d.f.           137    R2(137,18046)0.127    Dxy     0.450    
##  Fully Paid 12267    Pr(> chi2) <0.0001    R2(137,11785)0.188    gamma   0.450    
## max |deriv| 1e-06                              Brier    0.188    tau-a   0.196    
## 
##                                     Coef    S.E.   Wald Z Pr(>|Z|)
## Intercept                            3.1104 0.5021  6.20  <0.0001 
## term= 60 months                     -0.3924 0.0478 -8.21  <0.0001 
## int_rate                             0.2680 0.2606  1.03  0.3038  
## sub_grade=A2                        -0.4924 0.2583 -1.91  0.0566  
## sub_grade=A3                        -0.3362 0.2698 -1.25  0.2128  
## sub_grade=A4                        -0.6098 0.2568 -2.37  0.0176  
## sub_grade=A5                        -0.8196 0.2529 -3.24  0.0012  
## sub_grade=B1                        -1.0137 0.2586 -3.92  <0.0001 
## sub_grade=B2                        -0.9146 0.2824 -3.24  0.0012  
## sub_grade=B3                        -1.0339 0.3034 -3.41  0.0007  
## sub_grade=B4                        -1.4030 0.3427 -4.09  <0.0001 
## sub_grade=B5                        -1.2283 0.3741 -3.28  0.0010  
## sub_grade=C1                        -1.4535 0.3966 -3.66  0.0002  
## sub_grade=C2                        -1.6519 0.4413 -3.74  0.0002  
## sub_grade=C3                        -1.6109 0.4722 -3.41  0.0006  
## sub_grade=C4                        -1.7167 0.5065 -3.39  0.0007  
## sub_grade=C5                        -1.9442 0.5473 -3.55  0.0004  
## sub_grade=D1                        -1.9237 0.5967 -3.22  0.0013  
## sub_grade=D2                        -2.1504 0.6434 -3.34  0.0008  
## sub_grade=D3                        -2.3580 0.6894 -3.42  0.0006  
## sub_grade=D4                        -2.3300 0.7242 -3.22  0.0013  
## sub_grade=D5                        -2.5266 0.7516 -3.36  0.0008  
## sub_grade=E1                        -2.4304 0.7765 -3.13  0.0017  
## sub_grade=E2                        -2.4651 0.8099 -3.04  0.0023  
## sub_grade=E3                        -2.4197 0.8330 -2.90  0.0037  
## sub_grade=E4                        -2.4510 0.8707 -2.81  0.0049  
## sub_grade=E5                        -2.6498 0.9040 -2.93  0.0034  
## sub_grade=F1                        -2.5280 0.9415 -2.69  0.0073  
## sub_grade=F2                        -2.7730 0.9927 -2.79  0.0052  
## sub_grade=F3                        -3.1869 1.0470 -3.04  0.0023  
## sub_grade=F4                        -3.3709 1.0946 -3.08  0.0021  
## sub_grade=F5                        -3.0529 1.1309 -2.70  0.0069  
## sub_grade=G1                        -2.9341 1.1875 -2.47  0.0135  
## sub_grade=G2                        -3.1018 1.2344 -2.51  0.0120  
## sub_grade=G3                        -2.8577 1.2551 -2.28  0.0228  
## sub_grade=G4                        -3.5188 1.2945 -2.72  0.0066  
## sub_grade=G5                        -3.2211 1.3048 -2.47  0.0136  
## emp_length=1 year                    0.1216 0.0917  1.33  0.1849  
## emp_length=10+ years                 0.1559 0.0708  2.20  0.0277  
## emp_length=2 years                   0.0215 0.0847  0.25  0.7999  
## emp_length=3 years                   0.0676 0.0871  0.78  0.4380  
## emp_length=4 years                   0.1874 0.0957  1.96  0.0503  
## emp_length=5 years                   0.0104 0.0940  0.11  0.9120  
## emp_length=6 years                   0.0538 0.1056  0.51  0.6103  
## emp_length=7 years                  -0.1075 0.1106 -0.97  0.3310  
## emp_length=8 years                   0.0033 0.1004  0.03  0.9738  
## emp_length=9 years                   0.0547 0.1052  0.52  0.6028  
## emp_length=n/a                      -0.4117 0.0972 -4.24  <0.0001 
## home_ownership=OWN                  -0.0958 0.0593 -1.61  0.1065  
## home_ownership=RENT                 -0.3107 0.0475 -6.54  <0.0001 
## annual_inc                           0.2064 0.0495  4.17  <0.0001 
## verification_status=Source Verified -0.0440 0.0442 -0.99  0.3202  
## verification_status=Verified         0.0066 0.0485  0.14  0.8911  
## purpose=credit_card                 -0.4770 0.2154 -2.21  0.0268  
## purpose=debt_consolidation          -0.4900 0.2124 -2.31  0.0210  
## purpose=home_improvement            -0.5901 0.2216 -2.66  0.0077  
## purpose=house                       -0.5003 0.3052 -1.64  0.1011  
## purpose=major_purchase              -0.8237 0.2409 -3.42  0.0006  
## purpose=medical                     -0.8457 0.2643 -3.20  0.0014  
## purpose=moving                      -0.7877 0.3002 -2.62  0.0087  
## purpose=other                       -0.5468 0.2236 -2.45  0.0145  
## purpose=renewable_energy            -1.9616 0.9284 -2.11  0.0346  
## purpose=small_business              -0.9322 0.2620 -3.56  0.0004  
## purpose=vacation                    -1.1051 0.3141 -3.52  0.0004  
## dti                                 -0.1354 0.0239 -5.67  <0.0001 
## delinq_2yrs                         -0.0500 0.0214 -2.33  0.0197  
## earliest_cr_line                    -0.2244 0.0462 -4.85  <0.0001 
## inq_last_6mths                      -0.0299 0.0215 -1.39  0.1651  
## mths_since_last_delinq               0.0889 0.0218  4.07  <0.0001 
## open_acc                             0.1100 0.3539  0.31  0.7559  
## pub_rec                              0.0473 0.0422  1.12  0.2631  
## revol_bal                            0.0677 0.0457  1.48  0.1383  
## revol_util                          -0.0579 0.0414 -1.40  0.1618  
## total_acc                            0.0562 0.3323  0.17  0.8658  
## initial_list_status=w                0.1027 0.0467  2.20  0.0279  
## collections_12_mths_ex_med          -0.0246 0.0145 -1.70  0.0890  
## application_type=Joint App           0.3663 0.1307  2.80  0.0051  
## acc_now_delinq                       0.0021 0.0135  0.15  0.8771  
## tot_coll_amt                         0.0289 0.0171  1.69  0.0911  
## tot_cur_bal                          0.0423 0.1363  0.31  0.7564  
## open_acc_6m                          0.0020 0.0263  0.07  0.9408  
## open_act_il                          0.0027 0.0813  0.03  0.9732  
## open_il_12m                          0.0345 0.0751  0.46  0.6458  
## open_il_24m                         -0.0610 0.0903 -0.68  0.4992  
## mths_since_rcnt_il                  -0.0534 0.0335 -1.60  0.1105  
## total_bal_il                         0.2258 0.1109  2.04  0.0417  
## il_util                              0.0148 0.0312  0.47  0.6363  
## open_rv_12m                          0.1682 0.1105  1.52  0.1280  
## open_rv_24m                         -0.1534 0.1369 -1.12  0.2625  
## max_bal_bc                           0.0305 0.0453  0.67  0.5011  
## all_util                            -0.1037 0.0417 -2.49  0.0128  
## total_rev_hi_lim                     0.0897 0.0612  1.47  0.1428  
## inq_fi                              -0.0610 0.0195 -3.13  0.0018  
## total_cu_tl                          0.0387 0.0184  2.10  0.0356  
## inq_last_12m                         0.0436 0.0236  1.85  0.0644  
## acc_open_past_24mths                 0.0524 0.1665  0.31  0.7530  
## avg_cur_bal                          0.0210 0.0362  0.58  0.5625  
## bc_open_to_buy                       0.0448 0.0464  0.97  0.3344  
## bc_util                              0.0841 0.0454  1.85  0.0641  
## chargeoff_within_12_mths             0.0020 0.0136  0.15  0.8825  
## delinq_amnt                          0.0276 0.0251  1.10  0.2713  
## mo_sin_old_il_acct                   0.0347 0.0217  1.60  0.1099  
## mo_sin_old_rev_tl_op                 0.1823 0.0453  4.02  <0.0001 
## mo_sin_rcnt_rev_tl_op               -0.0342 0.0296 -1.15  0.2484  
## mo_sin_rcnt_tl                       0.0950 0.0367  2.59  0.0096  
## mort_acc                             0.0925 0.0556  1.66  0.0962  
## mths_since_recent_bc                 0.0744 0.0246  3.02  0.0025  
## mths_since_recent_inq                0.0347 0.0230  1.51  0.1303  
## num_accts_ever_120_pd                0.0255 0.0180  1.42  0.1560  
## num_actv_bc_tl                      -0.0658 0.0494 -1.33  0.1825  
## num_actv_rev_tl                     -0.1144 0.0841 -1.36  0.1737  
## num_bc_sats                         -0.0754 0.0495 -1.52  0.1280  
## num_bc_tl                            0.0264 0.0481  0.55  0.5835  
## num_il_tl                           -0.0495 0.2063 -0.24  0.8104  
## num_op_rev_tl                       -0.0403 0.1387 -0.29  0.7716  
## num_rev_accts                        0.0500 0.2311  0.22  0.8286  
## num_rev_tl_bal_gt_0                 -0.0062 0.0829 -0.08  0.9401  
## num_sats                            -0.0729 0.3589 -0.20  0.8391  
## num_tl_90g_dpd_24m                   0.0321 0.0183  1.76  0.0791  
## num_tl_op_past_12m                  -0.1466 0.1373 -1.07  0.2857  
## pct_tl_nvr_dlq                       0.0261 0.0222  1.18  0.2386  
## percent_bc_gt_75                    -0.1117 0.0328 -3.41  0.0007  
## pub_rec_bankruptcies                -0.0439 0.0308 -1.42  0.1546  
## tax_liens                           -0.0364 0.0341 -1.07  0.2862  
## tot_hi_cred_lim                      0.0288 0.1396  0.21  0.8365  
## total_bal_ex_mort                   -0.2741 0.1009 -2.72  0.0066  
## total_bc_limit                       0.1166 0.0636  1.83  0.0666  
## total_il_high_credit_limit           0.0226 0.0780  0.29  0.7724  
## is_nj                               -0.0413 0.0165 -2.50  0.0123  
## is_ny                               -0.0498 0.0177 -2.81  0.0049  
## is_pa                               -0.0228 0.0168 -1.36  0.1751  
## is_fl                               -0.0004 0.0177 -0.02  0.9839  
## is_oh                               -0.0182 0.0172 -1.06  0.2879  
## is_co                                0.0453 0.0195  2.33  0.0198  
## is_ga                                0.0156 0.0176  0.89  0.3732  
## is_az                                0.0211 0.0174  1.21  0.2259  
## is_mi                                0.0010 0.0172  0.06  0.9546  
## is_va                               -0.0024 0.0179 -0.13  0.8947