00. R Read CSV

df0 = read.csv("loans.csv")
df = read.csv("loans.csv")
str(df)

## 'data.frame':    9578 obs. of  15 variables:
##  $ customer.id      : int  10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 ...
##  $ credit.policy    : chr  "1" "1" "1" "1" ...
##  $ purpose          : chr  "debt_consolidation" "credit_card" "debt_consolidation" "debt_consolidation" ...
##  $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment      : num  829 228 367 162 103 ...
##  $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti              : chr  "19.48" "14.29" "11.63" "8.1" ...
##  $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
##  $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util       : chr  "52.1" "76.7" "25.6" "73.2" ...
##  $ inq.last.6mths   : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs      : chr  "0" "0" "0" "0" ...
##  $ pub.rec          : chr  "0" "0" "0" "0" ...
##  $ not.fully.paid   : int  0 0 0 0 0 0 1 1 0 0 ...

summary(df)

##   customer.id    credit.policy        purpose             int.rate      
##  Min.   :10001   Length:9578        Length:9578        Min.   : 0.0600  
##  1st Qu.:12395   Class :character   Class :character   1st Qu.: 0.1039  
##  Median :14790   Mode  :character   Mode  :character   Median : 0.1221  
##  Mean   :14790                                         Mean   : 0.1255  
##  3rd Qu.:17184                                         3rd Qu.: 0.1407  
##  Max.   :19578                                         Max.   :14.7000  
##                                                                         
##   installment     log.annual.inc       dti                 fico       
##  Min.   : 15.67   Min.   : 7.548   Length:9578        Min.   : 612.0  
##  1st Qu.:163.77   1st Qu.:10.558   Class :character   1st Qu.: 682.0  
##  Median :268.95   Median :10.928   Mode  :character   Median : 707.0  
##  Mean   :319.09   Mean   :10.932                      Mean   : 711.2  
##  3rd Qu.:432.76   3rd Qu.:11.290                      3rd Qu.: 737.0  
##  Max.   :940.14   Max.   :14.528                      Max.   :1812.0  
##                   NA's   :5                                           
##  days.with.cr.line   revol.bal        revol.util        inq.last.6mths  
##  Min.   :  179     Min.   :      0   Length:9578        Min.   : 0.000  
##  1st Qu.: 2820     1st Qu.:   3187   Class :character   1st Qu.: 0.000  
##  Median : 4140     Median :   8596   Mode  :character   Median : 1.000  
##  Mean   : 4562     Mean   :  16915                      Mean   : 1.572  
##  3rd Qu.: 5730     3rd Qu.:  18252                      3rd Qu.: 2.000  
##  Max.   :17640     Max.   :1207359                      Max.   :33.000  
##  NA's   :29        NA's   :1                            NA's   :30      
##  delinq.2yrs          pub.rec          not.fully.paid  
##  Length:9578        Length:9578        Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :0.0000  
##                                        Mean   :0.1601  
##                                        3rd Qu.:0.0000  
##                                        Max.   :1.0000  
##

01. Cleaning Data

df$credit.policy have [zero] x1 > convert done [one] x1 > convert done df$pub.rec have [NO] x1 > convert done df$log.annual.inc have NA, Blank > convert to NA and omit df$dti have [six] x1 > convert done df$days.with.cr.line have NA > convert to NA and omit df$revol.bal have Blank > convert to NA and omit df$revol.util have NA, [one] > convert done df$inq.last.6mths have NA, Blank > convert to NA and omit df$delinq.2yrs have NA, [yes] > convert done df$pub.rec have NA, [NO] > convert done
df$int.rate have 14.7 , 13.25 which not correct > Delete done

df = replace(df, df == "one",1)
df = replace(df, df == "zero",0)
df = replace(df, df == "NO",0)
df = replace(df, df == "yes",1)
df = replace(df, df == "six",6)
df = replace(df, df == "?", NA)
df = na.omit(df)
df = subset(df, select= -c(customer.id))
df = subset.data.frame(df,df$int.rate<1 & df$fico <1000)

df = replace(df, df == "debt_consolidation", 1)
df$P_debt_consolidation = ifelse(df$purpose == 1,1,0)

df = replace(df, df == "credit_card", 2)
df$P_credit_card = ifelse(df$purpose == 2,1,0)

df = replace(df, df == "all_other", 3)
df$P_all_other = ifelse(df$purpose == 3,1,0)

df = replace(df, df == "home_improvement", 4)
df$P_home_improvement = ifelse(df$purpose == 4,1,0)

df = replace(df, df == "small_business", 5)
df$P_small_business = ifelse(df$purpose == 5,1,0)

df = replace(df, df == "major_purchase", 6)
df$P_major_purchase = ifelse(df$purpose == 6,1,0)

df = replace(df, df == "educational", 7)
df$P_educational = ifelse(df$purpose == 7,1,0)

##df = subset(df, select= -c(purpose))

test

# check outlines 

df[sapply(df, is.character)] = lapply(df[sapply(df, is.character)], as.factor)
str(df)

## 'data.frame':    9508 obs. of  21 variables:
##  $ credit.policy       : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ purpose             : Factor w/ 7 levels "1","2","3","4",..: 1 2 1 1 2 2 1 3 4 1 ...
##  $ int.rate            : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment         : num  829 228 367 162 103 ...
##  $ log.annual.inc      : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti                 : Factor w/ 2527 levels "0","0.01","0.02",..: 1119 612 348 2341 678 877 1944 293 904 187 ...
##  $ fico                : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line   : num  5640 2760 4710 2700 4066 ...
##  $ revol.bal           : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util          : Factor w/ 1036 levels "0","0.04","0.1",..: 513 784 212 749 365 502 785 691 503 186 ...
##  $ inq.last.6mths      : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs         : Factor w/ 11 levels "0","1","11","13",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ pub.rec             : Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 1 1 2 1 1 1 ...
##  $ not.fully.paid      : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ P_debt_consolidation: num  1 0 1 1 0 0 1 0 0 1 ...
##  $ P_credit_card       : num  0 1 0 0 1 1 0 0 0 0 ...
##  $ P_all_other         : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ P_home_improvement  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ P_small_business    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ P_major_purchase    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ P_educational       : num  0 0 0 0 0 0 0 0 0 0 ...

summary(df)

##  credit.policy purpose     int.rate       installment     log.annual.inc  
##  0:1830        1:3945   Min.   :0.0600   Min.   : 15.67   Min.   : 7.548  
##  1:7678        2:1258   1st Qu.:0.1039   1st Qu.:164.02   1st Qu.:10.565  
##                3:2289   Median :0.1221   Median :269.76   Median :10.933  
##                4: 626   Mean   :0.1227   Mean   :320.21   Mean   :10.934  
##                5: 618   3rd Qu.:0.1407   3rd Qu.:435.40   3rd Qu.:11.290  
##                6: 432   Max.   :0.2164   Max.   :940.14   Max.   :14.528  
##                7: 340                                                     
##       dti            fico       days.with.cr.line   revol.bal      
##  0      :  86   Min.   :612.0   Min.   :  180     Min.   :      0  
##  0.6    :  16   1st Qu.:682.0   1st Qu.: 2820     1st Qu.:   3273  
##  12     :  13   Median :707.0   Median : 4140     Median :   8688  
##  13.16  :  13   Mean   :710.8   Mean   : 4567     Mean   :  16987  
##  15.1   :  13   3rd Qu.:737.0   3rd Qu.: 5730     3rd Qu.:  18345  
##  19.2   :  13   Max.   :827.0   Max.   :17640     Max.   :1207359  
##  (Other):9354                                                      
##    revol.util   inq.last.6mths    delinq.2yrs   pub.rec   not.fully.paid  
##  0      : 295   Min.   : 0.000   0      :8396   0 :8952   Min.   :0.0000  
##  0.5    :  24   1st Qu.: 0.000   1      : 826   1 : 529   1st Qu.:0.0000  
##  0.3    :  22   Median : 1.000   2      : 191   2 :  19   Median :0.0000  
##  73.7   :  22   Mean   : 1.574   3      :  65   3 :   5   Mean   :0.1597  
##  3.3    :  21   3rd Qu.: 2.000   4      :  18   4 :   1   3rd Qu.:0.0000  
##  47.8   :  21   Max.   :33.000   5      :   6   5 :   1   Max.   :1.0000  
##  (Other):9103                    (Other):   6   no:   1                   
##  P_debt_consolidation P_credit_card     P_all_other     P_home_improvement
##  Min.   :0.0000       Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   
##  1st Qu.:0.0000       1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   
##  Median :0.0000       Median :0.0000   Median :0.0000   Median :0.00000   
##  Mean   :0.4149       Mean   :0.1323   Mean   :0.2407   Mean   :0.06584   
##  3rd Qu.:1.0000       3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   
##  Max.   :1.0000       Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   
##                                                                           
##  P_small_business P_major_purchase  P_educational    
##  Min.   :0.000    Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000    1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000    Median :0.00000   Median :0.00000  
##  Mean   :0.065    Mean   :0.04544   Mean   :0.03576  
##  3rd Qu.:0.000    3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000    Max.   :1.00000   Max.   :1.00000  
##

02. Checking Correlation variables

set df1 as numeric of df in order to make cor:

df1 = data.frame(df)
df1[sapply(df1, is.factor)] = lapply(df1[sapply(df1, is.factor)], as.numeric)
str(df1)

## 'data.frame':    9508 obs. of  21 variables:
##  $ credit.policy       : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ purpose             : num  1 2 1 1 2 2 1 3 4 1 ...
##  $ int.rate            : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment         : num  829 228 367 162 103 ...
##  $ log.annual.inc      : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti                 : num  1119 612 348 2341 678 ...
##  $ fico                : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line   : num  5640 2760 4710 2700 4066 ...
##  $ revol.bal           : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util          : num  513 784 212 749 365 502 785 691 503 186 ...
##  $ inq.last.6mths      : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs         : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ pub.rec             : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ not.fully.paid      : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ P_debt_consolidation: num  1 0 1 1 0 0 1 0 0 1 ...
##  $ P_credit_card       : num  0 1 0 0 1 1 0 0 0 0 ...
##  $ P_all_other         : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ P_home_improvement  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ P_small_business    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ P_major_purchase    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ P_educational       : num  0 0 0 0 0 0 0 0 0 0 ...

cor_matrix = cor(df1, method = "spearman")
cor_matrix

##                      credit.policy      purpose    int.rate  installment
## credit.policy         1.0000000000 -0.016163243 -0.29922247  0.067491035
## purpose              -0.0161632433  1.000000000 -0.09928159 -0.182333107
## int.rate             -0.2992224717 -0.099281592  1.00000000  0.242631447
## installment           0.0674910354 -0.182333107  0.24263145  1.000000000
## log.annual.inc        0.0286474657  0.019648270  0.04404270  0.432329014
## dti                  -0.0162452831 -0.027347376  0.02911917 -0.007310878
## fico                  0.3584555240  0.154259175 -0.74718746  0.084843434
## days.with.cr.line     0.1125471337 -0.015164185 -0.13364416  0.200905272
## revol.bal            -0.0205474651 -0.231370464  0.14771106  0.346562909
## revol.util           -0.0695959186 -0.193042477  0.39482798  0.080842359
## inq.last.6mths       -0.4318228891  0.063312657  0.17903857 -0.004468728
## delinq.2yrs          -0.0588612939  0.004056783  0.17340105 -0.007928289
## pub.rec              -0.0492471260 -0.041899617  0.09588996 -0.026970454
## not.fully.paid       -0.1615671719  0.042274849  0.15259239  0.041371573
## P_debt_consolidation  0.0158603644 -0.893813193  0.13266839  0.201333421
## P_credit_card         0.0008870614 -0.026821593 -0.04563619  0.006639555
## P_all_other          -0.0146246685  0.342401111 -0.11823802 -0.226672310
## P_home_improvement    0.0037498403  0.309081287 -0.05223355  0.014932395
## P_small_business     -0.0054691112  0.369541322  0.12815333  0.124572630
## P_major_purchase      0.0232461523  0.349490831 -0.06674728 -0.091850569
## P_educational        -0.0338489284  0.336856931 -0.02203136 -0.106749245
##                      log.annual.inc          dti         fico days.with.cr.line
## credit.policy           0.028647466 -0.016245283  0.358455524        0.11254713
## purpose                 0.019648270 -0.027347376  0.154259175       -0.01516419
## int.rate                0.044042698  0.029119172 -0.747187460       -0.13364416
## installment             0.432329014 -0.007310878  0.084843434        0.20090527
## log.annual.inc          1.000000000  0.004640402  0.105031553        0.39843259
## dti                     0.004640402  1.000000000 -0.024115760        0.02579815
## fico                    0.105031553 -0.024115760  1.000000000        0.25079899
## days.with.cr.line       0.398432592  0.025798152  0.250798993        1.00000000
## revol.bal               0.417874687  0.063032290 -0.095392931        0.32368390
## revol.util              0.062705436  0.036064542 -0.409084190        0.01312478
## inq.last.6mths          0.031006619 -0.008345033 -0.176888073       -0.04233460
## delinq.2yrs             0.030011419 -0.001526905 -0.237049054        0.09530589
## pub.rec                 0.012733237  0.019297467 -0.149090010        0.10033473
## not.fully.paid         -0.033219039  0.008954960 -0.146916680       -0.02556690
## P_debt_consolidation   -0.039963448  0.026759035 -0.145136472        0.00377366
## P_credit_card           0.080749340  0.003626599 -0.002884949        0.05086867
## P_all_other            -0.076930921 -0.020059170  0.059301122       -0.07107850
## P_home_improvement      0.115563397 -0.009124255  0.083982881        0.07567048
## P_small_business        0.093334405 -0.005533118  0.064942701        0.03699011
## P_major_purchase       -0.041573804 -0.011677143  0.059422296       -0.02817956
## P_educational          -0.095804357  0.001193172 -0.011195196       -0.05774714
##                         revol.bal  revol.util inq.last.6mths   delinq.2yrs
## credit.policy        -0.020547465 -0.06959592   -0.431822889 -0.0588612939
## purpose              -0.231370464 -0.19304248    0.063312657  0.0040567828
## int.rate              0.147711064  0.39482798    0.179038574  0.1734010479
## installment           0.346562909  0.08084236   -0.004468728 -0.0079282887
## log.annual.inc        0.417874687  0.06270544    0.031006619  0.0300114187
## dti                   0.063032290  0.03606454   -0.008345033 -0.0015269046
## fico                 -0.095392931 -0.40908419   -0.176888073 -0.2370490544
## days.with.cr.line     0.323683905  0.01312478   -0.042334603  0.0953058892
## revol.bal             1.000000000  0.40480070   -0.018620415 -0.0549307994
## revol.util            0.404800697  1.00000000   -0.017866928 -0.0294949073
## inq.last.6mths       -0.018620415 -0.01786693    1.000000000  0.0201724639
## delinq.2yrs          -0.054930799 -0.02949491    0.020172464  1.0000000000
## pub.rec              -0.026008365  0.05803940    0.058052758  0.0017620569
## not.fully.paid        0.021789714  0.05518818    0.133370677  0.0147114210
## P_debt_consolidation  0.175763917  0.16122353   -0.038436981 -0.0018603797
## P_credit_card         0.153992538  0.06625554   -0.042582187 -0.0113995073
## P_all_other          -0.199252296 -0.10682264    0.009305541  0.0142434370
## P_home_improvement   -0.041272508 -0.08849297    0.052120662 -0.0098447523
## P_small_business      0.001951696 -0.04303562    0.044796715  0.0007633006
## P_major_purchase     -0.125488357 -0.07786746    0.006632473 -0.0014745587
## P_educational        -0.095326847 -0.04008113    0.021741160  0.0067313051
##                            pub.rec not.fully.paid P_debt_consolidation
## credit.policy        -0.0492471260   -0.161567172           0.01586036
## purpose              -0.0418996166    0.042274849          -0.89381319
## int.rate              0.0958899618    0.152592385           0.13266839
## installment          -0.0269704540    0.041371573           0.20133342
## log.annual.inc        0.0127332373   -0.033219039          -0.03996345
## dti                   0.0192974667    0.008954960           0.02675904
## fico                 -0.1490900098   -0.146916680          -0.14513647
## days.with.cr.line     0.1003347325   -0.025566898           0.00377366
## revol.bal            -0.0260083651    0.021789714           0.17576392
## revol.util            0.0580393951    0.055188184           0.16122353
## inq.last.6mths        0.0580527580    0.133370677          -0.03843698
## delinq.2yrs           0.0017620569    0.014711421          -0.00186038
## pub.rec               1.0000000000    0.058682336           0.03900768
## not.fully.paid        0.0586823359    1.000000000          -0.01797229
## P_debt_consolidation  0.0390076776   -0.017972286           1.00000000
## P_credit_card         0.0074384874   -0.047326552          -0.32883824
## P_all_other          -0.0322863373    0.009100103          -0.47419105
## P_home_improvement    0.0008380379    0.005853833          -0.22356343
## P_small_business     -0.0039592890    0.085415242          -0.22203034
## P_major_purchase     -0.0238199764   -0.027535311          -0.18372303
## P_educational        -0.0118868597    0.022757838          -0.16217023
##                      P_credit_card  P_all_other P_home_improvement
## credit.policy         0.0008870614 -0.014624669       0.0037498403
## purpose              -0.0268215934  0.342401111       0.3090812873
## int.rate             -0.0456361945 -0.118238016      -0.0522335467
## installment           0.0066395554 -0.226672310       0.0149323948
## log.annual.inc        0.0807493403 -0.076930921       0.1155633970
## dti                   0.0036265986 -0.020059170      -0.0091242551
## fico                 -0.0028849487  0.059301122       0.0839828811
## days.with.cr.line     0.0508686721 -0.071078499       0.0756704755
## revol.bal             0.1539925385 -0.199252296      -0.0412725080
## revol.util            0.0662555353 -0.106822638      -0.0884929705
## inq.last.6mths       -0.0425821868  0.009305541       0.0521206615
## delinq.2yrs          -0.0113995073  0.014243437      -0.0098447523
## pub.rec               0.0074384874 -0.032286337       0.0008380379
## not.fully.paid       -0.0473265520  0.009100103       0.0058538335
## P_debt_consolidation -0.3288382438 -0.474191046      -0.2235634258
## P_credit_card         1.0000000000 -0.219886072      -0.1036680975
## P_all_other          -0.2198860723  1.000000000      -0.1494913823
## P_home_improvement   -0.1036680975 -0.149491382       1.0000000000
## P_small_business     -0.1029571953 -0.148466248      -0.0699963089
## P_major_purchase     -0.0851937973 -0.122851087      -0.0579197144
## P_educational        -0.0751995951 -0.108439257      -0.0511250727
##                      P_small_business P_major_purchase P_educational
## credit.policy           -0.0054691112      0.023246152  -0.033848928
## purpose                  0.3695413218      0.349490831   0.336856931
## int.rate                 0.1281533343     -0.066747278  -0.022031356
## installment              0.1245726297     -0.091850569  -0.106749245
## log.annual.inc           0.0933344050     -0.041573804  -0.095804357
## dti                     -0.0055331184     -0.011677143   0.001193172
## fico                     0.0649427007      0.059422296  -0.011195196
## days.with.cr.line        0.0369901069     -0.028179562  -0.057747140
## revol.bal                0.0019516958     -0.125488357  -0.095326847
## revol.util              -0.0430356156     -0.077867465  -0.040081128
## inq.last.6mths           0.0447967146      0.006632473   0.021741160
## delinq.2yrs              0.0007633006     -0.001474559   0.006731305
## pub.rec                 -0.0039592890     -0.023819976  -0.011886860
## not.fully.paid           0.0854152419     -0.027535311   0.022757838
## P_debt_consolidation    -0.2220303435     -0.183723032  -0.162170229
## P_credit_card           -0.1029571953     -0.085193797  -0.075199595
## P_all_other             -0.1484662477     -0.122851087  -0.108439257
## P_home_improvement      -0.0699963089     -0.057919714  -0.051125073
## P_small_business         1.0000000000     -0.057522531  -0.050774483
## P_major_purchase        -0.0575225310      1.000000000  -0.042014267
## P_educational           -0.0507744834     -0.042014267   1.000000000

corrplot.mixed(cor_matrix)

plot(df$fico,df$days.with.cr.line)

plot(df$fico,df$log.annual.inc)

plot(df$fico,df$int.rate)

plot(df$fico,df$dti)

FICO credit scores are a method of quantifying and evaluating an individuals creditworthiness. 300-329 - BAD 630-689 - Fair 690-719 - Good 720-850 - Excellent

03. Linear regression - Fico

Adjusted R squared is thr measuring impact of independent variables on the correlation, only increases if the new predictor enhances the model above what would be obtained by probability. Conversely, it will decrease when a predictor improves the model less than what is predicted by chance.

from the result:

lm1 dependent fico with most independent x12
Adjust r = 0.6768 > rmse = 21.38428
lm2 dependent fico with significant independent only x8
Adjust r = 0.6411 > rmse = 22.81494
lm3 dependent fico with most independent - highest significant x11
Adjust r = 0.3884 > rmse = 28.88878

lm2 is slightly better than lm1 by 0.01 but both of them have same adjust r. less rmse mean model have higher correlation coefficient.

if the correlation coefficient is 1, the RMSE will be 0, because all of the points lie on the regression line (and therefore there are no errors)

check significant value by looking at T-value that far away from 0 as much as possible. which could indicate a relationship exists.

# linear regression models
set.seed(150)
index = sample.split(df$fico, SplitRatio = 0.70) # split ratio 70%
train_lm = subset(df1, index == TRUE) # create the train data set
test_lm = subset(df1, index == FALSE) # create the test data set

lm1 = lm(fico ~.-not.fully.paid -pub.rec -P_educational -P_major_purchase, train_lm)
## fico with most independent

summary(lm1)

## 
## Call:
## lm(formula = fico ~ . - not.fully.paid - pub.rec - P_educational - 
##     P_major_purchase, data = train_lm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.156 -14.368  -2.354  12.052  98.804 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           8.322e+02  1.360e+01  61.184  < 2e-16 ***
## credit.policy         1.059e+01  8.534e-01  12.405  < 2e-16 ***
## purpose              -4.176e+00  1.846e+00  -2.263  0.02369 *  
## int.rate             -9.512e+02  1.235e+01 -76.990  < 2e-16 ***
## installment           4.618e-02  1.564e-03  29.521  < 2e-16 ***
## log.annual.inc       -2.004e-01  5.305e-01  -0.378  0.70566    
## dti                  -3.353e-05  3.491e-04  -0.096  0.92350    
## days.with.cr.line     1.956e-03  1.169e-04  16.731  < 2e-16 ***
## revol.bal            -7.086e-06  8.931e-06  -0.793  0.42756    
## revol.util           -1.917e-02  1.010e-03 -18.976  < 2e-16 ***
## inq.last.6mths        1.247e-01  1.413e-01   0.882  0.37760    
## delinq.2yrs          -5.437e+00  3.363e-01 -16.168  < 2e-16 ***
## P_debt_consolidation -2.914e+01  1.012e+01  -2.880  0.00399 ** 
## P_credit_card        -2.412e+01  8.301e+00  -2.906  0.00368 ** 
## P_all_other          -1.441e+01  6.457e+00  -2.232  0.02563 *  
## P_home_improvement   -8.945e+00  4.722e+00  -1.894  0.05824 .  
## P_small_business      8.829e+00  3.040e+00   2.904  0.00369 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.57 on 6639 degrees of freedom
## Multiple R-squared:  0.6776, Adjusted R-squared:  0.6768 
## F-statistic: 872.1 on 16 and 6639 DF,  p-value: < 2.2e-16

test_lm$predicted_fico1 = predict(lm1, test_lm)
rmse(actual = test_lm$fico, predicted = test_lm$predicted_fico1)

## [1] 21.38428

lm2 = lm(fico ~. -log.annual.inc -dti  -revol.bal -inq.last.6mths-pub.rec
         -P_educational-P_home_improvement -P_major_purchase -P_debt_consolidation 
         -days.with.cr.line -revol.util -P_credit_card , train_lm)

## fico with significant independent only  

summary(lm2)

## 
## Call:
## lm(formula = fico ~ . - log.annual.inc - dti - revol.bal - inq.last.6mths - 
##     pub.rec - P_educational - P_home_improvement - P_major_purchase - 
##     P_debt_consolidation - days.with.cr.line - revol.util - P_credit_card, 
##     data = train_lm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -97.384 -14.777  -2.923  12.286 102.741 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       8.064e+02  2.306e+00 349.707  < 2e-16 ***
## credit.policy     9.729e+00  7.541e-01  12.901  < 2e-16 ***
## purpose           1.849e+00  1.836e-01  10.076  < 2e-16 ***
## int.rate         -1.071e+03  1.167e+01 -91.699  < 2e-16 ***
## installment       5.333e-02  1.447e-03  36.849  < 2e-16 ***
## delinq.2yrs      -4.042e+00  3.456e-01 -11.696  < 2e-16 ***
## not.fully.paid   -5.219e+00  7.702e-01  -6.775 1.35e-11 ***
## P_all_other       4.012e+00  6.790e-01   5.908 3.62e-09 ***
## P_small_business  1.741e+01  1.293e+00  13.460  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.56 on 6647 degrees of freedom
## Multiple R-squared:  0.6469, Adjusted R-squared:  0.6464 
## F-statistic:  1522 on 8 and 6647 DF,  p-value: < 2.2e-16

test_lm$predicted_fico2 = predict(lm2, test_lm)
rmse(actual = test_lm$fico, predicted = test_lm$predicted_fico2)

## [1] 22.59684

lm3 = lm(fico ~.-not.fully.paid -pub.rec -int.rate -P_educational-P_major_purchase , train_lm)

## fico with most independent - highest significant
summary(lm3)

## 
## Call:
## lm(formula = fico ~ . - not.fully.paid - pub.rec - int.rate - 
##     P_educational - P_major_purchase, data = train_lm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -101.826  -21.414   -1.241   19.896  108.666 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.157e+02  1.860e+01  38.487  < 2e-16 ***
## credit.policy         2.544e+01  1.144e+00  22.249  < 2e-16 ***
## purpose              -6.554e+00  2.539e+00  -2.581 0.009862 ** 
## installment           9.903e-03  2.052e-03   4.825 1.43e-06 ***
## log.annual.inc        1.269e+00  7.293e-01   1.740 0.081865 .  
## dti                  -3.091e-04  4.802e-04  -0.644 0.519862    
## days.with.cr.line     3.464e-03  1.586e-04  21.841  < 2e-16 ***
## revol.bal             2.907e-05  1.227e-05   2.369 0.017867 *  
## revol.util           -4.978e-02  1.278e-03 -38.961  < 2e-16 ***
## inq.last.6mths       -7.036e-01  1.939e-01  -3.629 0.000287 ***
## delinq.2yrs          -1.021e+01  4.547e-01 -22.459  < 2e-16 ***
## P_debt_consolidation -4.358e+01  1.392e+01  -3.132 0.001745 ** 
## P_credit_card        -3.242e+01  1.142e+01  -2.839 0.004532 ** 
## P_all_other          -2.149e+01  8.882e+00  -2.419 0.015571 *  
## P_home_improvement   -1.450e+01  6.496e+00  -2.232 0.025636 *  
## P_small_business     -7.890e+00  4.171e+00  -1.892 0.058581 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.67 on 6640 degrees of freedom
## Multiple R-squared:  0.3898, Adjusted R-squared:  0.3884 
## F-statistic: 282.7 on 15 and 6640 DF,  p-value: < 2.2e-16

test_lm$predicted_fico3 = predict(lm3, test_lm)
rmse(actual = test_lm$fico, predicted = test_lm$predicted_fico3)

## [1] 28.88878

04. Randomly split - simple baseline model

Baseline module has an accuracy of (7990/(7990+1518)) = 84 %

                In data set baseline

1-Not fully paid back - 1517
0-fully paid back - 7990
Total = 9507

table(df$not.fully.paid)

## 
##    0    1 
## 7990 1518

ggplot(df, aes(int.rate, as.factor(not.fully.paid))) + geom_boxplot() + coord_flip()

ggplot(df, aes(inq.last.6mths, as.factor(not.fully.paid))) + geom_boxplot() + coord_flip()

Randomly split

set.seed(42)
split = sample.split (df1$not.fully.paid, SplitRatio = 0.70) 
train_glm = subset(df1, split == TRUE)
test_glm = subset(df1, split == FALSE)

#checking baseline [train and test set] (result same)

table(train_glm$not.fully.paid)

## 
##    0    1 
## 5593 1063

nrow(train_glm)

## [1] 6656

(5593/(5593+1063))

## [1] 0.8402945

table(test_glm$not.fully.paid)

## 
##    0    1 
## 2397  455

nrow(test_glm)

## [1] 2852

(2397/(2397+445))

## [1] 0.8434201

05. logistic regression - not.fully.paid

model1 = glm(not.fully.paid ~ ., family = binomial, train_glm)
summary(model1)

## 
## Call:
## glm(formula = not.fully.paid ~ ., family = binomial, data = train_glm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9423  -0.6169  -0.4978  -0.3769   2.4416  
## 
## Coefficients: (2 not defined because of singularities)
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           4.744e+00  2.273e+00   2.087  0.03687 *  
## credit.policy        -3.210e-01  1.016e-01  -3.160  0.00158 ** 
## purpose               4.581e-01  2.423e-01   1.891  0.05868 .  
## int.rate              3.132e+00  2.058e+00   1.522  0.12794    
## installment           1.069e-03  2.090e-04   5.115 3.13e-07 ***
## log.annual.inc       -3.557e-01  7.109e-02  -5.003 5.63e-07 ***
## dti                   1.858e-05  4.696e-05   0.396  0.69242    
## fico                 -8.536e-03  1.635e-03  -5.222 1.77e-07 ***
## days.with.cr.line     1.790e-05  1.555e-05   1.151  0.24977    
## revol.bal             1.838e-06  1.094e-06   1.680  0.09291 .  
## revol.util            7.554e-05  1.359e-04   0.556  0.57838    
## inq.last.6mths        8.329e-02  1.643e-02   5.069 4.01e-07 ***
## delinq.2yrs          -9.118e-02  4.390e-02  -2.077  0.03782 *  
## pub.rec               2.722e-01  1.122e-01   2.425  0.01531 *  
## P_debt_consolidation  2.155e+00  1.354e+00   1.592  0.11146    
## P_credit_card         1.587e+00  1.116e+00   1.422  0.15509    
## P_all_other           1.622e+00  8.727e-01   1.858  0.06314 .  
## P_home_improvement    1.117e+00  6.444e-01   1.733  0.08310 .  
## P_small_business      1.122e+00  4.131e-01   2.715  0.00663 ** 
## P_major_purchase             NA         NA      NA       NA    
## P_educational                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5846.4  on 6655  degrees of freedom
## Residual deviance: 5480.4  on 6637  degrees of freedom
## AIC: 5518.4
## 
## Number of Fisher Scoring iterations: 5

06. Predict the probability - AUC

Predict the probability of the test set loans not being paid back in full. Store these values in a variable named PredictedRisk and add it to your test set.

-What is the accuracy of the logistic regression model on the test set using a threshold of 0.25?

0.80 or 80 % = (2179+112)/(2179+218+343+112)

-What is the AUC of the model?

0.68 or 68 % (result in Q7)

PredictedRisk = predict(model1, type = "response", test_glm)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

PredictedRisk_cat = ifelse(PredictedRisk > 0.25,1,0)

table_acc = table(test_glm$not.fully.paid, PredictedRisk_cat)
table_acc

##    PredictedRisk_cat
##        0    1
##   0 2170  227
##   1  319  136

accuracy_PredictedRisk = round(sum(diag(table_acc))/ sum(table_acc),3)

print("accuracy of the logistic regression model")

## [1] "accuracy of the logistic regression model"

accuracy_PredictedRisk

## [1] 0.809

#manual Cal Accuracy of glm
round((2179+112)/(2179+218+343+112),3)

## [1] 0.803

07. Best threshold at FP0.25

What is the best threshold value to maximize true positive rate while keeping false positive at max 25% (or 0.25)?

[ANS] Threshold values is 0.18 at maximum False positive at 25%

ROC - Receiver Operator Characteristic

# ROC will show us the overall performance of the model
ROCRpred = prediction (PredictedRisk, test_glm$not.fully.paid )

ROCRperf = performance (ROCRpred, "tpr", "fpr")
as.numeric (performance (ROCRpred, "auc") @y.values) # higher auc value is better

## [1] 0.6901796

plot (ROCRperf, 
      colorize = TRUE, 
      main ="ROC CURVE",
      print.cutoffs.at = seq (0, 1, by = 0.01535),
      text.adj = c(-0.2, 1.7)
      )
abline(h=0.493, v=0.25)

08. Predicted loan at THH0.25

Can you create a simpler model with very much similar evaluation metrics? Use this new model to make predictions for observations in the test set.

How many loans would be predicted that would not be paid back in full if we used a threshold of 0.25? Compute the AUC.

In data set (total = 2852 obs)

1-Not fully paid back = 458 obs

0-fully paid back = 2394 obs

Predicted would not fully paid back threshold of 0.25 (total = 2852 obs)

1-Not fully paid back = 397 obs

0-fully paid back = 2455 obs

AUC of this module at 0.25 threshold is 0.70 or 70%

and predicted of people who not paid in full is 414/2852 = 0.15 or 15%

table(test_lm$not.fully.paid)

## 
##    0    1 
## 2394  458

model2 = glm(not.fully.paid ~ ., family = binomial, train_glm)
PredictedRisk_Q8 = predict(model2, type = "response", test_lm)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

PredictedRisk_cat_Q8 = ifelse(PredictedRisk_Q8 > 0.25,1,0)

table_acc_Q8 = table(test_lm$not.fully.paid, PredictedRisk_cat_Q8)
table_acc_Q8

##    PredictedRisk_cat_Q8
##        0    1
##   0 2133  261
##   1  305  153

ROCRpred_Q8 = prediction (PredictedRisk_Q8, test_lm$not.fully.paid )

ROCRperf_Q8 = performance (ROCRpred_Q8, "tpr", "fpr")
as.numeric (performance (ROCRpred_Q8, "auc") @y.values) # higher auc value is better

## [1] 0.7066091

plot (ROCRperf_Q8, 
      colorize = TRUE, 
      main ="ROC CURVE",
      print.cutoffs.at = seq (0, 1, by = 0.05),
      text.adj = c(-0.2, 1.7)
      )
abline(h=0.332, v=0.111)

09. Maximize return with a high interest rate (15% or higher)

Compute the profit of a $1 investment in each loan, save your result to a variable named Profit.

Hint: Carefully think about the profit if the loan defaults and if the loan is paid back in full. This then allows you to determine the expected profit.

Analyze now a strategy in which the investor purchases loans with a high interest rate (15% or higher) to maximize return, but among these loans selects the ones with the lowest predicted risk of not being paid back in full.

[ANS]

1.Int.rate > 15% and Fully.paid.back get [610$],

select lowest predicted risk Not.paid.back <= 30% get [-194$]

Best case get 610 + 194 = 804$

worst case get 610 - 194 = 416$

different 388$

2.Int.rate > 15% and Fully.paid.back get [610$],

select lowest predicted risk Not.paid.back <= 25% get [-133$]

Best case get 610 + 133 = 743$

worst case get 610 - 133 = 477$

different 266$

3.Int.rate > 15% and Fully.paid.back get [610$],

select lowest predicted risk Not.paid.back <= 20% get [-72$]

Best case get 610 + 72 = 682$

worst case get 610 - 72 = 538$

different 144$

df2 = subset.data.frame(df1)
df2$profit = (1+ df1$int.rate)^3 - 1
df2$profit[ df1$not.fully.paid == 1] = -1

df2$pred = predict(model1,df2, type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

profit_15_get = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 0 )
sum(profit_15_get$profit)

## [1] 610.542

profit_15_Notget = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 1 & df2$pred <= 0.30)
X=sum(profit_15_Notget$profit)
X

## [1] -194

profit_15_Notget = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 1 & df2$pred <= 0.25)
X1=sum(profit_15_Notget$profit)
X1

## [1] -133

profit_15_Notget = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 1 & df2$pred <= 0.20)
X2=sum(profit_15_Notget$profit)
X2

## [1] -72

10. Test set loans with an interest rate of at least 15%

Create a dataset called [HighInterest] consisting of the test set loans with an interest rate of at least 15%.

What is the average profit of a $1 investment in one of these high-interest loans?

average profit is 0.1898
What proportion of the high-interest loans were not paid back in full?

Not fully paid back which have Int.rate >15 = 345 case = proportion = 24.6 %

fully paid back which have Int.rate >15 = 1,054 case = proportion = 75.3 %

HighInterest = subset.data.frame(df1)
HighInterest$profit = (1+ df1$int.rate)^3 - 1
HighInterest$profit[ df1$not.fully.paid == 1] = -1

HighInterest = HighInterest[which(HighInterest$int.rate >0.15),]
z = mean(HighInterest$profit)
cat(" average profit is",z)

##  average profit is 0.1898084

summary(HighInterest$profit)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0000  0.5213  0.5468  0.1898  0.5873  0.7808

X = HighInterest[which(HighInterest$not.fully.paid ==1),]
y = HighInterest[which(HighInterest$not.fully.paid ==0),]

11. 100 loans with the lowest [PredictedRisk.]

Next, sort the loans in the [Highlnterest] dataset by the variable [PredictedRisk.]

Create a new dataset called [SelectedLoans] that consists of 100 loans with the lowest [PredictedRisk.]

What is the profit to an investor who invested $1 in each of these 100 loans?

investor will get 26$ from 100 loan that have lowest Predicted Risk rate.
How does this compare to investing in all loans?

option 1 > investor will get 26$ from 100 loan which come from lowest Predicted Risk rate and high interest rate.

option 2 > investor will get 20$ from 100 loan which come from random data not specific.

that make it 6$ different from 100 cases , So choosing loan from 100 lowest risk is better than randomly pick.

HighInterest$PredictedRisk = predict(model1,HighInterest, type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

HighInterest$PredictedRisk = sort(HighInterest$PredictedRisk)


SelectedLoans_Low = subset.data.frame(HighInterest)
SelectedLoans_Low = head(SelectedLoans_Low,100) 
sum(SelectedLoans_Low$profit)

## [1] 25.65067

SelectedLoans_random = subset.data.frame(df2)
SelectedLoans_random = head(SelectedLoans_random,100) 
sum(SelectedLoans_random$profit)

## [1] 19.58962

12. Optimize regression

Create and optimize regression and classification models using both a single decision tree and a random forest (or a boosted decision tree).

What are your findings?
Which models work better?

Compare these models with the models you created at Task 3 and Task 6 by using rmse value for regression, and AUC for classification.

FICO prediction

Decision tree - [Fico]——-RMSE = 22.93

Linear regression - [Fico]—RMSE = 21.38

Random forrest - [Fico]——RMSE = 17.34 !!! Win No.1

!!When there are large number of features with less data-sets(with low noise), linear regressions may outperform Decision trees/random forests. In general cases, Decision trees will be having better average accuracy.

Not fully paid prediction

Decision tree - Not fully paid prediction———AUC = 68.0 %

Logistic regression - Not fully paid prediction—AUC = 60.6 %

Random forrest - Not fully paid prediction——–AUC = 79.2 % !!! WIN

but [The response has five or fewer unique values. Are you sure you want to do regression?]

——Decision tree - regression - Fico

## Decision tree - fico (continuous - 620, 700,800, 624, ... - Linear )

Tree_fico = rpart(fico ~ .,train_lm)
printcp(Tree_fico)

## 
## Regression tree:
## rpart(formula = fico ~ ., data = train_lm)
## 
## Variables actually used in tree construction:
## [1] installment      int.rate         P_small_business
## 
## Root node error: 9581443/6656 = 1439.5
## 
## n= 6656 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.411887      0   1.00000 1.00043 0.0153868
## 2 0.094183      1   0.58811 0.59283 0.0112220
## 3 0.039921      2   0.49393 0.49318 0.0103565
## 4 0.026514      3   0.45401 0.46014 0.0100568
## 5 0.018855      4   0.42749 0.43092 0.0096806
## 6 0.016429      5   0.40864 0.41679 0.0089073
## 7 0.011134      6   0.39221 0.39740 0.0082037
## 8 0.010000      8   0.36994 0.37744 0.0081526

plotcp(Tree_fico, upper = "splits")

optimal_tree_lm <- rpart(
    formula = fico ~ .,
    data    = train_lm,
    method  = "anova",
    control = list(minsplit = 9, maxdepth = 5, cp = 0.011)
    )

## rpart to generate a full tree by using cp = 0 (no penalty results in a fully grown tree)
## minsplit: the minimum number of data points required to attempt a split before it is forced to create a                   terminal node
## maxdepth: the maximum number of internal nodes between the root node and the terminal nodes.

pred =  predict(optimal_tree_lm, test_lm )
rmse_fico = sqrt(mean((test_lm$fico - pred)^2))
rmse_fico

## [1] 22.93026

rpart.plot(optimal_tree_lm)

rpart.rules(optimal_tree_lm)

##  fico                                                                             
##   678 when int.rate >=          0.132                      & P_small_business is 0
##   696 when int.rate is 0.119 to 0.132 & installment <  499                        
##   707 when int.rate >=          0.132                      & P_small_business is 1
##   711 when int.rate is 0.096 to 0.119 & installment <  328                        
##   722 when int.rate is 0.119 to 0.132 & installment >= 499                        
##   737 when int.rate is 0.096 to 0.119 & installment >= 328                        
##   738 when int.rate is 0.082 to 0.096 & installment <  236                        
##   761 when int.rate is 0.082 to 0.096 & installment >= 236                        
##   767 when int.rate <  0.082

——Decision tree - Classification - Not fully paid

## preparing df3

df3 = data.frame(df1)
df3$not.fully.paid = as.factor(df3$not.fully.paid )
df3$inq.last.6mths = as.numeric(df3$inq.last.6mths)
df3$delinq.2yrs = as.numeric(df3$delinq.2yrs)

df3$purpose = as.factor(df3$purpose)
df3$P_debt_consolidation = as.factor(df3$P_debt_consolidation)
df3$P_credit_card  = as.factor(df3$P_credit_card )
df3$P_all_other   = as.factor(df3$P_all_other  )
df3$P_all_other = as.factor(df3$P_all_other)
df3$P_home_improvement = as.factor(df3$P_home_improvement)
df3$P_small_business = as.factor(df3$P_small_business)
df3$P_major_purchase  = as.factor(df3$P_major_purchase )
df3$P_educational  = as.factor(df3$P_educational)

#df3$inq.last.6mths   = as.factor(df3$inq.last.6mths)
#df3$delinq.2yrs   = as.factor(df3$delinq.2yrs  )

df3 <- subset(df3, select = -c(1,15:21))
str(df3)

## 'data.frame':    9508 obs. of  13 variables:
##  $ purpose          : Factor w/ 7 levels "1","2","3","4",..: 1 2 1 1 2 2 1 3 4 1 ...
##  $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment      : num  829 228 367 162 103 ...
##  $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti              : num  1119 612 348 2341 678 ...
##  $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
##  $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util       : num  513 784 212 749 365 502 785 691 503 186 ...
##  $ inq.last.6mths   : num  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs      : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ pub.rec          : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ not.fully.paid   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 1 1 ...

## Decision tree - Not.fully.paid ( category = 0,1 - Classification ) 

set.seed(42)
split = sample.split (df3$not.fully.paid, SplitRatio = 0.70)

train_glmf = subset(df3, split == TRUE)
test_glmf = subset(df3, split == FALSE)

TreeModel_NFP = tree(not.fully.paid ~ ., data = train_glmf, method = "class" )
summary(TreeModel_NFP)

## 
## Classification tree:
## tree(formula = not.fully.paid ~ ., data = train_glmf, method = "class")
## Variables actually used in tree construction:
## [1] "int.rate"       "inq.last.6mths"
## Number of terminal nodes:  3 
## Residual mean deviance:  0.8415 = 5598 / 6653 
## Misclassification error rate: 0.1597 = 1063 / 6656

# plot the tree model
plot (TreeModel_NFP)
text (TreeModel_NFP)

tree.pred = predict(TreeModel_NFP, test_glmf, type = "class")
tree.pred2 = predict(TreeModel_NFP, test_glmf, type = "vector")
tree.pred_cat = ifelse(tree.pred2[ , 2] > 0.25, 1, 0)

table(test_glmf$not.fully.paid, tree.pred_cat)

##    tree.pred_cat
##        0    1
##   0 1998  399
##   1  310  145

# ROC will show us the overall performance of the model

ROCRpred = prediction (tree.pred2[,2], test_glmf$not.fully.paid)
as.numeric (performance (ROCRpred, "auc") @y.values)

## [1] 0.606758

ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.05), text.adj = c(-0.8, 1.7))

cv.TreeModel1 = cv.tree(TreeModel_NFP, FUN = prune.misclass)
cv.TreeModel1

## $size
## [1] 3 1
## 
## $dev
## [1] 1063 1063
## 
## $k
## [1] -Inf    0
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

plot(cv.TreeModel1$size, cv.TreeModel1$dev, type = "b", xlab = "Tree size", ylab = "Deviance")

prune.TreeModel1 = prune.misclass(TreeModel_NFP, best = 3)
plot(prune.TreeModel1)
text(prune.TreeModel1, pretty = 0)

summary(TreeModel_NFP)

## 
## Classification tree:
## tree(formula = not.fully.paid ~ ., data = train_glmf, method = "class")
## Variables actually used in tree construction:
## [1] "int.rate"       "inq.last.6mths"
## Number of terminal nodes:  3 
## Residual mean deviance:  0.8415 = 5598 / 6653 
## Misclassification error rate: 0.1597 = 1063 / 6656

summary(prune.TreeModel1)

## 
## Classification tree:
## tree(formula = not.fully.paid ~ ., data = train_glmf, method = "class")
## Variables actually used in tree construction:
## [1] "int.rate"       "inq.last.6mths"
## Number of terminal nodes:  3 
## Residual mean deviance:  0.8415 = 5598 / 6653 
## Misclassification error rate: 0.1597 = 1063 / 6656

prune.pred = predict(prune.TreeModel1, test_glmf, type = "class")
tree.pred2 = predict(TreeModel_NFP, test_glmf, type = "vector")
tree.pred_cat = ifelse(tree.pred2[ , 2] > 0.25, 1, 0)

table(test_glmf$not.fully.paid, tree.pred_cat)

##    tree.pred_cat
##        0    1
##   0 1998  399
##   1  310  145

##——Random forrest - regression - Fico

## Decision tree - fico (continuous - 620, 700,800, 624, ... - Linear )

rf = randomForest (fico ~ ., data = train_lm, ntree = 200, nodesize = 2, importance = TRUE)


## rpart to generate a full tree by using cp = 0 (no penalty results in a fully grown tree)
## minsplit: the minimum number of data points required to attempt a split before it is forced to create a                   terminal node
## maxdepth: the maximum number of internal nodes between the root node and the terminal nodes.

pred_rf =  predict(rf, test_lm )
rmse_fico_rf = sqrt(mean((test_lm$fico - pred_rf)^2))
rmse_fico_rf

## [1] 17.38607

varImpPlot(rf)

##——Random forrest - Classification - Not fully paid

rf_class = randomForest(not.fully.paid ~ ., data = train_glm, mtry = 0, ntree = 20, nodesize = 10, importance = TRUE)

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range

rf_p_train <- predict(rf_class,newdata = train_glm , type = "class")
rf_pr_train <- prediction(rf_p_train, train_glm$not.fully.paid)
r_auc_train1 <- performance(rf_pr_train, measure = "auc")@y.values[[1]] 
r_auc_train1

## [1] 0.7987195

ROCRperf_rf = performance (rf_pr_train, "tpr", "fpr")
plot (ROCRperf_rf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))

13. Clustering analysis

Conduct clustering analysis using any method of your choice and try to find meaningful clusters. Interpret why the clusters make sense. (optional)

#km.out = kmeans(df3$fico, 2, nstart = 5)
#plot(df, col = km.out$cluster)

CE-Loan-Nakorn

Nakorn

7/2/2021