df0 = read.csv("loans.csv")
df = read.csv("loans.csv")
str(df)
## 'data.frame': 9578 obs. of 15 variables:
## $ customer.id : int 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 ...
## $ credit.policy : chr "1" "1" "1" "1" ...
## $ purpose : chr "debt_consolidation" "credit_card" "debt_consolidation" "debt_consolidation" ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : chr "19.48" "14.29" "11.63" "8.1" ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : chr "52.1" "76.7" "25.6" "73.2" ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : chr "0" "0" "0" "0" ...
## $ pub.rec : chr "0" "0" "0" "0" ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
summary(df)
## customer.id credit.policy purpose int.rate
## Min. :10001 Length:9578 Length:9578 Min. : 0.0600
## 1st Qu.:12395 Class :character Class :character 1st Qu.: 0.1039
## Median :14790 Mode :character Mode :character Median : 0.1221
## Mean :14790 Mean : 0.1255
## 3rd Qu.:17184 3rd Qu.: 0.1407
## Max. :19578 Max. :14.7000
##
## installment log.annual.inc dti fico
## Min. : 15.67 Min. : 7.548 Length:9578 Min. : 612.0
## 1st Qu.:163.77 1st Qu.:10.558 Class :character 1st Qu.: 682.0
## Median :268.95 Median :10.928 Mode :character Median : 707.0
## Mean :319.09 Mean :10.932 Mean : 711.2
## 3rd Qu.:432.76 3rd Qu.:11.290 3rd Qu.: 737.0
## Max. :940.14 Max. :14.528 Max. :1812.0
## NA's :5
## days.with.cr.line revol.bal revol.util inq.last.6mths
## Min. : 179 Min. : 0 Length:9578 Min. : 0.000
## 1st Qu.: 2820 1st Qu.: 3187 Class :character 1st Qu.: 0.000
## Median : 4140 Median : 8596 Mode :character Median : 1.000
## Mean : 4562 Mean : 16915 Mean : 1.572
## 3rd Qu.: 5730 3rd Qu.: 18252 3rd Qu.: 2.000
## Max. :17640 Max. :1207359 Max. :33.000
## NA's :29 NA's :1 NA's :30
## delinq.2yrs pub.rec not.fully.paid
## Length:9578 Length:9578 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000
## Mean :0.1601
## 3rd Qu.:0.0000
## Max. :1.0000
##
df\(credit.policy have [zero] x1 > convert done [one] x1 > convert done df\)pub.rec have [NO] x1 > convert done df\(log.annual.inc have NA, Blank > convert to NA and omit df\)dti have [six] x1 > convert done df\(days.with.cr.line have NA > convert to NA and omit df\)revol.bal have Blank > convert to NA and omit df\(revol.util have NA, [one] > convert done df\)inq.last.6mths have NA, Blank > convert to NA and omit df\(delinq.2yrs have NA, [yes] > convert done df\)pub.rec have NA, [NO] > convert done
df$int.rate have 14.7 , 13.25 which not correct > Delete done
df = replace(df, df == "one",1)
df = replace(df, df == "zero",0)
df = replace(df, df == "NO",0)
df = replace(df, df == "yes",1)
df = replace(df, df == "six",6)
df = replace(df, df == "?", NA)
df = na.omit(df)
df = subset(df, select= -c(customer.id))
df = subset.data.frame(df,df$int.rate<1 & df$fico <1000)
df = replace(df, df == "debt_consolidation", 1)
df$P_debt_consolidation = ifelse(df$purpose == 1,1,0)
df = replace(df, df == "credit_card", 2)
df$P_credit_card = ifelse(df$purpose == 2,1,0)
df = replace(df, df == "all_other", 3)
df$P_all_other = ifelse(df$purpose == 3,1,0)
df = replace(df, df == "home_improvement", 4)
df$P_home_improvement = ifelse(df$purpose == 4,1,0)
df = replace(df, df == "small_business", 5)
df$P_small_business = ifelse(df$purpose == 5,1,0)
df = replace(df, df == "major_purchase", 6)
df$P_major_purchase = ifelse(df$purpose == 6,1,0)
df = replace(df, df == "educational", 7)
df$P_educational = ifelse(df$purpose == 7,1,0)
##df = subset(df, select= -c(purpose))
test
# check outlines
df[sapply(df, is.character)] = lapply(df[sapply(df, is.character)], as.factor)
str(df)
## 'data.frame': 9508 obs. of 21 variables:
## $ credit.policy : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ purpose : Factor w/ 7 levels "1","2","3","4",..: 1 2 1 1 2 2 1 3 4 1 ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : Factor w/ 2527 levels "0","0.01","0.02",..: 1119 612 348 2341 678 877 1944 293 904 187 ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line : num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : Factor w/ 1036 levels "0","0.04","0.1",..: 513 784 212 749 365 502 785 691 503 186 ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : Factor w/ 11 levels "0","1","11","13",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ pub.rec : Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 1 1 2 1 1 1 ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
## $ P_debt_consolidation: num 1 0 1 1 0 0 1 0 0 1 ...
## $ P_credit_card : num 0 1 0 0 1 1 0 0 0 0 ...
## $ P_all_other : num 0 0 0 0 0 0 0 1 0 0 ...
## $ P_home_improvement : num 0 0 0 0 0 0 0 0 1 0 ...
## $ P_small_business : num 0 0 0 0 0 0 0 0 0 0 ...
## $ P_major_purchase : num 0 0 0 0 0 0 0 0 0 0 ...
## $ P_educational : num 0 0 0 0 0 0 0 0 0 0 ...
summary(df)
## credit.policy purpose int.rate installment log.annual.inc
## 0:1830 1:3945 Min. :0.0600 Min. : 15.67 Min. : 7.548
## 1:7678 2:1258 1st Qu.:0.1039 1st Qu.:164.02 1st Qu.:10.565
## 3:2289 Median :0.1221 Median :269.76 Median :10.933
## 4: 626 Mean :0.1227 Mean :320.21 Mean :10.934
## 5: 618 3rd Qu.:0.1407 3rd Qu.:435.40 3rd Qu.:11.290
## 6: 432 Max. :0.2164 Max. :940.14 Max. :14.528
## 7: 340
## dti fico days.with.cr.line revol.bal
## 0 : 86 Min. :612.0 Min. : 180 Min. : 0
## 0.6 : 16 1st Qu.:682.0 1st Qu.: 2820 1st Qu.: 3273
## 12 : 13 Median :707.0 Median : 4140 Median : 8688
## 13.16 : 13 Mean :710.8 Mean : 4567 Mean : 16987
## 15.1 : 13 3rd Qu.:737.0 3rd Qu.: 5730 3rd Qu.: 18345
## 19.2 : 13 Max. :827.0 Max. :17640 Max. :1207359
## (Other):9354
## revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
## 0 : 295 Min. : 0.000 0 :8396 0 :8952 Min. :0.0000
## 0.5 : 24 1st Qu.: 0.000 1 : 826 1 : 529 1st Qu.:0.0000
## 0.3 : 22 Median : 1.000 2 : 191 2 : 19 Median :0.0000
## 73.7 : 22 Mean : 1.574 3 : 65 3 : 5 Mean :0.1597
## 3.3 : 21 3rd Qu.: 2.000 4 : 18 4 : 1 3rd Qu.:0.0000
## 47.8 : 21 Max. :33.000 5 : 6 5 : 1 Max. :1.0000
## (Other):9103 (Other): 6 no: 1
## P_debt_consolidation P_credit_card P_all_other P_home_improvement
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4149 Mean :0.1323 Mean :0.2407 Mean :0.06584
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## P_small_business P_major_purchase P_educational
## Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000 Median :0.00000 Median :0.00000
## Mean :0.065 Mean :0.04544 Mean :0.03576
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.00000 Max. :1.00000
##
set df1 as numeric of df in order to make cor:
df1 = data.frame(df)
df1[sapply(df1, is.factor)] = lapply(df1[sapply(df1, is.factor)], as.numeric)
str(df1)
## 'data.frame': 9508 obs. of 21 variables:
## $ credit.policy : num 2 2 2 2 2 2 2 2 2 2 ...
## $ purpose : num 1 2 1 1 2 2 1 3 4 1 ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : num 1119 612 348 2341 678 ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line : num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : num 513 784 212 749 365 502 785 691 503 186 ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : num 1 1 1 1 2 1 1 1 1 1 ...
## $ pub.rec : num 1 1 1 1 1 1 2 1 1 1 ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
## $ P_debt_consolidation: num 1 0 1 1 0 0 1 0 0 1 ...
## $ P_credit_card : num 0 1 0 0 1 1 0 0 0 0 ...
## $ P_all_other : num 0 0 0 0 0 0 0 1 0 0 ...
## $ P_home_improvement : num 0 0 0 0 0 0 0 0 1 0 ...
## $ P_small_business : num 0 0 0 0 0 0 0 0 0 0 ...
## $ P_major_purchase : num 0 0 0 0 0 0 0 0 0 0 ...
## $ P_educational : num 0 0 0 0 0 0 0 0 0 0 ...
cor_matrix = cor(df1, method = "spearman")
cor_matrix
## credit.policy purpose int.rate installment
## credit.policy 1.0000000000 -0.016163243 -0.29922247 0.067491035
## purpose -0.0161632433 1.000000000 -0.09928159 -0.182333107
## int.rate -0.2992224717 -0.099281592 1.00000000 0.242631447
## installment 0.0674910354 -0.182333107 0.24263145 1.000000000
## log.annual.inc 0.0286474657 0.019648270 0.04404270 0.432329014
## dti -0.0162452831 -0.027347376 0.02911917 -0.007310878
## fico 0.3584555240 0.154259175 -0.74718746 0.084843434
## days.with.cr.line 0.1125471337 -0.015164185 -0.13364416 0.200905272
## revol.bal -0.0205474651 -0.231370464 0.14771106 0.346562909
## revol.util -0.0695959186 -0.193042477 0.39482798 0.080842359
## inq.last.6mths -0.4318228891 0.063312657 0.17903857 -0.004468728
## delinq.2yrs -0.0588612939 0.004056783 0.17340105 -0.007928289
## pub.rec -0.0492471260 -0.041899617 0.09588996 -0.026970454
## not.fully.paid -0.1615671719 0.042274849 0.15259239 0.041371573
## P_debt_consolidation 0.0158603644 -0.893813193 0.13266839 0.201333421
## P_credit_card 0.0008870614 -0.026821593 -0.04563619 0.006639555
## P_all_other -0.0146246685 0.342401111 -0.11823802 -0.226672310
## P_home_improvement 0.0037498403 0.309081287 -0.05223355 0.014932395
## P_small_business -0.0054691112 0.369541322 0.12815333 0.124572630
## P_major_purchase 0.0232461523 0.349490831 -0.06674728 -0.091850569
## P_educational -0.0338489284 0.336856931 -0.02203136 -0.106749245
## log.annual.inc dti fico days.with.cr.line
## credit.policy 0.028647466 -0.016245283 0.358455524 0.11254713
## purpose 0.019648270 -0.027347376 0.154259175 -0.01516419
## int.rate 0.044042698 0.029119172 -0.747187460 -0.13364416
## installment 0.432329014 -0.007310878 0.084843434 0.20090527
## log.annual.inc 1.000000000 0.004640402 0.105031553 0.39843259
## dti 0.004640402 1.000000000 -0.024115760 0.02579815
## fico 0.105031553 -0.024115760 1.000000000 0.25079899
## days.with.cr.line 0.398432592 0.025798152 0.250798993 1.00000000
## revol.bal 0.417874687 0.063032290 -0.095392931 0.32368390
## revol.util 0.062705436 0.036064542 -0.409084190 0.01312478
## inq.last.6mths 0.031006619 -0.008345033 -0.176888073 -0.04233460
## delinq.2yrs 0.030011419 -0.001526905 -0.237049054 0.09530589
## pub.rec 0.012733237 0.019297467 -0.149090010 0.10033473
## not.fully.paid -0.033219039 0.008954960 -0.146916680 -0.02556690
## P_debt_consolidation -0.039963448 0.026759035 -0.145136472 0.00377366
## P_credit_card 0.080749340 0.003626599 -0.002884949 0.05086867
## P_all_other -0.076930921 -0.020059170 0.059301122 -0.07107850
## P_home_improvement 0.115563397 -0.009124255 0.083982881 0.07567048
## P_small_business 0.093334405 -0.005533118 0.064942701 0.03699011
## P_major_purchase -0.041573804 -0.011677143 0.059422296 -0.02817956
## P_educational -0.095804357 0.001193172 -0.011195196 -0.05774714
## revol.bal revol.util inq.last.6mths delinq.2yrs
## credit.policy -0.020547465 -0.06959592 -0.431822889 -0.0588612939
## purpose -0.231370464 -0.19304248 0.063312657 0.0040567828
## int.rate 0.147711064 0.39482798 0.179038574 0.1734010479
## installment 0.346562909 0.08084236 -0.004468728 -0.0079282887
## log.annual.inc 0.417874687 0.06270544 0.031006619 0.0300114187
## dti 0.063032290 0.03606454 -0.008345033 -0.0015269046
## fico -0.095392931 -0.40908419 -0.176888073 -0.2370490544
## days.with.cr.line 0.323683905 0.01312478 -0.042334603 0.0953058892
## revol.bal 1.000000000 0.40480070 -0.018620415 -0.0549307994
## revol.util 0.404800697 1.00000000 -0.017866928 -0.0294949073
## inq.last.6mths -0.018620415 -0.01786693 1.000000000 0.0201724639
## delinq.2yrs -0.054930799 -0.02949491 0.020172464 1.0000000000
## pub.rec -0.026008365 0.05803940 0.058052758 0.0017620569
## not.fully.paid 0.021789714 0.05518818 0.133370677 0.0147114210
## P_debt_consolidation 0.175763917 0.16122353 -0.038436981 -0.0018603797
## P_credit_card 0.153992538 0.06625554 -0.042582187 -0.0113995073
## P_all_other -0.199252296 -0.10682264 0.009305541 0.0142434370
## P_home_improvement -0.041272508 -0.08849297 0.052120662 -0.0098447523
## P_small_business 0.001951696 -0.04303562 0.044796715 0.0007633006
## P_major_purchase -0.125488357 -0.07786746 0.006632473 -0.0014745587
## P_educational -0.095326847 -0.04008113 0.021741160 0.0067313051
## pub.rec not.fully.paid P_debt_consolidation
## credit.policy -0.0492471260 -0.161567172 0.01586036
## purpose -0.0418996166 0.042274849 -0.89381319
## int.rate 0.0958899618 0.152592385 0.13266839
## installment -0.0269704540 0.041371573 0.20133342
## log.annual.inc 0.0127332373 -0.033219039 -0.03996345
## dti 0.0192974667 0.008954960 0.02675904
## fico -0.1490900098 -0.146916680 -0.14513647
## days.with.cr.line 0.1003347325 -0.025566898 0.00377366
## revol.bal -0.0260083651 0.021789714 0.17576392
## revol.util 0.0580393951 0.055188184 0.16122353
## inq.last.6mths 0.0580527580 0.133370677 -0.03843698
## delinq.2yrs 0.0017620569 0.014711421 -0.00186038
## pub.rec 1.0000000000 0.058682336 0.03900768
## not.fully.paid 0.0586823359 1.000000000 -0.01797229
## P_debt_consolidation 0.0390076776 -0.017972286 1.00000000
## P_credit_card 0.0074384874 -0.047326552 -0.32883824
## P_all_other -0.0322863373 0.009100103 -0.47419105
## P_home_improvement 0.0008380379 0.005853833 -0.22356343
## P_small_business -0.0039592890 0.085415242 -0.22203034
## P_major_purchase -0.0238199764 -0.027535311 -0.18372303
## P_educational -0.0118868597 0.022757838 -0.16217023
## P_credit_card P_all_other P_home_improvement
## credit.policy 0.0008870614 -0.014624669 0.0037498403
## purpose -0.0268215934 0.342401111 0.3090812873
## int.rate -0.0456361945 -0.118238016 -0.0522335467
## installment 0.0066395554 -0.226672310 0.0149323948
## log.annual.inc 0.0807493403 -0.076930921 0.1155633970
## dti 0.0036265986 -0.020059170 -0.0091242551
## fico -0.0028849487 0.059301122 0.0839828811
## days.with.cr.line 0.0508686721 -0.071078499 0.0756704755
## revol.bal 0.1539925385 -0.199252296 -0.0412725080
## revol.util 0.0662555353 -0.106822638 -0.0884929705
## inq.last.6mths -0.0425821868 0.009305541 0.0521206615
## delinq.2yrs -0.0113995073 0.014243437 -0.0098447523
## pub.rec 0.0074384874 -0.032286337 0.0008380379
## not.fully.paid -0.0473265520 0.009100103 0.0058538335
## P_debt_consolidation -0.3288382438 -0.474191046 -0.2235634258
## P_credit_card 1.0000000000 -0.219886072 -0.1036680975
## P_all_other -0.2198860723 1.000000000 -0.1494913823
## P_home_improvement -0.1036680975 -0.149491382 1.0000000000
## P_small_business -0.1029571953 -0.148466248 -0.0699963089
## P_major_purchase -0.0851937973 -0.122851087 -0.0579197144
## P_educational -0.0751995951 -0.108439257 -0.0511250727
## P_small_business P_major_purchase P_educational
## credit.policy -0.0054691112 0.023246152 -0.033848928
## purpose 0.3695413218 0.349490831 0.336856931
## int.rate 0.1281533343 -0.066747278 -0.022031356
## installment 0.1245726297 -0.091850569 -0.106749245
## log.annual.inc 0.0933344050 -0.041573804 -0.095804357
## dti -0.0055331184 -0.011677143 0.001193172
## fico 0.0649427007 0.059422296 -0.011195196
## days.with.cr.line 0.0369901069 -0.028179562 -0.057747140
## revol.bal 0.0019516958 -0.125488357 -0.095326847
## revol.util -0.0430356156 -0.077867465 -0.040081128
## inq.last.6mths 0.0447967146 0.006632473 0.021741160
## delinq.2yrs 0.0007633006 -0.001474559 0.006731305
## pub.rec -0.0039592890 -0.023819976 -0.011886860
## not.fully.paid 0.0854152419 -0.027535311 0.022757838
## P_debt_consolidation -0.2220303435 -0.183723032 -0.162170229
## P_credit_card -0.1029571953 -0.085193797 -0.075199595
## P_all_other -0.1484662477 -0.122851087 -0.108439257
## P_home_improvement -0.0699963089 -0.057919714 -0.051125073
## P_small_business 1.0000000000 -0.057522531 -0.050774483
## P_major_purchase -0.0575225310 1.000000000 -0.042014267
## P_educational -0.0507744834 -0.042014267 1.000000000
corrplot.mixed(cor_matrix)
plot(df$fico,df$days.with.cr.line)
plot(df$fico,df$log.annual.inc)
plot(df$fico,df$int.rate)
plot(df$fico,df$dti)
FICO credit scores are a method of quantifying and evaluating an individuals creditworthiness. 300-329 - BAD 630-689 - Fair 690-719 - Good 720-850 - Excellent
Adjusted R squared is thr measuring impact of independent variables on the correlation, only increases if the new predictor enhances the model above what would be obtained by probability. Conversely, it will decrease when a predictor improves the model less than what is predicted by chance.
from the result:
lm1 dependent fico with most independent x12
Adjust r = 0.6768 > rmse = 21.38428
lm2 dependent fico with significant independent only x8
Adjust r = 0.6411 > rmse = 22.81494
lm3 dependent fico with most independent - highest significant x11
Adjust r = 0.3884 > rmse = 28.88878
lm2 is slightly better than lm1 by 0.01 but both of them have same adjust r. less rmse mean model have higher correlation coefficient.
if the correlation coefficient is 1, the RMSE will be 0, because all of the points lie on the regression line (and therefore there are no errors)
check significant value by looking at T-value that far away from 0 as much as possible. which could indicate a relationship exists.
# linear regression models
set.seed(150)
index = sample.split(df$fico, SplitRatio = 0.70) # split ratio 70%
train_lm = subset(df1, index == TRUE) # create the train data set
test_lm = subset(df1, index == FALSE) # create the test data set
lm1 = lm(fico ~.-not.fully.paid -pub.rec -P_educational -P_major_purchase, train_lm)
## fico with most independent
summary(lm1)
##
## Call:
## lm(formula = fico ~ . - not.fully.paid - pub.rec - P_educational -
## P_major_purchase, data = train_lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.156 -14.368 -2.354 12.052 98.804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.322e+02 1.360e+01 61.184 < 2e-16 ***
## credit.policy 1.059e+01 8.534e-01 12.405 < 2e-16 ***
## purpose -4.176e+00 1.846e+00 -2.263 0.02369 *
## int.rate -9.512e+02 1.235e+01 -76.990 < 2e-16 ***
## installment 4.618e-02 1.564e-03 29.521 < 2e-16 ***
## log.annual.inc -2.004e-01 5.305e-01 -0.378 0.70566
## dti -3.353e-05 3.491e-04 -0.096 0.92350
## days.with.cr.line 1.956e-03 1.169e-04 16.731 < 2e-16 ***
## revol.bal -7.086e-06 8.931e-06 -0.793 0.42756
## revol.util -1.917e-02 1.010e-03 -18.976 < 2e-16 ***
## inq.last.6mths 1.247e-01 1.413e-01 0.882 0.37760
## delinq.2yrs -5.437e+00 3.363e-01 -16.168 < 2e-16 ***
## P_debt_consolidation -2.914e+01 1.012e+01 -2.880 0.00399 **
## P_credit_card -2.412e+01 8.301e+00 -2.906 0.00368 **
## P_all_other -1.441e+01 6.457e+00 -2.232 0.02563 *
## P_home_improvement -8.945e+00 4.722e+00 -1.894 0.05824 .
## P_small_business 8.829e+00 3.040e+00 2.904 0.00369 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.57 on 6639 degrees of freedom
## Multiple R-squared: 0.6776, Adjusted R-squared: 0.6768
## F-statistic: 872.1 on 16 and 6639 DF, p-value: < 2.2e-16
test_lm$predicted_fico1 = predict(lm1, test_lm)
rmse(actual = test_lm$fico, predicted = test_lm$predicted_fico1)
## [1] 21.38428
lm2 = lm(fico ~. -log.annual.inc -dti -revol.bal -inq.last.6mths-pub.rec
-P_educational-P_home_improvement -P_major_purchase -P_debt_consolidation
-days.with.cr.line -revol.util -P_credit_card , train_lm)
## fico with significant independent only
summary(lm2)
##
## Call:
## lm(formula = fico ~ . - log.annual.inc - dti - revol.bal - inq.last.6mths -
## pub.rec - P_educational - P_home_improvement - P_major_purchase -
## P_debt_consolidation - days.with.cr.line - revol.util - P_credit_card,
## data = train_lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -97.384 -14.777 -2.923 12.286 102.741
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.064e+02 2.306e+00 349.707 < 2e-16 ***
## credit.policy 9.729e+00 7.541e-01 12.901 < 2e-16 ***
## purpose 1.849e+00 1.836e-01 10.076 < 2e-16 ***
## int.rate -1.071e+03 1.167e+01 -91.699 < 2e-16 ***
## installment 5.333e-02 1.447e-03 36.849 < 2e-16 ***
## delinq.2yrs -4.042e+00 3.456e-01 -11.696 < 2e-16 ***
## not.fully.paid -5.219e+00 7.702e-01 -6.775 1.35e-11 ***
## P_all_other 4.012e+00 6.790e-01 5.908 3.62e-09 ***
## P_small_business 1.741e+01 1.293e+00 13.460 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.56 on 6647 degrees of freedom
## Multiple R-squared: 0.6469, Adjusted R-squared: 0.6464
## F-statistic: 1522 on 8 and 6647 DF, p-value: < 2.2e-16
test_lm$predicted_fico2 = predict(lm2, test_lm)
rmse(actual = test_lm$fico, predicted = test_lm$predicted_fico2)
## [1] 22.59684
lm3 = lm(fico ~.-not.fully.paid -pub.rec -int.rate -P_educational-P_major_purchase , train_lm)
## fico with most independent - highest significant
summary(lm3)
##
## Call:
## lm(formula = fico ~ . - not.fully.paid - pub.rec - int.rate -
## P_educational - P_major_purchase, data = train_lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.826 -21.414 -1.241 19.896 108.666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.157e+02 1.860e+01 38.487 < 2e-16 ***
## credit.policy 2.544e+01 1.144e+00 22.249 < 2e-16 ***
## purpose -6.554e+00 2.539e+00 -2.581 0.009862 **
## installment 9.903e-03 2.052e-03 4.825 1.43e-06 ***
## log.annual.inc 1.269e+00 7.293e-01 1.740 0.081865 .
## dti -3.091e-04 4.802e-04 -0.644 0.519862
## days.with.cr.line 3.464e-03 1.586e-04 21.841 < 2e-16 ***
## revol.bal 2.907e-05 1.227e-05 2.369 0.017867 *
## revol.util -4.978e-02 1.278e-03 -38.961 < 2e-16 ***
## inq.last.6mths -7.036e-01 1.939e-01 -3.629 0.000287 ***
## delinq.2yrs -1.021e+01 4.547e-01 -22.459 < 2e-16 ***
## P_debt_consolidation -4.358e+01 1.392e+01 -3.132 0.001745 **
## P_credit_card -3.242e+01 1.142e+01 -2.839 0.004532 **
## P_all_other -2.149e+01 8.882e+00 -2.419 0.015571 *
## P_home_improvement -1.450e+01 6.496e+00 -2.232 0.025636 *
## P_small_business -7.890e+00 4.171e+00 -1.892 0.058581 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.67 on 6640 degrees of freedom
## Multiple R-squared: 0.3898, Adjusted R-squared: 0.3884
## F-statistic: 282.7 on 15 and 6640 DF, p-value: < 2.2e-16
test_lm$predicted_fico3 = predict(lm3, test_lm)
rmse(actual = test_lm$fico, predicted = test_lm$predicted_fico3)
## [1] 28.88878
Baseline module has an accuracy of (7990/(7990+1518)) = 84 %
In data set baseline
1-Not fully paid back - 1517
0-fully paid back - 7990
Total = 9507
table(df$not.fully.paid)
##
## 0 1
## 7990 1518
ggplot(df, aes(int.rate, as.factor(not.fully.paid))) + geom_boxplot() + coord_flip()
ggplot(df, aes(inq.last.6mths, as.factor(not.fully.paid))) + geom_boxplot() + coord_flip()
Randomly split
set.seed(42)
split = sample.split (df1$not.fully.paid, SplitRatio = 0.70)
train_glm = subset(df1, split == TRUE)
test_glm = subset(df1, split == FALSE)
#checking baseline [train and test set] (result same)
table(train_glm$not.fully.paid)
##
## 0 1
## 5593 1063
nrow(train_glm)
## [1] 6656
(5593/(5593+1063))
## [1] 0.8402945
table(test_glm$not.fully.paid)
##
## 0 1
## 2397 455
nrow(test_glm)
## [1] 2852
(2397/(2397+445))
## [1] 0.8434201
model1 = glm(not.fully.paid ~ ., family = binomial, train_glm)
summary(model1)
##
## Call:
## glm(formula = not.fully.paid ~ ., family = binomial, data = train_glm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9423 -0.6169 -0.4978 -0.3769 2.4416
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.744e+00 2.273e+00 2.087 0.03687 *
## credit.policy -3.210e-01 1.016e-01 -3.160 0.00158 **
## purpose 4.581e-01 2.423e-01 1.891 0.05868 .
## int.rate 3.132e+00 2.058e+00 1.522 0.12794
## installment 1.069e-03 2.090e-04 5.115 3.13e-07 ***
## log.annual.inc -3.557e-01 7.109e-02 -5.003 5.63e-07 ***
## dti 1.858e-05 4.696e-05 0.396 0.69242
## fico -8.536e-03 1.635e-03 -5.222 1.77e-07 ***
## days.with.cr.line 1.790e-05 1.555e-05 1.151 0.24977
## revol.bal 1.838e-06 1.094e-06 1.680 0.09291 .
## revol.util 7.554e-05 1.359e-04 0.556 0.57838
## inq.last.6mths 8.329e-02 1.643e-02 5.069 4.01e-07 ***
## delinq.2yrs -9.118e-02 4.390e-02 -2.077 0.03782 *
## pub.rec 2.722e-01 1.122e-01 2.425 0.01531 *
## P_debt_consolidation 2.155e+00 1.354e+00 1.592 0.11146
## P_credit_card 1.587e+00 1.116e+00 1.422 0.15509
## P_all_other 1.622e+00 8.727e-01 1.858 0.06314 .
## P_home_improvement 1.117e+00 6.444e-01 1.733 0.08310 .
## P_small_business 1.122e+00 4.131e-01 2.715 0.00663 **
## P_major_purchase NA NA NA NA
## P_educational NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5846.4 on 6655 degrees of freedom
## Residual deviance: 5480.4 on 6637 degrees of freedom
## AIC: 5518.4
##
## Number of Fisher Scoring iterations: 5
Predict the probability of the test set loans not being paid back in full. Store these values in a variable named PredictedRisk and add it to your test set.
-What is the accuracy of the logistic regression model on the test set using a threshold of 0.25?
0.80 or 80 % = (2179+112)/(2179+218+343+112)
-What is the AUC of the model?
0.68 or 68 % (result in Q7)
PredictedRisk = predict(model1, type = "response", test_glm)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
PredictedRisk_cat = ifelse(PredictedRisk > 0.25,1,0)
table_acc = table(test_glm$not.fully.paid, PredictedRisk_cat)
table_acc
## PredictedRisk_cat
## 0 1
## 0 2170 227
## 1 319 136
accuracy_PredictedRisk = round(sum(diag(table_acc))/ sum(table_acc),3)
print("accuracy of the logistic regression model")
## [1] "accuracy of the logistic regression model"
accuracy_PredictedRisk
## [1] 0.809
#manual Cal Accuracy of glm
round((2179+112)/(2179+218+343+112),3)
## [1] 0.803
[ANS] Threshold values is 0.18 at maximum False positive at 25%
ROC - Receiver Operator Characteristic
# ROC will show us the overall performance of the model
ROCRpred = prediction (PredictedRisk, test_glm$not.fully.paid )
ROCRperf = performance (ROCRpred, "tpr", "fpr")
as.numeric (performance (ROCRpred, "auc") @y.values) # higher auc value is better
## [1] 0.6901796
plot (ROCRperf,
colorize = TRUE,
main ="ROC CURVE",
print.cutoffs.at = seq (0, 1, by = 0.01535),
text.adj = c(-0.2, 1.7)
)
abline(h=0.493, v=0.25)
Can you create a simpler model with very much similar evaluation metrics? Use this new model to make predictions for observations in the test set.
In data set (total = 2852 obs)
1-Not fully paid back = 458 obs
0-fully paid back = 2394 obs
Predicted would not fully paid back threshold of 0.25 (total = 2852 obs)
1-Not fully paid back = 397 obs
0-fully paid back = 2455 obs
AUC of this module at 0.25 threshold is 0.70 or 70%
and predicted of people who not paid in full is 414/2852 = 0.15 or 15%
table(test_lm$not.fully.paid)
##
## 0 1
## 2394 458
model2 = glm(not.fully.paid ~ ., family = binomial, train_glm)
PredictedRisk_Q8 = predict(model2, type = "response", test_lm)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
PredictedRisk_cat_Q8 = ifelse(PredictedRisk_Q8 > 0.25,1,0)
table_acc_Q8 = table(test_lm$not.fully.paid, PredictedRisk_cat_Q8)
table_acc_Q8
## PredictedRisk_cat_Q8
## 0 1
## 0 2133 261
## 1 305 153
ROCRpred_Q8 = prediction (PredictedRisk_Q8, test_lm$not.fully.paid )
ROCRperf_Q8 = performance (ROCRpred_Q8, "tpr", "fpr")
as.numeric (performance (ROCRpred_Q8, "auc") @y.values) # higher auc value is better
## [1] 0.7066091
plot (ROCRperf_Q8,
colorize = TRUE,
main ="ROC CURVE",
print.cutoffs.at = seq (0, 1, by = 0.05),
text.adj = c(-0.2, 1.7)
)
abline(h=0.332, v=0.111)
Compute the profit of a $1 investment in each loan, save your result to a variable named Profit.
Hint: Carefully think about the profit if the loan defaults and if the loan is paid back in full. This then allows you to determine the expected profit.
Analyze now a strategy in which the investor purchases loans with a high interest rate (15% or higher) to maximize return, but among these loans selects the ones with the lowest predicted risk of not being paid back in full.
[ANS]
1.Int.rate > 15% and Fully.paid.back get [610$],
select lowest predicted risk Not.paid.back <= 30% get [-194$]
Best case get 610 + 194 = 804$
worst case get 610 - 194 = 416$
different 388$
2.Int.rate > 15% and Fully.paid.back get [610$],
select lowest predicted risk Not.paid.back <= 25% get [-133$]
Best case get 610 + 133 = 743$
worst case get 610 - 133 = 477$
different 266$
3.Int.rate > 15% and Fully.paid.back get [610$],
select lowest predicted risk Not.paid.back <= 20% get [-72$]
Best case get 610 + 72 = 682$
worst case get 610 - 72 = 538$
different 144$
df2 = subset.data.frame(df1)
df2$profit = (1+ df1$int.rate)^3 - 1
df2$profit[ df1$not.fully.paid == 1] = -1
df2$pred = predict(model1,df2, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
profit_15_get = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 0 )
sum(profit_15_get$profit)
## [1] 610.542
profit_15_Notget = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 1 & df2$pred <= 0.30)
X=sum(profit_15_Notget$profit)
X
## [1] -194
profit_15_Notget = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 1 & df2$pred <= 0.25)
X1=sum(profit_15_Notget$profit)
X1
## [1] -133
profit_15_Notget = subset.data.frame(df2,df2$int.rate >0.15 & df2$not.fully.paid == 1 & df2$pred <= 0.20)
X2=sum(profit_15_Notget$profit)
X2
## [1] -72
Create a dataset called [HighInterest] consisting of the test set loans with an interest rate of at least 15%.
Not fully paid back which have Int.rate >15 = 345 case = proportion = 24.6 %
fully paid back which have Int.rate >15 = 1,054 case = proportion = 75.3 %
HighInterest = subset.data.frame(df1)
HighInterest$profit = (1+ df1$int.rate)^3 - 1
HighInterest$profit[ df1$not.fully.paid == 1] = -1
HighInterest = HighInterest[which(HighInterest$int.rate >0.15),]
z = mean(HighInterest$profit)
cat(" average profit is",z)
## average profit is 0.1898084
summary(HighInterest$profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0000 0.5213 0.5468 0.1898 0.5873 0.7808
X = HighInterest[which(HighInterest$not.fully.paid ==1),]
y = HighInterest[which(HighInterest$not.fully.paid ==0),]
Next, sort the loans in the [Highlnterest] dataset by the variable [PredictedRisk.]
Create a new dataset called [SelectedLoans] that consists of 100 loans with the lowest [PredictedRisk.]
What is the profit to an investor who invested $1 in each of these 100 loans?
investor will get 26$ from 100 loan that have lowest Predicted Risk rate.
How does this compare to investing in all loans?
option 1 > investor will get 26$ from 100 loan which come from lowest Predicted Risk rate and high interest rate.
option 2 > investor will get 20$ from 100 loan which come from random data not specific.
that make it 6$ different from 100 cases , So choosing loan from 100 lowest risk is better than randomly pick.
HighInterest$PredictedRisk = predict(model1,HighInterest, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
HighInterest$PredictedRisk = sort(HighInterest$PredictedRisk)
SelectedLoans_Low = subset.data.frame(HighInterest)
SelectedLoans_Low = head(SelectedLoans_Low,100)
sum(SelectedLoans_Low$profit)
## [1] 25.65067
SelectedLoans_random = subset.data.frame(df2)
SelectedLoans_random = head(SelectedLoans_random,100)
sum(SelectedLoans_random$profit)
## [1] 19.58962
Create and optimize regression and classification models using both a single decision tree and a random forest (or a boosted decision tree).
Compare these models with the models you created at Task 3 and Task 6 by using rmse value for regression, and AUC for classification.
FICO prediction
Decision tree - [Fico]——-RMSE = 22.93
Linear regression - [Fico]—RMSE = 21.38
Random forrest - [Fico]——RMSE = 17.34 !!! Win No.1
!!When there are large number of features with less data-sets(with low noise), linear regressions may outperform Decision trees/random forests. In general cases, Decision trees will be having better average accuracy.
Not fully paid prediction
Decision tree - Not fully paid prediction———AUC = 68.0 %
Logistic regression - Not fully paid prediction—AUC = 60.6 %
Random forrest - Not fully paid prediction——–AUC = 79.2 % !!! WIN
but [The response has five or fewer unique values. Are you sure you want to do regression?]
## Decision tree - fico (continuous - 620, 700,800, 624, ... - Linear )
Tree_fico = rpart(fico ~ .,train_lm)
printcp(Tree_fico)
##
## Regression tree:
## rpart(formula = fico ~ ., data = train_lm)
##
## Variables actually used in tree construction:
## [1] installment int.rate P_small_business
##
## Root node error: 9581443/6656 = 1439.5
##
## n= 6656
##
## CP nsplit rel error xerror xstd
## 1 0.411887 0 1.00000 1.00043 0.0153868
## 2 0.094183 1 0.58811 0.59283 0.0112220
## 3 0.039921 2 0.49393 0.49318 0.0103565
## 4 0.026514 3 0.45401 0.46014 0.0100568
## 5 0.018855 4 0.42749 0.43092 0.0096806
## 6 0.016429 5 0.40864 0.41679 0.0089073
## 7 0.011134 6 0.39221 0.39740 0.0082037
## 8 0.010000 8 0.36994 0.37744 0.0081526
plotcp(Tree_fico, upper = "splits")
optimal_tree_lm <- rpart(
formula = fico ~ .,
data = train_lm,
method = "anova",
control = list(minsplit = 9, maxdepth = 5, cp = 0.011)
)
## rpart to generate a full tree by using cp = 0 (no penalty results in a fully grown tree)
## minsplit: the minimum number of data points required to attempt a split before it is forced to create a terminal node
## maxdepth: the maximum number of internal nodes between the root node and the terminal nodes.
pred = predict(optimal_tree_lm, test_lm )
rmse_fico = sqrt(mean((test_lm$fico - pred)^2))
rmse_fico
## [1] 22.93026
rpart.plot(optimal_tree_lm)
rpart.rules(optimal_tree_lm)
## fico
## 678 when int.rate >= 0.132 & P_small_business is 0
## 696 when int.rate is 0.119 to 0.132 & installment < 499
## 707 when int.rate >= 0.132 & P_small_business is 1
## 711 when int.rate is 0.096 to 0.119 & installment < 328
## 722 when int.rate is 0.119 to 0.132 & installment >= 499
## 737 when int.rate is 0.096 to 0.119 & installment >= 328
## 738 when int.rate is 0.082 to 0.096 & installment < 236
## 761 when int.rate is 0.082 to 0.096 & installment >= 236
## 767 when int.rate < 0.082
## preparing df3
df3 = data.frame(df1)
df3$not.fully.paid = as.factor(df3$not.fully.paid )
df3$inq.last.6mths = as.numeric(df3$inq.last.6mths)
df3$delinq.2yrs = as.numeric(df3$delinq.2yrs)
df3$purpose = as.factor(df3$purpose)
df3$P_debt_consolidation = as.factor(df3$P_debt_consolidation)
df3$P_credit_card = as.factor(df3$P_credit_card )
df3$P_all_other = as.factor(df3$P_all_other )
df3$P_all_other = as.factor(df3$P_all_other)
df3$P_home_improvement = as.factor(df3$P_home_improvement)
df3$P_small_business = as.factor(df3$P_small_business)
df3$P_major_purchase = as.factor(df3$P_major_purchase )
df3$P_educational = as.factor(df3$P_educational)
#df3$inq.last.6mths = as.factor(df3$inq.last.6mths)
#df3$delinq.2yrs = as.factor(df3$delinq.2yrs )
df3 <- subset(df3, select = -c(1,15:21))
str(df3)
## 'data.frame': 9508 obs. of 13 variables:
## $ purpose : Factor w/ 7 levels "1","2","3","4",..: 1 2 1 1 2 2 1 3 4 1 ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : num 1119 612 348 2341 678 ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : num 513 784 212 749 365 502 785 691 503 186 ...
## $ inq.last.6mths : num 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : num 1 1 1 1 2 1 1 1 1 1 ...
## $ pub.rec : num 1 1 1 1 1 1 2 1 1 1 ...
## $ not.fully.paid : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 1 1 ...
## Decision tree - Not.fully.paid ( category = 0,1 - Classification )
set.seed(42)
split = sample.split (df3$not.fully.paid, SplitRatio = 0.70)
train_glmf = subset(df3, split == TRUE)
test_glmf = subset(df3, split == FALSE)
TreeModel_NFP = tree(not.fully.paid ~ ., data = train_glmf, method = "class" )
summary(TreeModel_NFP)
##
## Classification tree:
## tree(formula = not.fully.paid ~ ., data = train_glmf, method = "class")
## Variables actually used in tree construction:
## [1] "int.rate" "inq.last.6mths"
## Number of terminal nodes: 3
## Residual mean deviance: 0.8415 = 5598 / 6653
## Misclassification error rate: 0.1597 = 1063 / 6656
# plot the tree model
plot (TreeModel_NFP)
text (TreeModel_NFP)
tree.pred = predict(TreeModel_NFP, test_glmf, type = "class")
tree.pred2 = predict(TreeModel_NFP, test_glmf, type = "vector")
tree.pred_cat = ifelse(tree.pred2[ , 2] > 0.25, 1, 0)
table(test_glmf$not.fully.paid, tree.pred_cat)
## tree.pred_cat
## 0 1
## 0 1998 399
## 1 310 145
# ROC will show us the overall performance of the model
ROCRpred = prediction (tree.pred2[,2], test_glmf$not.fully.paid)
as.numeric (performance (ROCRpred, "auc") @y.values)
## [1] 0.606758
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.05), text.adj = c(-0.8, 1.7))
cv.TreeModel1 = cv.tree(TreeModel_NFP, FUN = prune.misclass)
cv.TreeModel1
## $size
## [1] 3 1
##
## $dev
## [1] 1063 1063
##
## $k
## [1] -Inf 0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.TreeModel1$size, cv.TreeModel1$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
prune.TreeModel1 = prune.misclass(TreeModel_NFP, best = 3)
plot(prune.TreeModel1)
text(prune.TreeModel1, pretty = 0)
summary(TreeModel_NFP)
##
## Classification tree:
## tree(formula = not.fully.paid ~ ., data = train_glmf, method = "class")
## Variables actually used in tree construction:
## [1] "int.rate" "inq.last.6mths"
## Number of terminal nodes: 3
## Residual mean deviance: 0.8415 = 5598 / 6653
## Misclassification error rate: 0.1597 = 1063 / 6656
summary(prune.TreeModel1)
##
## Classification tree:
## tree(formula = not.fully.paid ~ ., data = train_glmf, method = "class")
## Variables actually used in tree construction:
## [1] "int.rate" "inq.last.6mths"
## Number of terminal nodes: 3
## Residual mean deviance: 0.8415 = 5598 / 6653
## Misclassification error rate: 0.1597 = 1063 / 6656
prune.pred = predict(prune.TreeModel1, test_glmf, type = "class")
tree.pred2 = predict(TreeModel_NFP, test_glmf, type = "vector")
tree.pred_cat = ifelse(tree.pred2[ , 2] > 0.25, 1, 0)
table(test_glmf$not.fully.paid, tree.pred_cat)
## tree.pred_cat
## 0 1
## 0 1998 399
## 1 310 145
##——Random forrest - regression - Fico
## Decision tree - fico (continuous - 620, 700,800, 624, ... - Linear )
rf = randomForest (fico ~ ., data = train_lm, ntree = 200, nodesize = 2, importance = TRUE)
## rpart to generate a full tree by using cp = 0 (no penalty results in a fully grown tree)
## minsplit: the minimum number of data points required to attempt a split before it is forced to create a terminal node
## maxdepth: the maximum number of internal nodes between the root node and the terminal nodes.
pred_rf = predict(rf, test_lm )
rmse_fico_rf = sqrt(mean((test_lm$fico - pred_rf)^2))
rmse_fico_rf
## [1] 17.38607
varImpPlot(rf)
##——Random forrest - Classification - Not fully paid
rf_class = randomForest(not.fully.paid ~ ., data = train_glm, mtry = 0, ntree = 20, nodesize = 10, importance = TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf_p_train <- predict(rf_class,newdata = train_glm , type = "class")
rf_pr_train <- prediction(rf_p_train, train_glm$not.fully.paid)
r_auc_train1 <- performance(rf_pr_train, measure = "auc")@y.values[[1]]
r_auc_train1
## [1] 0.7987195
ROCRperf_rf = performance (rf_pr_train, "tpr", "fpr")
plot (ROCRperf_rf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))
Conduct clustering analysis using any method of your choice and try to find meaningful clusters. Interpret why the clusters make sense. (optional)
#km.out = kmeans(df3$fico, 2, nstart = 5)
#plot(df, col = km.out$cluster)