This loan data set lists the outcomes of n= 5611 loans. This data set comes from the consumer-to-consumer lending market where borrows post loan listings and lenders invest in those loans by bidding on the borrower’s loan rates.
library(leaps)
library(nutshell)
## Loading required package: nutshell.bbdb
## Loading required package: nutshell.audioscrobbler
library(lattice)
library(car)
LD <- read.csv("~/DataMining/Data/LoanData.csv")
LD$Status=recode(LD$Status,"'Current'=1; else=0")
LD$Status=as.numeric(levels(LD$Status)[LD$Status])
n = length(LD$Status)
n1 = floor(n*(.6))
n1
## [1] 3366
n2 = n-n1
train = sample(1:n,n1)
XLD <- model.matrix(Status~., data = LD)[,-1]
XLD[1:3,]
## Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1 0 0 1 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1 0 0 5000 4 0.150
## 2 1 0 1900 6 0.265
## 3 1 0 1000 3 0.150
## Debt.To.Income.Ratio
## 1 0.04
## 2 0.02
## 3 0.02
xtrain <- XLD[train,]
xnew <- XLD[-train,]
ytrain <- LD$Status[train]
ynew <- LD$Status[-train]
m1=glm(Status~.,family=binomial,data=data.frame(Status=ytrain,xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1)
##
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain,
## xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3129 0.1161 0.2184 0.3800 1.8005
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.437e+00 5.871e-01 14.372 < 2e-16 ***
## Credit.GradeAA 8.974e-01 8.217e-01 1.092 0.274772
## Credit.GradeB -5.527e-01 4.825e-01 -1.145 0.252061
## Credit.GradeC -1.771e-01 4.966e-01 -0.357 0.721312
## Credit.GradeD -1.833e-01 4.991e-01 -0.367 0.713439
## Credit.GradeE -8.407e-01 5.098e-01 -1.649 0.099111 .
## Credit.GradeHR -1.399e+00 5.239e-01 -2.671 0.007558 **
## Credit.GradeNC -2.385e+00 6.495e-01 -3.672 0.000241 ***
## Amount -7.598e-05 1.963e-05 -3.870 0.000109 ***
## Age -4.086e-01 3.049e-02 -13.399 < 2e-16 ***
## Borrower.Rate -1.219e+01 1.735e+00 -7.025 2.14e-12 ***
## Debt.To.Income.Ratio 4.825e-01 4.324e-01 1.116 0.264536
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1791.0 on 3365 degrees of freedom
## Residual deviance: 1367.4 on 3354 degrees of freedom
## AIC: 1391.4
##
## Number of Fisher Scoring iterations: 16
ptest <- predict(m1,newdata=data.frame(xnew),type="response")
data.frame(ynew,ptest)[1:10,]
## ynew ptest
## 1 1 0.9883048
## 3 1 0.9804968
## 5 1 0.9927480
## 6 1 0.8772461
## 8 1 0.8760390
## 9 1 0.9995406
## 13 1 0.9893572
## 15 1 0.9037922
## 18 1 0.7440687
## 20 1 0.8347782
gg1=floor(ptest+0.5)
ttt=table(ynew,gg1)
ttt
## gg1
## ynew 0 1
## 0 19 154
## 1 24 2048
Below is the overall total error of this model and that is about 7.3%, this is extremely good and depicts an accurate model.
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.07928731
bb=cbind(ptest,ynew)
head(bb)
## ptest ynew
## 1 0.9883048 1
## 3 0.9804968 1
## 5 0.9927480 1
## 6 0.8772461 1
## 8 0.8760390 1
## 9 0.9995406 1
bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)
## ptest ynew
## 5596 1 1
## 5599 1 1
## 5600 1 1
## 5604 1 1
## 5608 1 1
## 5609 1 1
This orders the cases in our test set based on their success probability with the actual shown next to it
xbar=mean(ynew)
xbar
## [1] 0.9229399
##calculating the lift
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:100,]
## ay ax
## 5596 1.0000000 1 1 0.9229399
## 5599 1.0000000 1 2 1.8458797
## 5600 1.0000000 1 3 2.7688196
## 5604 1.0000000 1 4 3.6917595
## 5608 1.0000000 1 5 4.6146993
## 5609 1.0000000 1 6 5.5376392
## 5610 1.0000000 1 7 6.4605791
## 4130 0.9997004 1 8 7.3835189
## 5585 0.9996901 1 9 8.3064588
## 1417 0.9996818 1 10 9.2293987
## 1418 0.9996679 1 11 10.1523385
## 3602 0.9996570 1 12 11.0752784
## 290 0.9996432 1 13 11.9982183
## 2150 0.9996237 1 14 12.9211581
## 1784 0.9996183 1 15 13.8440980
## 474 0.9996099 1 16 14.7670379
## 448 0.9995972 1 17 15.6899777
## 363 0.9995941 1 18 16.6129176
## 2372 0.9995871 1 19 17.5358575
## 125 0.9995791 1 20 18.4587973
## 864 0.9995537 1 21 19.3817372
## 1602 0.9995492 1 22 20.3046771
## 1227 0.9995408 1 23 21.2276169
## 9 0.9995406 1 24 22.1505568
## 2382 0.9995360 1 25 23.0734967
## 1962 0.9995206 1 26 23.9964365
## 678 0.9995060 1 27 24.9193764
## 4402 0.9995059 1 28 25.8423163
## 1233 0.9995031 1 29 26.7652561
## 2151 0.9995015 1 30 27.6881960
## 5579 0.9994971 1 31 28.6111359
## 4007 0.9994826 1 32 29.5340757
## 365 0.9994711 1 33 30.4570156
## 175 0.9994607 1 34 31.3799555
## 3609 0.9994416 1 35 32.3028953
## 857 0.9994362 1 36 33.2258352
## 2963 0.9994134 1 37 34.1487751
## 1228 0.9993985 1 38 35.0717149
## 685 0.9993822 1 39 35.9946548
## 680 0.9993754 1 40 36.9175947
## 256 0.9993631 1 41 37.8405345
## 856 0.9993623 1 42 38.7634744
## 5567 0.9993540 1 43 39.6864143
## 1236 0.9993278 1 44 40.6093541
## 1049 0.9993059 1 45 41.5322940
## 5294 0.9992885 1 46 42.4552339
## 5377 0.9992747 1 47 43.3781737
## 5036 0.9992615 1 48 44.3011136
## 1051 0.9992540 1 49 45.2240535
## 682 0.9992426 1 50 46.1469933
## 1050 0.9992178 1 51 47.0699332
## 3373 0.9992001 1 52 47.9928731
## 131 0.9991791 1 53 48.9158129
## 681 0.9991742 1 54 49.8387528
## 145 0.9991672 1 55 50.7616927
## 2570 0.9991535 1 56 51.6846325
## 848 0.9991087 1 57 52.6075724
## 4923 0.9990978 1 58 53.5305122
## 1611 0.9990873 1 59 54.4534521
## 4725 0.9990864 1 60 55.3763920
## 153 0.9990697 1 61 56.2993318
## 845 0.9990564 1 62 57.2222717
## 674 0.9990553 1 63 58.1452116
## 4843 0.9990350 1 64 59.0681514
## 1423 0.9990270 1 65 59.9910913
## 2738 0.9989489 1 66 60.9140312
## 69 0.9989389 1 67 61.8369710
## 5528 0.9989365 1 68 62.7599109
## 5003 0.9989293 1 69 63.6828508
## 1057 0.9989070 1 70 64.6057906
## 4788 0.9988950 1 71 65.5287305
## 2142 0.9988923 1 72 66.4516704
## 5325 0.9988760 1 73 67.3746102
## 1414 0.9988708 1 74 68.2975501
## 3713 0.9988680 1 75 69.2204900
## 1774 0.9988611 1 76 70.1434298
## 2569 0.9988519 1 77 71.0663697
## 2563 0.9988360 1 78 71.9893096
## 1967 0.9988020 1 79 72.9122494
## 1953 0.9987977 1 80 73.8351893
## 4131 0.9987969 1 81 74.7581292
## 2759 0.9987564 1 82 75.6810690
## 2161 0.9987459 1 83 76.6040089
## 686 0.9987451 1 84 77.5269488
## 1590 0.9987259 1 85 78.4498886
## 129 0.9987210 1 86 79.3728285
## 2992 0.9987024 1 87 80.2957684
## 851 0.9986340 1 88 81.2187082
## 277 0.9986079 1 89 82.1416481
## 3382 0.9986031 1 90 83.0645880
## 3187 0.9985116 1 91 83.9875278
## 1600 0.9985004 1 92 84.9104677
## 2145 0.9984995 1 93 85.8334076
## 1777 0.9984884 1 94 86.7563474
## 114 0.9984568 1 95 87.6792873
## 5272 0.9983902 1 96 88.6022272
## 54 0.9983891 1 97 89.5251670
## 2942 0.9983599 1 98 90.4481069
## 4786 0.9983512 1 99 91.3710468
## 5569 0.9983435 1 100 92.2939866
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: CUM successes sorted by pred val/success prob")
points(axis,ax,type="l")
This is data taken from the Titanic. With this we were trying to predict the probability of a passenger surviving.
Tit <- read.csv("~/DataMining/Data/Titanic.csv")
Tit = Tit[-9:-11]
Tit$Gender=recode(Tit$Gender, "'male'=1; else=0")
Tit$Gender=as.numeric(levels(Tit$Gender)[Tit$Gender])
n = length(Tit$Survived)
n1 = floor(n*(.6))
n1
## [1] 625
n2 = n-n1
train = sample(1:n,n1)
XTit <- model.matrix(Survived~., data = Tit)[,-1]
XTit[1:3,]
## Class Gender Age NumbSibOrSpsAbd NumbParOrChildAbd Fare
## 1 1 0 29.0000 0 0 211.3375
## 2 1 1 0.9167 1 2 151.5500
## 3 1 0 2.0000 1 2 151.5500
## EmbarkedAtQ EmbarkedAtS
## 1 0 1
## 2 0 1
## 3 0 1
xtrain <- XTit[train,]
xnew <- XTit[-train,]
ytrain <- Tit$Survived[train]
ynew <- Tit$Survived[-train]
m2=glm(Survived~.,family=binomial,data=data.frame(Survived=ytrain,xtrain))
summary(m2)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(Survived = ytrain,
## xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5974 -0.6682 -0.3900 0.6059 2.6027
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.5618752 0.6916769 8.041 8.90e-16 ***
## Class -1.0460612 0.1797180 -5.821 5.86e-09 ***
## Gender -2.8652214 0.2383768 -12.020 < 2e-16 ***
## Age -0.0349747 0.0086158 -4.059 4.92e-05 ***
## NumbSibOrSpsAbd -0.2175489 0.1508650 -1.442 0.14930
## NumbParOrChildAbd -0.0278985 0.1392369 -0.200 0.84119
## Fare 0.0002674 0.0027563 0.097 0.92272
## EmbarkedAtQ -1.8987683 0.6113230 -3.106 0.00190 **
## EmbarkedAtS -0.8745325 0.2901323 -3.014 0.00258 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 850.69 on 624 degrees of freedom
## Residual deviance: 551.05 on 616 degrees of freedom
## AIC: 569.05
##
## Number of Fisher Scoring iterations: 5
ptest <- predict(m2,newdata=data.frame(xnew),type="response")
data.frame(ynew,ptest)[1:10,]
## ynew ptest
## 1 1 0.9360474
## 6 1 0.2899497
## 9 1 0.7967797
## 10 0 0.3058959
## 12 1 0.9765592
## 14 1 0.9400838
## 16 0 0.7004592
## 19 0 0.6014765
## 21 1 0.8539687
## 23 1 0.9572098
gg1=floor(ptest+0.5)
ttt=table(ynew,gg1)
ttt
## gg1
## ynew 0 1
## 0 208 48
## 1 56 106
error=(ttt[1,2]+ttt[2,1])/n2
error ##relatively low error gives us a good chance of being right
## [1] 0.2488038
This last part gives us an error of 22.24%. This is starting to become a little high in terms of being precise with our predictions.
bb=cbind(ptest,ynew)
head(bb)
## ptest ynew
## 1 0.9360474 1
## 6 0.2899497 1
## 9 0.7967797 1
## 10 0.3058959 0
## 12 0.9765592 1
## 14 0.9400838 1
bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)
## ptest ynew
## 141 0.9809942 1
## 202 0.9766336 1
## 12 0.9765592 1
## 94 0.9760076 1
## 109 0.9759625 1
## 136 0.9758413 1
##orders cases in test set based on their success prob with actual shown next to
xbar=mean(ynew)
xbar
## [1] 0.3875598
##calculating the lift
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:100,]
## ay ax
## 141 0.9809942 1 1 0.3875598
## 202 0.9766336 1 2 0.7751196
## 12 0.9765592 1 3 1.1626794
## 94 0.9760076 1 4 1.5502392
## 109 0.9759625 1 5 1.9377990
## 136 0.9758413 1 6 2.3253589
## 175 0.9756283 1 7 2.7129187
## 189 0.9713667 1 8 3.1004785
## 220 0.9696909 1 9 3.4880383
## 66 0.9684919 1 10 3.8755981
## 221 0.9664505 1 11 4.2631579
## 97 0.9632153 0 11 4.6507177
## 193 0.9601098 1 12 5.0382775
## 431 0.9597881 1 13 5.4258373
## 23 0.9572098 1 14 5.8133971
## 432 0.9569999 1 15 6.2009569
## 39 0.9518615 1 16 6.5885167
## 259 0.9495977 1 17 6.9760766
## 117 0.9493066 1 18 7.3636364
## 191 0.9492130 1 19 7.7511962
## 167 0.9491968 1 20 8.1387560
## 277 0.9485298 1 21 8.5263158
## 67 0.9484541 1 22 8.9138756
## 52 0.9483570 1 23 9.3014354
## 435 0.9467678 1 24 9.6889952
## 36 0.9457639 1 25 10.0765550
## 237 0.9432262 1 26 10.4641148
## 247 0.9431697 1 27 10.8516746
## 174 0.9431273 1 28 11.2392344
## 149 0.9412963 0 28 11.6267943
## 14 0.9400838 1 29 12.0143541
## 24 0.9362143 1 30 12.4019139
## 1 0.9360474 1 31 12.7894737
## 91 0.9327717 1 32 13.1770335
## 270 0.9326716 1 33 13.5645933
## 203 0.9319503 1 34 13.9521531
## 111 0.9316781 1 35 14.3397129
## 223 0.9312428 1 36 14.7272727
## 163 0.9308368 1 37 15.1148325
## 47 0.9306278 1 38 15.5023923
## 206 0.9276786 1 39 15.8899522
## 169 0.9259759 1 40 16.2775120
## 28 0.9208121 1 41 16.6650718
## 434 0.9196854 1 42 17.0526316
## 217 0.9187522 1 43 17.4401914
## 349 0.9183564 1 44 17.8277512
## 390 0.9090111 1 45 18.2153110
## 336 0.9060766 1 46 18.6028708
## 71 0.9041204 1 47 18.9904306
## 534 0.9016793 1 48 19.3779904
## 113 0.9014833 1 49 19.7655502
## 145 0.9014571 1 50 20.1531100
## 231 0.8884162 1 51 20.5406699
## 101 0.8832805 1 52 20.9282297
## 252 0.8775496 1 53 21.3157895
## 505 0.8713955 1 54 21.7033493
## 622 0.8661743 0 54 22.0909091
## 273 0.8660011 1 55 22.4784689
## 59 0.8576952 1 56 22.8660287
## 609 0.8544717 0 56 23.2535885
## 21 0.8539687 1 57 23.6411483
## 309 0.8531223 1 58 24.0287081
## 401 0.8486053 0 58 24.4162679
## 384 0.8460443 1 59 24.8038278
## 1038 0.8436165 1 60 25.1913876
## 313 0.8435825 1 61 25.5789474
## 520 0.8393918 1 62 25.9665072
## 345 0.8347129 1 63 26.3540670
## 519 0.8347000 1 64 26.7416268
## 321 0.8303462 1 65 27.1291866
## 471 0.8297364 1 66 27.5167464
## 340 0.8248349 0 66 27.9043062
## 879 0.8202339 1 67 28.2918660
## 604 0.8184308 1 68 28.6794258
## 393 0.8175538 1 69 29.0669856
## 329 0.8150895 1 70 29.4545455
## 777 0.8107049 1 71 29.8421053
## 987 0.8106300 0 71 30.2296651
## 726 0.7986087 1 72 30.6172249
## 328 0.7974377 0 72 31.0047847
## 364 0.7974377 1 73 31.3923445
## 532 0.7974377 1 74 31.7799043
## 9 0.7967797 1 75 32.1674641
## 1005 0.7953862 1 76 32.5550239
## 250 0.7823861 0 76 32.9425837
## 960 0.7812905 1 77 33.3301435
## 338 0.7811832 1 78 33.7177033
## 367 0.7806839 0 78 34.1052632
## 182 0.7806393 1 79 34.4928230
## 497 0.7759201 1 80 34.8803828
## 511 0.7684726 1 81 35.2679426
## 75 0.7621585 1 82 35.6555024
## 452 0.7574173 1 83 36.0430622
## 183 0.7557460 1 84 36.4306220
## 289 0.7550136 1 85 36.8181818
## 1007 0.7154994 1 86 37.2057416
## 208 0.7145152 0 86 37.5933014
## 556 0.7097593 1 87 37.9808612
## 16 0.7004592 0 87 38.3684211
## 324 0.6996717 0 87 38.7559809
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: CUM successes sorted by pred val/success prob")
points(axis,ax,type="l")