Homework 3

Loan Data

This loan data set lists the outcomes of n= 5611 loans. This data set comes from the consumer-to-consumer lending market where borrows post loan listings and lenders invest in those loans by bidding on the borrower’s loan rates.

library(leaps)
library(nutshell)

## Loading required package: nutshell.bbdb

## Loading required package: nutshell.audioscrobbler

library(lattice)
library(car)
LD <- read.csv("~/DataMining/Data/LoanData.csv")
LD$Status=recode(LD$Status,"'Current'=1; else=0")
LD$Status=as.numeric(levels(LD$Status)[LD$Status])

n = length(LD$Status)
n1 = floor(n*(.6))
n1

## [1] 3366

n2 = n-n1
train = sample(1:n,n1)

XLD <- model.matrix(Status~., data = LD)[,-1]
XLD[1:3,]

##   Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1              0             0             1             0             0
## 2              0             0             0             0             0
## 3              0             0             0             0             0
##   Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1              0              0   5000   4         0.150
## 2              1              0   1900   6         0.265
## 3              1              0   1000   3         0.150
##   Debt.To.Income.Ratio
## 1                 0.04
## 2                 0.02
## 3                 0.02

xtrain <- XLD[train,]
xnew <- XLD[-train,]
ytrain <- LD$Status[train]
ynew <- LD$Status[-train]
m1=glm(Status~.,family=binomial,data=data.frame(Status=ytrain,xtrain))

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(m1)

## 
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3129   0.1161   0.2184   0.3800   1.8005  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           8.437e+00  5.871e-01  14.372  < 2e-16 ***
## Credit.GradeAA        8.974e-01  8.217e-01   1.092 0.274772    
## Credit.GradeB        -5.527e-01  4.825e-01  -1.145 0.252061    
## Credit.GradeC        -1.771e-01  4.966e-01  -0.357 0.721312    
## Credit.GradeD        -1.833e-01  4.991e-01  -0.367 0.713439    
## Credit.GradeE        -8.407e-01  5.098e-01  -1.649 0.099111 .  
## Credit.GradeHR       -1.399e+00  5.239e-01  -2.671 0.007558 ** 
## Credit.GradeNC       -2.385e+00  6.495e-01  -3.672 0.000241 ***
## Amount               -7.598e-05  1.963e-05  -3.870 0.000109 ***
## Age                  -4.086e-01  3.049e-02 -13.399  < 2e-16 ***
## Borrower.Rate        -1.219e+01  1.735e+00  -7.025 2.14e-12 ***
## Debt.To.Income.Ratio  4.825e-01  4.324e-01   1.116 0.264536    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1791.0  on 3365  degrees of freedom
## Residual deviance: 1367.4  on 3354  degrees of freedom
## AIC: 1391.4
## 
## Number of Fisher Scoring iterations: 16

ptest <- predict(m1,newdata=data.frame(xnew),type="response")
data.frame(ynew,ptest)[1:10,]

##    ynew     ptest
## 1     1 0.9883048
## 3     1 0.9804968
## 5     1 0.9927480
## 6     1 0.8772461
## 8     1 0.8760390
## 9     1 0.9995406
## 13    1 0.9893572
## 15    1 0.9037922
## 18    1 0.7440687
## 20    1 0.8347782

gg1=floor(ptest+0.5)    
ttt=table(ynew,gg1)
ttt

##     gg1
## ynew    0    1
##    0   19  154
##    1   24 2048

Below is the overall total error of this model and that is about 7.3%, this is extremely good and depicts an accurate model.

error=(ttt[1,2]+ttt[2,1])/n2
error

## [1] 0.07928731

bb=cbind(ptest,ynew)
head(bb)

##       ptest ynew
## 1 0.9883048    1
## 3 0.9804968    1
## 5 0.9927480    1
## 6 0.8772461    1
## 8 0.8760390    1
## 9 0.9995406    1

bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)

##      ptest ynew
## 5596     1    1
## 5599     1    1
## 5600     1    1
## 5604     1    1
## 5608     1    1
## 5609     1    1

This orders the cases in our test set based on their success probability with the actual shown next to it

xbar=mean(ynew)
xbar

## [1] 0.9229399

##calculating the lift
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:100,]

##                   ay         ax
## 5596 1.0000000 1   1  0.9229399
## 5599 1.0000000 1   2  1.8458797
## 5600 1.0000000 1   3  2.7688196
## 5604 1.0000000 1   4  3.6917595
## 5608 1.0000000 1   5  4.6146993
## 5609 1.0000000 1   6  5.5376392
## 5610 1.0000000 1   7  6.4605791
## 4130 0.9997004 1   8  7.3835189
## 5585 0.9996901 1   9  8.3064588
## 1417 0.9996818 1  10  9.2293987
## 1418 0.9996679 1  11 10.1523385
## 3602 0.9996570 1  12 11.0752784
## 290  0.9996432 1  13 11.9982183
## 2150 0.9996237 1  14 12.9211581
## 1784 0.9996183 1  15 13.8440980
## 474  0.9996099 1  16 14.7670379
## 448  0.9995972 1  17 15.6899777
## 363  0.9995941 1  18 16.6129176
## 2372 0.9995871 1  19 17.5358575
## 125  0.9995791 1  20 18.4587973
## 864  0.9995537 1  21 19.3817372
## 1602 0.9995492 1  22 20.3046771
## 1227 0.9995408 1  23 21.2276169
## 9    0.9995406 1  24 22.1505568
## 2382 0.9995360 1  25 23.0734967
## 1962 0.9995206 1  26 23.9964365
## 678  0.9995060 1  27 24.9193764
## 4402 0.9995059 1  28 25.8423163
## 1233 0.9995031 1  29 26.7652561
## 2151 0.9995015 1  30 27.6881960
## 5579 0.9994971 1  31 28.6111359
## 4007 0.9994826 1  32 29.5340757
## 365  0.9994711 1  33 30.4570156
## 175  0.9994607 1  34 31.3799555
## 3609 0.9994416 1  35 32.3028953
## 857  0.9994362 1  36 33.2258352
## 2963 0.9994134 1  37 34.1487751
## 1228 0.9993985 1  38 35.0717149
## 685  0.9993822 1  39 35.9946548
## 680  0.9993754 1  40 36.9175947
## 256  0.9993631 1  41 37.8405345
## 856  0.9993623 1  42 38.7634744
## 5567 0.9993540 1  43 39.6864143
## 1236 0.9993278 1  44 40.6093541
## 1049 0.9993059 1  45 41.5322940
## 5294 0.9992885 1  46 42.4552339
## 5377 0.9992747 1  47 43.3781737
## 5036 0.9992615 1  48 44.3011136
## 1051 0.9992540 1  49 45.2240535
## 682  0.9992426 1  50 46.1469933
## 1050 0.9992178 1  51 47.0699332
## 3373 0.9992001 1  52 47.9928731
## 131  0.9991791 1  53 48.9158129
## 681  0.9991742 1  54 49.8387528
## 145  0.9991672 1  55 50.7616927
## 2570 0.9991535 1  56 51.6846325
## 848  0.9991087 1  57 52.6075724
## 4923 0.9990978 1  58 53.5305122
## 1611 0.9990873 1  59 54.4534521
## 4725 0.9990864 1  60 55.3763920
## 153  0.9990697 1  61 56.2993318
## 845  0.9990564 1  62 57.2222717
## 674  0.9990553 1  63 58.1452116
## 4843 0.9990350 1  64 59.0681514
## 1423 0.9990270 1  65 59.9910913
## 2738 0.9989489 1  66 60.9140312
## 69   0.9989389 1  67 61.8369710
## 5528 0.9989365 1  68 62.7599109
## 5003 0.9989293 1  69 63.6828508
## 1057 0.9989070 1  70 64.6057906
## 4788 0.9988950 1  71 65.5287305
## 2142 0.9988923 1  72 66.4516704
## 5325 0.9988760 1  73 67.3746102
## 1414 0.9988708 1  74 68.2975501
## 3713 0.9988680 1  75 69.2204900
## 1774 0.9988611 1  76 70.1434298
## 2569 0.9988519 1  77 71.0663697
## 2563 0.9988360 1  78 71.9893096
## 1967 0.9988020 1  79 72.9122494
## 1953 0.9987977 1  80 73.8351893
## 4131 0.9987969 1  81 74.7581292
## 2759 0.9987564 1  82 75.6810690
## 2161 0.9987459 1  83 76.6040089
## 686  0.9987451 1  84 77.5269488
## 1590 0.9987259 1  85 78.4498886
## 129  0.9987210 1  86 79.3728285
## 2992 0.9987024 1  87 80.2957684
## 851  0.9986340 1  88 81.2187082
## 277  0.9986079 1  89 82.1416481
## 3382 0.9986031 1  90 83.0645880
## 3187 0.9985116 1  91 83.9875278
## 1600 0.9985004 1  92 84.9104677
## 2145 0.9984995 1  93 85.8334076
## 1777 0.9984884 1  94 86.7563474
## 114  0.9984568 1  95 87.6792873
## 5272 0.9983902 1  96 88.6022272
## 54   0.9983891 1  97 89.5251670
## 2942 0.9983599 1  98 90.4481069
## 4786 0.9983512 1  99 91.3710468
## 5569 0.9983435 1 100 92.2939866

plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: CUM successes sorted by pred val/success prob")
points(axis,ax,type="l")

Titanic Data

This is data taken from the Titanic. With this we were trying to predict the probability of a passenger surviving.

Tit <- read.csv("~/DataMining/Data/Titanic.csv")
Tit = Tit[-9:-11]

Tit$Gender=recode(Tit$Gender, "'male'=1; else=0")
Tit$Gender=as.numeric(levels(Tit$Gender)[Tit$Gender])

n = length(Tit$Survived)
n1 = floor(n*(.6))
n1

## [1] 625

n2 = n-n1
train = sample(1:n,n1)

XTit <- model.matrix(Survived~., data = Tit)[,-1]
XTit[1:3,]

##   Class Gender     Age NumbSibOrSpsAbd NumbParOrChildAbd     Fare
## 1     1      0 29.0000               0                 0 211.3375
## 2     1      1  0.9167               1                 2 151.5500
## 3     1      0  2.0000               1                 2 151.5500
##   EmbarkedAtQ EmbarkedAtS
## 1           0           1
## 2           0           1
## 3           0           1

xtrain <- XTit[train,]
xnew <- XTit[-train,]
ytrain <- Tit$Survived[train]
ynew <- Tit$Survived[-train]
m2=glm(Survived~.,family=binomial,data=data.frame(Survived=ytrain,xtrain))
summary(m2)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(Survived = ytrain, 
##     xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5974  -0.6682  -0.3900   0.6059   2.6027  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        5.5618752  0.6916769   8.041 8.90e-16 ***
## Class             -1.0460612  0.1797180  -5.821 5.86e-09 ***
## Gender            -2.8652214  0.2383768 -12.020  < 2e-16 ***
## Age               -0.0349747  0.0086158  -4.059 4.92e-05 ***
## NumbSibOrSpsAbd   -0.2175489  0.1508650  -1.442  0.14930    
## NumbParOrChildAbd -0.0278985  0.1392369  -0.200  0.84119    
## Fare               0.0002674  0.0027563   0.097  0.92272    
## EmbarkedAtQ       -1.8987683  0.6113230  -3.106  0.00190 ** 
## EmbarkedAtS       -0.8745325  0.2901323  -3.014  0.00258 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 850.69  on 624  degrees of freedom
## Residual deviance: 551.05  on 616  degrees of freedom
## AIC: 569.05
## 
## Number of Fisher Scoring iterations: 5

ptest <- predict(m2,newdata=data.frame(xnew),type="response")
data.frame(ynew,ptest)[1:10,]

##    ynew     ptest
## 1     1 0.9360474
## 6     1 0.2899497
## 9     1 0.7967797
## 10    0 0.3058959
## 12    1 0.9765592
## 14    1 0.9400838
## 16    0 0.7004592
## 19    0 0.6014765
## 21    1 0.8539687
## 23    1 0.9572098

gg1=floor(ptest+0.5)
ttt=table(ynew,gg1)
ttt

##     gg1
## ynew   0   1
##    0 208  48
##    1  56 106

error=(ttt[1,2]+ttt[2,1])/n2
error  ##relatively low error gives us a good chance of being right

## [1] 0.2488038

This last part gives us an error of 22.24%. This is starting to become a little high in terms of being precise with our predictions.

bb=cbind(ptest,ynew)
head(bb)

##        ptest ynew
## 1  0.9360474    1
## 6  0.2899497    1
## 9  0.7967797    1
## 10 0.3058959    0
## 12 0.9765592    1
## 14 0.9400838    1

bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)

##         ptest ynew
## 141 0.9809942    1
## 202 0.9766336    1
## 12  0.9765592    1
## 94  0.9760076    1
## 109 0.9759625    1
## 136 0.9758413    1

##orders cases in test set based on their success prob with actual shown next to

xbar=mean(ynew)
xbar

## [1] 0.3875598

##calculating the lift
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:100,]

##                  ay         ax
## 141  0.9809942 1  1  0.3875598
## 202  0.9766336 1  2  0.7751196
## 12   0.9765592 1  3  1.1626794
## 94   0.9760076 1  4  1.5502392
## 109  0.9759625 1  5  1.9377990
## 136  0.9758413 1  6  2.3253589
## 175  0.9756283 1  7  2.7129187
## 189  0.9713667 1  8  3.1004785
## 220  0.9696909 1  9  3.4880383
## 66   0.9684919 1 10  3.8755981
## 221  0.9664505 1 11  4.2631579
## 97   0.9632153 0 11  4.6507177
## 193  0.9601098 1 12  5.0382775
## 431  0.9597881 1 13  5.4258373
## 23   0.9572098 1 14  5.8133971
## 432  0.9569999 1 15  6.2009569
## 39   0.9518615 1 16  6.5885167
## 259  0.9495977 1 17  6.9760766
## 117  0.9493066 1 18  7.3636364
## 191  0.9492130 1 19  7.7511962
## 167  0.9491968 1 20  8.1387560
## 277  0.9485298 1 21  8.5263158
## 67   0.9484541 1 22  8.9138756
## 52   0.9483570 1 23  9.3014354
## 435  0.9467678 1 24  9.6889952
## 36   0.9457639 1 25 10.0765550
## 237  0.9432262 1 26 10.4641148
## 247  0.9431697 1 27 10.8516746
## 174  0.9431273 1 28 11.2392344
## 149  0.9412963 0 28 11.6267943
## 14   0.9400838 1 29 12.0143541
## 24   0.9362143 1 30 12.4019139
## 1    0.9360474 1 31 12.7894737
## 91   0.9327717 1 32 13.1770335
## 270  0.9326716 1 33 13.5645933
## 203  0.9319503 1 34 13.9521531
## 111  0.9316781 1 35 14.3397129
## 223  0.9312428 1 36 14.7272727
## 163  0.9308368 1 37 15.1148325
## 47   0.9306278 1 38 15.5023923
## 206  0.9276786 1 39 15.8899522
## 169  0.9259759 1 40 16.2775120
## 28   0.9208121 1 41 16.6650718
## 434  0.9196854 1 42 17.0526316
## 217  0.9187522 1 43 17.4401914
## 349  0.9183564 1 44 17.8277512
## 390  0.9090111 1 45 18.2153110
## 336  0.9060766 1 46 18.6028708
## 71   0.9041204 1 47 18.9904306
## 534  0.9016793 1 48 19.3779904
## 113  0.9014833 1 49 19.7655502
## 145  0.9014571 1 50 20.1531100
## 231  0.8884162 1 51 20.5406699
## 101  0.8832805 1 52 20.9282297
## 252  0.8775496 1 53 21.3157895
## 505  0.8713955 1 54 21.7033493
## 622  0.8661743 0 54 22.0909091
## 273  0.8660011 1 55 22.4784689
## 59   0.8576952 1 56 22.8660287
## 609  0.8544717 0 56 23.2535885
## 21   0.8539687 1 57 23.6411483
## 309  0.8531223 1 58 24.0287081
## 401  0.8486053 0 58 24.4162679
## 384  0.8460443 1 59 24.8038278
## 1038 0.8436165 1 60 25.1913876
## 313  0.8435825 1 61 25.5789474
## 520  0.8393918 1 62 25.9665072
## 345  0.8347129 1 63 26.3540670
## 519  0.8347000 1 64 26.7416268
## 321  0.8303462 1 65 27.1291866
## 471  0.8297364 1 66 27.5167464
## 340  0.8248349 0 66 27.9043062
## 879  0.8202339 1 67 28.2918660
## 604  0.8184308 1 68 28.6794258
## 393  0.8175538 1 69 29.0669856
## 329  0.8150895 1 70 29.4545455
## 777  0.8107049 1 71 29.8421053
## 987  0.8106300 0 71 30.2296651
## 726  0.7986087 1 72 30.6172249
## 328  0.7974377 0 72 31.0047847
## 364  0.7974377 1 73 31.3923445
## 532  0.7974377 1 74 31.7799043
## 9    0.7967797 1 75 32.1674641
## 1005 0.7953862 1 76 32.5550239
## 250  0.7823861 0 76 32.9425837
## 960  0.7812905 1 77 33.3301435
## 338  0.7811832 1 78 33.7177033
## 367  0.7806839 0 78 34.1052632
## 182  0.7806393 1 79 34.4928230
## 497  0.7759201 1 80 34.8803828
## 511  0.7684726 1 81 35.2679426
## 75   0.7621585 1 82 35.6555024
## 452  0.7574173 1 83 36.0430622
## 183  0.7557460 1 84 36.4306220
## 289  0.7550136 1 85 36.8181818
## 1007 0.7154994 1 86 37.2057416
## 208  0.7145152 0 86 37.5933014
## 556  0.7097593 1 87 37.9808612
## 16   0.7004592 0 87 38.3684211
## 324  0.6996717 0 87 38.7559809

plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: CUM successes sorted by pred val/success prob")
points(axis,ax,type="l")

Homework 3

Jones

October 16, 2017

Loan Data

Titanic Data