Data Set 1: LoanData.csv

library(car) 
set.seed(1)

This loan data set lists the outcomes of n= 5611 loans. The following models will attempt to accurately predict if a loan will be good (current) or bad (late or default).

ld <- read.csv("C:/DataMining/Data/LoanData.csv")
ld[1:3,]
##    Status Credit.Grade Amount Age Borrower.Rate Debt.To.Income.Ratio
## 1 Current            C   5000   4         0.150                 0.04
## 2 Current           HR   1900   6         0.265                 0.02
## 3 Current           HR   1000   3         0.150                 0.02
ld$Status=recode(ld$Status,"'Current'=1;else=0")
table(ld$Status)
## 
##    0    1 
##  425 5186
ld$Status=as.numeric(levels(ld$Status)[ld$Status])
table(ld$Status)
## 
##    0    1 
##  425 5186
response=ld$Status
hist(response)

MeanRes=mean(response) 
MeanRes
## [1] 0.9242559
v1=rep(1,dim(ld)[1])
v2=rep(0,dim(ld)[1])
ld$Credit.GradeAA=ifelse(ld$Credit.Grade=="AA",v1,v2)
ld$Credit.GradeA=ifelse(ld$Credit.Grade=="A",v1,v2)
ld$Credit.GradeB=ifelse(ld$Credit.Grade=="B",v1,v2)
ld$Credit.GradeC=ifelse(ld$Credit.Grade=="C",v1,v2)
ld$Credit.GradeD=ifelse(ld$Credit.Grade=="D",v1,v2)
ld$Credit.GradeE=ifelse(ld$Credit.Grade=="E",v1,v2)
ld$Credit.GradeHR=ifelse(ld$Credit.Grade=="HR",v1,v2)
xx=cbind(response,Amount=ld$Amount,Age=ld$Age,BorrowerRate=ld$Borrower.Rate,DebtIncomeRatio=ld$Debt.To.Income.Ratio,
         CreditGradeAA=ld$Credit.GradeAA,CreditGradeA=ld$Credit.GradeA,CreditGradeB=ld$Credit.GradeB,CreditGradeC=ld$Credit.GradeC,
         CreditGradeD=ld$Credit.GradeD,CreditGradeE=ld$Credit.GradeE,CreditGradeHR=ld$Credit.GradeHR)
xx[1:3,]
##      response Amount Age BorrowerRate DebtIncomeRatio CreditGradeAA
## [1,]        1   5000   4        0.150            0.04             0
## [2,]        1   1900   6        0.265            0.02             0
## [3,]        1   1000   3        0.150            0.02             0
##      CreditGradeA CreditGradeB CreditGradeC CreditGradeD CreditGradeE
## [1,]            0            0            1            0            0
## [2,]            0            0            0            0            0
## [3,]            0            0            0            0            0
##      CreditGradeHR
## [1,]             0
## [2,]             1
## [3,]             1

The data is separated into a training set and a test set.

n=dim(ld)[1]
n
## [1] 5611
n1=floor(n*(0.6))
n1
## [1] 3366
n2=n-n1
n2
## [1] 2245
train=sample(1:n,n1)

The following is a model fitted on all the data.

m1=glm(response~.,family=binomial,data=data.frame(xx))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1)
## 
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(xx))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2427   0.1372   0.2384   0.3949   2.1454  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      5.965e+00  4.773e-01  12.497  < 2e-16 ***
## Amount          -4.575e-05  1.585e-05  -2.887 0.003888 ** 
## Age             -3.651e-01  2.212e-02 -16.507  < 2e-16 ***
## BorrowerRate    -1.114e+01  1.276e+00  -8.728  < 2e-16 ***
## DebtIncomeRatio  1.917e-01  2.495e-01   0.768 0.442242    
## CreditGradeAA    2.277e+00  6.053e-01   3.762 0.000169 ***
## CreditGradeA     1.662e+00  5.041e-01   3.297 0.000979 ***
## CreditGradeB     1.369e+00  4.326e-01   3.165 0.001550 ** 
## CreditGradeC     1.699e+00  3.982e-01   4.267 1.98e-05 ***
## CreditGradeD     1.628e+00  3.750e-01   4.341 1.42e-05 ***
## CreditGradeE     1.043e+00  3.525e-01   2.958 0.003101 ** 
## CreditGradeHR    5.177e-01  3.473e-01   1.491 0.136006    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3010.3  on 5610  degrees of freedom
## Residual deviance: 2371.0  on 5599  degrees of freedom
## AIC: 2395
## 
## Number of Fisher Scoring iterations: 16
xx=xx[,-1]
xtrain <- xx[train,]
xnew <- xx[-train,]
ytrain <- response[train]
ynew <- response[-train]

The following is a model fitted on the training data set.

m2=glm(response~.,family=binomial,data=data.frame(response=ytrain,xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m2)
## 
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(response = ytrain, 
##     xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2961   0.1453   0.2465   0.3981   2.0036  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      6.517e+00  6.565e-01   9.927  < 2e-16 ***
## Amount          -3.427e-05  2.072e-05  -1.654   0.0981 .  
## Age             -3.481e-01  2.815e-02 -12.365  < 2e-16 ***
## BorrowerRate    -1.199e+01  1.636e+00  -7.331 2.28e-13 ***
## DebtIncomeRatio  1.724e-03  3.958e-02   0.044   0.9653    
## CreditGradeAA    1.647e+00  8.077e-01   2.040   0.0414 *  
## CreditGradeA     7.227e-01  6.522e-01   1.108   0.2678    
## CreditGradeB     6.974e-01  5.892e-01   1.184   0.2365    
## CreditGradeC     1.271e+00  5.634e-01   2.257   0.0240 *  
## CreditGradeD     1.123e+00  5.267e-01   2.132   0.0330 *  
## CreditGradeE     7.294e-01  5.022e-01   1.452   0.1464    
## CreditGradeHR    6.914e-02  4.958e-01   0.139   0.8891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1806.1  on 3365  degrees of freedom
## Residual deviance: 1439.8  on 3354  degrees of freedom
## AIC: 1463.8
## 
## Number of Fisher Scoring iterations: 14
ptest=predict(m2,newdata=data.frame(xnew),type="response")  
hist(ptest)

The following shows results for a probability of 0.5 or larger coding as a 1 (“Good”).

plot(ynew~ptest)

gg1=floor(ptest+0.5)
ttt=table(ynew,gg1)
ttt
##     gg1
## ynew    0    1
##    0   14  156
##    1    9 2066
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.07349666

The model is not very good at predicting bad loans, out of the 170 bad loans it predicted 14 of them.

The following shows results for a probability of 0.3 or larger coding as a 1 (“Good”).

gg2=floor(ptest+0.7)
ttt=table(ynew,gg2)
ttt
##     gg2
## ynew    0    1
##    0    9  161
##    1    2 2073
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.07260579

When the floor is raised to 0.3 or model is better at predicting “good” loans as “good” but becomes worse at predicting “bad” loans as “bad”.

bb=cbind(ptest,ynew)
head(bb)
##       ptest ynew
## 1 0.9881787    1
## 2 0.9340965    1
## 3 0.7077864    1
## 4 0.9846826    1
## 5 0.8992271    1
## 6 0.8699288    1
bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)
##      ptest ynew
## 2242     1    1
## 2243     1    1
## 2244     1    1
## 2245     1    1
## 2240     1    1
## 2241     1    1

The following is the overall success probability in the test data set.

xbar=mean(ynew)
xbar
## [1] 0.9242762
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:20,]
##                  ay         ax
## 2242 1.0000000 1  1  0.9242762
## 2243 1.0000000 1  2  1.8485523
## 2244 1.0000000 1  3  2.7728285
## 2245 1.0000000 1  4  3.6971047
## 2240 1.0000000 1  5  4.6213808
## 2241 1.0000000 1  6  5.5456570
## 2239 1.0000000 1  7  6.4699332
## 2238 0.9999998 1  8  7.3942094
## 2237 0.9999997 1  9  8.3184855
## 2235 0.9999995 1 10  9.2427617
## 2236 0.9999984 1 11 10.1670379
## 2234 0.9999425 1 12 11.0913140
## 2233 0.9999231 1 13 12.0155902
## 89   0.9990858 1 14 12.9398664
## 1639 0.9990356 1 15 13.8641425
## 261  0.9990352 1 16 14.7884187
## 164  0.9990352 1 17 15.7126949
## 2232 0.9990091 1 18 16.6369710
## 104  0.9989887 1 19 17.5612472
## 32   0.9989756 1 20 18.4855234
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

This model is not very good, it has very little area underneath the curve.

Data Set 2: Titanic.csv

library(car) 
set.seed(1)

This data lists the characteristics of 1043 passengers on the Titanic and if they survived. The following model will attempt to accurately predict if a passenger will survive (coded as 1) the sinking of the Titanic.tt <- read.csv(“C:/DataMining/Data/Titanic.csv”)

tt <- read.csv("C:/DataMining/Data/Titanic.csv")
tt[1:3,]
##   Class Survived Gender     Age NumbSibOrSpsAbd NumbParOrChildAbd     Fare
## 1     1        1 female 29.0000               0                 0 211.3375
## 2     1        1   male  0.9167               1                 2 151.5500
## 3     1        0 female  2.0000               1                 2 151.5500
##   EmbarkedAt              HomeAndDestination         Home KnownDestination
## 1          S                    St Louis, MO St Louis, MO                 
## 2          S Montreal, PQ / Chesterville, ON Montreal, PQ Chesterville, ON
## 3          S Montreal, PQ / Chesterville, ON Montreal, PQ Chesterville, ON
tt=tt[,c(-9,-10,-11)]
response=tt$Survived

Indicator variables are created for number of siblings or a spouse aboard and number of parents (for a child) or children (for a parent) who were also aboard and where they embarked.

v3=rep(1,dim(tt)[1])
v4=rep(0,dim(tt)[1])
tt$Class1=ifelse(tt$NumbSibOrSpsAbd==1,v3,v4)
tt$Class2=ifelse(tt$NumbSibOrSpsAbd==2,v3,v4)
tt$siblingspouseaboard1=ifelse(tt$NumbSibOrSpsAbd==1,v3,v4)
tt$siblingspouseaboard2=ifelse(tt$NumbSibOrSpsAbd==2,v3,v4)
tt$siblingspouseaboard3=ifelse(tt$NumbSibOrSpsAbd==3,v3,v4)
tt$siblingspouseaboard4=ifelse(tt$NumbSibOrSpsAbd==4,v3,v4)
tt$siblingspouseaboard5=ifelse(tt$NumbSibOrSpsAbd==5,v3,v4)
tt$siblingspouseaboard8=ifelse(tt$NumbSibOrSpsAbd==8,v3,v4)
tt$parentchildrenaboard1=ifelse(tt$NumbParOrChildAbd==1,v3,v4)
tt$parentchildrenaboard2=ifelse(tt$NumbParOrChildAbd==2,v3,v4)
tt$parentchildrenaboard4=ifelse(tt$NumbParOrChildAbd==4,v3,v4)
tt$parentchildrenaboard5=ifelse(tt$NumbParOrChildAbd==5,v3,v4)
tt$parentchildrenaboard6=ifelse(tt$NumbParOrChildAbd==6,v3,v4)
tt$embarkedSouthampton=ifelse(tt$EmbarkedAt=="S",v3,v4)
tt$embarkedCherbourg=ifelse(tt$EmbarkedAt=="C",v3,v4)
tt$embarkedQueenstown=ifelse(tt$EmbarkedAt=="Q",v3,v4)
tt$female=ifelse(tt$Gender=="female",v3,v4)

tt=tt[,c(-1,-3,-5,-6,-8)]
tt=cbind(response,Age=tt$Age,Fare=tt$Fare,Female=tt$female,class1=tt$class1,class2=tt$class2,siblingspouse1=tt$siblingspouseaboard1,siblingspouse2=tt$siblingspouseaboard2,siblingspouse3=tt$siblingspouseaboard3,siblingspouse4=tt$siblingspouseaboard4,siblingspouse5=tt$siblingspouseaboard5,siblingspouse8=tt$siblingspouseaboard8,parentchildrenaboard1=tt$parentchildrenaboard1,parentchildrenaboard2=tt$parentchildrenaboard2,parentchildrenaboard3=tt$parentchildrenaboard3,parentchildrenaboard4=tt$parentchildrenaboard4,parentchildrenaboard5=tt$parentchildrenaboard5,parentchildrenaboard6=tt$parentchildrenaboard6,embarkedSouthampton=tt$embarkedSouthampton,embarkedCherbourg=tt$embarkedCherbourg,embarkedQueenstown=tt$embarkedQueenstown)

The data is separated into a training set and a test set.

n=dim(tt)[1]
n
## [1] 1043
n1=floor(n*(0.6))
n1
## [1] 625
n2=n-n1
n2
## [1] 418
train=sample(1:n,n1)

The following is a model fitted on all the data.

m3=glm(response~.,family=binomial,data=data.frame(tt))
summary(m3)
## 
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(tt))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3760  -0.6276  -0.5254   0.6713   2.4577  
## 
## Coefficients: (1 not defined because of singularities)
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -2.279e+00  4.287e-01  -5.316 1.06e-07 ***
## Age                   -1.476e-02  6.206e-03  -2.379 0.017366 *  
## Fare                   8.235e-03  2.135e-03   3.858 0.000114 ***
## Female                 2.496e+00  1.720e-01  14.515  < 2e-16 ***
## siblingspouse1        -7.717e-02  1.952e-01  -0.395 0.692513    
## siblingspouse2        -5.466e-01  4.612e-01  -1.185 0.235973    
## siblingspouse3        -1.717e+00  6.850e-01  -2.506 0.012194 *  
## siblingspouse4        -2.308e+00  7.363e-01  -3.135 0.001718 ** 
## siblingspouse5        -1.613e+01  5.064e+02  -0.032 0.974598    
## siblingspouse8        -1.505e+01  1.455e+03  -0.010 0.991750    
## parentchildrenaboard1  5.836e-01  2.424e-01   2.408 0.016062 *  
## parentchildrenaboard2  4.328e-01  3.234e-01   1.338 0.180765    
## parentchildrenaboard4 -2.278e+00  1.278e+00  -1.782 0.074798 .  
## parentchildrenaboard5 -1.819e+00  1.158e+00  -1.571 0.116172    
## parentchildrenaboard6 -1.547e+01  8.635e+02  -0.018 0.985710    
## embarkedSouthampton    9.698e-01  3.973e-01   2.441 0.014640 *  
## embarkedCherbourg      1.721e+00  4.338e-01   3.968 7.25e-05 ***
## embarkedQueenstown            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  988.03  on 1026  degrees of freedom
## AIC: 1022
## 
## Number of Fisher Scoring iterations: 14
tt=tt[,-1]
xtrain <- tt[train,]
xnew <- tt[-train,]
ytrain <- response[train]
ynew <- response[-train]
## The following is a model fitted on the training data set.
m4=glm(response~.,family=binomial,data=data.frame(response=ytrain,xtrain))
summary(m4)
## 
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(response = ytrain, 
##     xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5031  -0.6194  -0.5026   0.6867   2.3396  
## 
## Coefficients: (2 not defined because of singularities)
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -2.103e+00  5.223e-01  -4.026 5.68e-05 ***
## Age                   -1.907e-02  8.258e-03  -2.309  0.02096 *  
## Fare                   1.269e-02  3.235e-03   3.923 8.74e-05 ***
## Female                 2.421e+00  2.242e-01  10.795  < 2e-16 ***
## siblingspouse1        -1.354e-01  2.561e-01  -0.529  0.59714    
## siblingspouse2        -7.247e-01  6.078e-01  -1.192  0.23309    
## siblingspouse3        -3.590e+00  1.214e+00  -2.956  0.00311 ** 
## siblingspouse4        -2.367e+00  9.384e-01  -2.523  0.01165 *  
## siblingspouse5        -1.747e+01  8.435e+02  -0.021  0.98347    
## siblingspouse8                NA         NA      NA       NA    
## parentchildrenaboard1  5.746e-01  3.021e-01   1.902  0.05720 .  
## parentchildrenaboard2  6.555e-01  4.457e-01   1.471  0.14136    
## parentchildrenaboard4 -1.721e+01  1.598e+03  -0.011  0.99141    
## parentchildrenaboard5 -1.689e+01  1.672e+03  -0.010  0.99194    
## parentchildrenaboard6 -1.640e+01  1.441e+03  -0.011  0.99092    
## embarkedSouthampton    8.032e-01  4.768e-01   1.684  0.09211 .  
## embarkedCherbourg      1.502e+00  5.251e-01   2.861  0.00422 ** 
## embarkedQueenstown            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 843.64  on 624  degrees of freedom
## Residual deviance: 581.27  on 609  degrees of freedom
## AIC: 613.27
## 
## Number of Fisher Scoring iterations: 15
ptest=predict(m4,newdata=data.frame(xnew),type="response")  
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
hist(ptest)

The following shows results for a probability of 0.5 or larger coding as a 1 (“Survived”).

plot(ynew~ptest)

gg1=floor(ptest+0.5)
ttt=table(ynew,gg1)
ttt
##     gg1
## ynew   0   1
##    0 210  36
##    1  57 115
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.222488

This model has an overall 0.223 error in predicting the survival or fatality of a passenger. The model correctly predicts 210/246 fatalities and correctly predicts 114/171 survivals.

The following shows results for a probability of 0.3 or larger coding as a 1 (“Survived”).

gg2=floor(ptest+0.7)
ttt=table(ynew,gg2)
ttt
##     gg2
## ynew   0   1
##    0 196  50
##    1  41 131
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.2177033

When the floor is raised to 0.3 the model correctly predicts more survivals as survivals but incorrectly predicts more fatalities as survivals.

gg2=floor(ptest+0.7)
ttt=table(ynew,gg2)
ttt
##     gg2
## ynew   0   1
##    0 196  50
##    1  41 131
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.2177033

When the floor is raised to 0.3 the model correctly predicts more survivals as survivals but incorrectly predicts more fatalities as survivals.

bb=cbind(ptest,ynew)
head(bb)
##       ptest ynew
## 1 0.9626943    1
## 2 0.9714365    0
## 3 0.6392746    0
## 4 0.6844087    1
## 5 0.5098368    1
## 6 0.2099026    0
bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)
##         ptest ynew
## 113 0.9995269    1
## 31  0.9886168    1
## 20  0.9865147    1
## 8   0.9856515    1
## 118 0.9818502    1
## 14  0.9803041    1

The following is the overall success probability in the test data set.

xbar=mean(ynew)
xbar
## [1] 0.4114833
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:20,]
##                 ay        ax
## 113 0.9995269 1  1 0.4114833
## 31  0.9886168 1  2 0.8229665
## 20  0.9865147 1  3 1.2344498
## 8   0.9856515 1  4 1.6459330
## 118 0.9818502 1  5 2.0574163
## 14  0.9803041 1  6 2.4688995
## 95  0.9723015 1  7 2.8803828
## 2   0.9714365 0  7 3.2918660
## 1   0.9626943 1  8 3.7033493
## 105 0.9609897 1  9 4.1148325
## 119 0.9496209 1 10 4.5263158
## 109 0.9489939 1 11 4.9377990
## 28  0.9477187 1 12 5.3492823
## 181 0.9452778 1 13 5.7607656
## 64  0.9440584 1 14 6.1722488
## 85  0.9406016 1 15 6.5837321
## 343 0.9344197 1 16 6.9952153
## 108 0.9308297 0 16 7.4066986
## 34  0.9290092 1 17 7.8181818
## 15  0.9226421 1 18 8.2296651
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

This model has some lift on the ROC graph but isn’t and does a generally good job at predicting survival.