Data Set 1: LoanData.csv
library(car)
set.seed(1)
This loan data set lists the outcomes of n= 5611 loans. The following models will attempt to accurately predict if a loan will be good (current) or bad (late or default).
ld <- read.csv("C:/DataMining/Data/LoanData.csv")
ld[1:3,]
## Status Credit.Grade Amount Age Borrower.Rate Debt.To.Income.Ratio
## 1 Current C 5000 4 0.150 0.04
## 2 Current HR 1900 6 0.265 0.02
## 3 Current HR 1000 3 0.150 0.02
ld$Status=recode(ld$Status,"'Current'=1;else=0")
table(ld$Status)
##
## 0 1
## 425 5186
ld$Status=as.numeric(levels(ld$Status)[ld$Status])
table(ld$Status)
##
## 0 1
## 425 5186
response=ld$Status
hist(response)

MeanRes=mean(response)
MeanRes
## [1] 0.9242559
v1=rep(1,dim(ld)[1])
v2=rep(0,dim(ld)[1])
ld$Credit.GradeAA=ifelse(ld$Credit.Grade=="AA",v1,v2)
ld$Credit.GradeA=ifelse(ld$Credit.Grade=="A",v1,v2)
ld$Credit.GradeB=ifelse(ld$Credit.Grade=="B",v1,v2)
ld$Credit.GradeC=ifelse(ld$Credit.Grade=="C",v1,v2)
ld$Credit.GradeD=ifelse(ld$Credit.Grade=="D",v1,v2)
ld$Credit.GradeE=ifelse(ld$Credit.Grade=="E",v1,v2)
ld$Credit.GradeHR=ifelse(ld$Credit.Grade=="HR",v1,v2)
xx=cbind(response,Amount=ld$Amount,Age=ld$Age,BorrowerRate=ld$Borrower.Rate,DebtIncomeRatio=ld$Debt.To.Income.Ratio,
CreditGradeAA=ld$Credit.GradeAA,CreditGradeA=ld$Credit.GradeA,CreditGradeB=ld$Credit.GradeB,CreditGradeC=ld$Credit.GradeC,
CreditGradeD=ld$Credit.GradeD,CreditGradeE=ld$Credit.GradeE,CreditGradeHR=ld$Credit.GradeHR)
xx[1:3,]
## response Amount Age BorrowerRate DebtIncomeRatio CreditGradeAA
## [1,] 1 5000 4 0.150 0.04 0
## [2,] 1 1900 6 0.265 0.02 0
## [3,] 1 1000 3 0.150 0.02 0
## CreditGradeA CreditGradeB CreditGradeC CreditGradeD CreditGradeE
## [1,] 0 0 1 0 0
## [2,] 0 0 0 0 0
## [3,] 0 0 0 0 0
## CreditGradeHR
## [1,] 0
## [2,] 1
## [3,] 1
The data is separated into a training set and a test set.
n=dim(ld)[1]
n
## [1] 5611
n1=floor(n*(0.6))
n1
## [1] 3366
n2=n-n1
n2
## [1] 2245
train=sample(1:n,n1)
The following is a model fitted on all the data.
m1=glm(response~.,family=binomial,data=data.frame(xx))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1)
##
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(xx))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2427 0.1372 0.2384 0.3949 2.1454
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.965e+00 4.773e-01 12.497 < 2e-16 ***
## Amount -4.575e-05 1.585e-05 -2.887 0.003888 **
## Age -3.651e-01 2.212e-02 -16.507 < 2e-16 ***
## BorrowerRate -1.114e+01 1.276e+00 -8.728 < 2e-16 ***
## DebtIncomeRatio 1.917e-01 2.495e-01 0.768 0.442242
## CreditGradeAA 2.277e+00 6.053e-01 3.762 0.000169 ***
## CreditGradeA 1.662e+00 5.041e-01 3.297 0.000979 ***
## CreditGradeB 1.369e+00 4.326e-01 3.165 0.001550 **
## CreditGradeC 1.699e+00 3.982e-01 4.267 1.98e-05 ***
## CreditGradeD 1.628e+00 3.750e-01 4.341 1.42e-05 ***
## CreditGradeE 1.043e+00 3.525e-01 2.958 0.003101 **
## CreditGradeHR 5.177e-01 3.473e-01 1.491 0.136006
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3010.3 on 5610 degrees of freedom
## Residual deviance: 2371.0 on 5599 degrees of freedom
## AIC: 2395
##
## Number of Fisher Scoring iterations: 16
xx=xx[,-1]
xtrain <- xx[train,]
xnew <- xx[-train,]
ytrain <- response[train]
ynew <- response[-train]
The following is a model fitted on the training data set.
m2=glm(response~.,family=binomial,data=data.frame(response=ytrain,xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m2)
##
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(response = ytrain,
## xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2961 0.1453 0.2465 0.3981 2.0036
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.517e+00 6.565e-01 9.927 < 2e-16 ***
## Amount -3.427e-05 2.072e-05 -1.654 0.0981 .
## Age -3.481e-01 2.815e-02 -12.365 < 2e-16 ***
## BorrowerRate -1.199e+01 1.636e+00 -7.331 2.28e-13 ***
## DebtIncomeRatio 1.724e-03 3.958e-02 0.044 0.9653
## CreditGradeAA 1.647e+00 8.077e-01 2.040 0.0414 *
## CreditGradeA 7.227e-01 6.522e-01 1.108 0.2678
## CreditGradeB 6.974e-01 5.892e-01 1.184 0.2365
## CreditGradeC 1.271e+00 5.634e-01 2.257 0.0240 *
## CreditGradeD 1.123e+00 5.267e-01 2.132 0.0330 *
## CreditGradeE 7.294e-01 5.022e-01 1.452 0.1464
## CreditGradeHR 6.914e-02 4.958e-01 0.139 0.8891
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1806.1 on 3365 degrees of freedom
## Residual deviance: 1439.8 on 3354 degrees of freedom
## AIC: 1463.8
##
## Number of Fisher Scoring iterations: 14
ptest=predict(m2,newdata=data.frame(xnew),type="response")
hist(ptest)

The following shows results for a probability of 0.5 or larger coding as a 1 (“Good”).
plot(ynew~ptest)

gg1=floor(ptest+0.5)
ttt=table(ynew,gg1)
ttt
## gg1
## ynew 0 1
## 0 14 156
## 1 9 2066
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.07349666
The model is not very good at predicting bad loans, out of the 170 bad loans it predicted 14 of them.
The following shows results for a probability of 0.3 or larger coding as a 1 (“Good”).
gg2=floor(ptest+0.7)
ttt=table(ynew,gg2)
ttt
## gg2
## ynew 0 1
## 0 9 161
## 1 2 2073
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.07260579
When the floor is raised to 0.3 or model is better at predicting “good” loans as “good” but becomes worse at predicting “bad” loans as “bad”.
bb=cbind(ptest,ynew)
head(bb)
## ptest ynew
## 1 0.9881787 1
## 2 0.9340965 1
## 3 0.7077864 1
## 4 0.9846826 1
## 5 0.8992271 1
## 6 0.8699288 1
bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)
## ptest ynew
## 2242 1 1
## 2243 1 1
## 2244 1 1
## 2245 1 1
## 2240 1 1
## 2241 1 1
The following is the overall success probability in the test data set.
xbar=mean(ynew)
xbar
## [1] 0.9242762
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:20,]
## ay ax
## 2242 1.0000000 1 1 0.9242762
## 2243 1.0000000 1 2 1.8485523
## 2244 1.0000000 1 3 2.7728285
## 2245 1.0000000 1 4 3.6971047
## 2240 1.0000000 1 5 4.6213808
## 2241 1.0000000 1 6 5.5456570
## 2239 1.0000000 1 7 6.4699332
## 2238 0.9999998 1 8 7.3942094
## 2237 0.9999997 1 9 8.3184855
## 2235 0.9999995 1 10 9.2427617
## 2236 0.9999984 1 11 10.1670379
## 2234 0.9999425 1 12 11.0913140
## 2233 0.9999231 1 13 12.0155902
## 89 0.9990858 1 14 12.9398664
## 1639 0.9990356 1 15 13.8641425
## 261 0.9990352 1 16 14.7884187
## 164 0.9990352 1 17 15.7126949
## 2232 0.9990091 1 18 16.6369710
## 104 0.9989887 1 19 17.5612472
## 32 0.9989756 1 20 18.4855234
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

This model is not very good, it has very little area underneath the curve.
Data Set 2: Titanic.csv
library(car)
set.seed(1)
This data lists the characteristics of 1043 passengers on the Titanic and if they survived. The following model will attempt to accurately predict if a passenger will survive (coded as 1) the sinking of the Titanic.tt <- read.csv(“C:/DataMining/Data/Titanic.csv”)
tt <- read.csv("C:/DataMining/Data/Titanic.csv")
tt[1:3,]
## Class Survived Gender Age NumbSibOrSpsAbd NumbParOrChildAbd Fare
## 1 1 1 female 29.0000 0 0 211.3375
## 2 1 1 male 0.9167 1 2 151.5500
## 3 1 0 female 2.0000 1 2 151.5500
## EmbarkedAt HomeAndDestination Home KnownDestination
## 1 S St Louis, MO St Louis, MO
## 2 S Montreal, PQ / Chesterville, ON Montreal, PQ Chesterville, ON
## 3 S Montreal, PQ / Chesterville, ON Montreal, PQ Chesterville, ON
tt=tt[,c(-9,-10,-11)]
response=tt$Survived
Indicator variables are created for number of siblings or a spouse aboard and number of parents (for a child) or children (for a parent) who were also aboard and where they embarked.
v3=rep(1,dim(tt)[1])
v4=rep(0,dim(tt)[1])
tt$Class1=ifelse(tt$NumbSibOrSpsAbd==1,v3,v4)
tt$Class2=ifelse(tt$NumbSibOrSpsAbd==2,v3,v4)
tt$siblingspouseaboard1=ifelse(tt$NumbSibOrSpsAbd==1,v3,v4)
tt$siblingspouseaboard2=ifelse(tt$NumbSibOrSpsAbd==2,v3,v4)
tt$siblingspouseaboard3=ifelse(tt$NumbSibOrSpsAbd==3,v3,v4)
tt$siblingspouseaboard4=ifelse(tt$NumbSibOrSpsAbd==4,v3,v4)
tt$siblingspouseaboard5=ifelse(tt$NumbSibOrSpsAbd==5,v3,v4)
tt$siblingspouseaboard8=ifelse(tt$NumbSibOrSpsAbd==8,v3,v4)
tt$parentchildrenaboard1=ifelse(tt$NumbParOrChildAbd==1,v3,v4)
tt$parentchildrenaboard2=ifelse(tt$NumbParOrChildAbd==2,v3,v4)
tt$parentchildrenaboard4=ifelse(tt$NumbParOrChildAbd==4,v3,v4)
tt$parentchildrenaboard5=ifelse(tt$NumbParOrChildAbd==5,v3,v4)
tt$parentchildrenaboard6=ifelse(tt$NumbParOrChildAbd==6,v3,v4)
tt$embarkedSouthampton=ifelse(tt$EmbarkedAt=="S",v3,v4)
tt$embarkedCherbourg=ifelse(tt$EmbarkedAt=="C",v3,v4)
tt$embarkedQueenstown=ifelse(tt$EmbarkedAt=="Q",v3,v4)
tt$female=ifelse(tt$Gender=="female",v3,v4)
tt=tt[,c(-1,-3,-5,-6,-8)]
tt=cbind(response,Age=tt$Age,Fare=tt$Fare,Female=tt$female,class1=tt$class1,class2=tt$class2,siblingspouse1=tt$siblingspouseaboard1,siblingspouse2=tt$siblingspouseaboard2,siblingspouse3=tt$siblingspouseaboard3,siblingspouse4=tt$siblingspouseaboard4,siblingspouse5=tt$siblingspouseaboard5,siblingspouse8=tt$siblingspouseaboard8,parentchildrenaboard1=tt$parentchildrenaboard1,parentchildrenaboard2=tt$parentchildrenaboard2,parentchildrenaboard3=tt$parentchildrenaboard3,parentchildrenaboard4=tt$parentchildrenaboard4,parentchildrenaboard5=tt$parentchildrenaboard5,parentchildrenaboard6=tt$parentchildrenaboard6,embarkedSouthampton=tt$embarkedSouthampton,embarkedCherbourg=tt$embarkedCherbourg,embarkedQueenstown=tt$embarkedQueenstown)
The data is separated into a training set and a test set.
n=dim(tt)[1]
n
## [1] 1043
n1=floor(n*(0.6))
n1
## [1] 625
n2=n-n1
n2
## [1] 418
train=sample(1:n,n1)
The following is a model fitted on all the data.
m3=glm(response~.,family=binomial,data=data.frame(tt))
summary(m3)
##
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(tt))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3760 -0.6276 -0.5254 0.6713 2.4577
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.279e+00 4.287e-01 -5.316 1.06e-07 ***
## Age -1.476e-02 6.206e-03 -2.379 0.017366 *
## Fare 8.235e-03 2.135e-03 3.858 0.000114 ***
## Female 2.496e+00 1.720e-01 14.515 < 2e-16 ***
## siblingspouse1 -7.717e-02 1.952e-01 -0.395 0.692513
## siblingspouse2 -5.466e-01 4.612e-01 -1.185 0.235973
## siblingspouse3 -1.717e+00 6.850e-01 -2.506 0.012194 *
## siblingspouse4 -2.308e+00 7.363e-01 -3.135 0.001718 **
## siblingspouse5 -1.613e+01 5.064e+02 -0.032 0.974598
## siblingspouse8 -1.505e+01 1.455e+03 -0.010 0.991750
## parentchildrenaboard1 5.836e-01 2.424e-01 2.408 0.016062 *
## parentchildrenaboard2 4.328e-01 3.234e-01 1.338 0.180765
## parentchildrenaboard4 -2.278e+00 1.278e+00 -1.782 0.074798 .
## parentchildrenaboard5 -1.819e+00 1.158e+00 -1.571 0.116172
## parentchildrenaboard6 -1.547e+01 8.635e+02 -0.018 0.985710
## embarkedSouthampton 9.698e-01 3.973e-01 2.441 0.014640 *
## embarkedCherbourg 1.721e+00 4.338e-01 3.968 7.25e-05 ***
## embarkedQueenstown NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 988.03 on 1026 degrees of freedom
## AIC: 1022
##
## Number of Fisher Scoring iterations: 14
tt=tt[,-1]
xtrain <- tt[train,]
xnew <- tt[-train,]
ytrain <- response[train]
ynew <- response[-train]
## The following is a model fitted on the training data set.
m4=glm(response~.,family=binomial,data=data.frame(response=ytrain,xtrain))
summary(m4)
##
## Call:
## glm(formula = response ~ ., family = binomial, data = data.frame(response = ytrain,
## xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5031 -0.6194 -0.5026 0.6867 2.3396
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.103e+00 5.223e-01 -4.026 5.68e-05 ***
## Age -1.907e-02 8.258e-03 -2.309 0.02096 *
## Fare 1.269e-02 3.235e-03 3.923 8.74e-05 ***
## Female 2.421e+00 2.242e-01 10.795 < 2e-16 ***
## siblingspouse1 -1.354e-01 2.561e-01 -0.529 0.59714
## siblingspouse2 -7.247e-01 6.078e-01 -1.192 0.23309
## siblingspouse3 -3.590e+00 1.214e+00 -2.956 0.00311 **
## siblingspouse4 -2.367e+00 9.384e-01 -2.523 0.01165 *
## siblingspouse5 -1.747e+01 8.435e+02 -0.021 0.98347
## siblingspouse8 NA NA NA NA
## parentchildrenaboard1 5.746e-01 3.021e-01 1.902 0.05720 .
## parentchildrenaboard2 6.555e-01 4.457e-01 1.471 0.14136
## parentchildrenaboard4 -1.721e+01 1.598e+03 -0.011 0.99141
## parentchildrenaboard5 -1.689e+01 1.672e+03 -0.010 0.99194
## parentchildrenaboard6 -1.640e+01 1.441e+03 -0.011 0.99092
## embarkedSouthampton 8.032e-01 4.768e-01 1.684 0.09211 .
## embarkedCherbourg 1.502e+00 5.251e-01 2.861 0.00422 **
## embarkedQueenstown NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 843.64 on 624 degrees of freedom
## Residual deviance: 581.27 on 609 degrees of freedom
## AIC: 613.27
##
## Number of Fisher Scoring iterations: 15
ptest=predict(m4,newdata=data.frame(xnew),type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
hist(ptest)

The following shows results for a probability of 0.5 or larger coding as a 1 (“Survived”).
plot(ynew~ptest)

gg1=floor(ptest+0.5)
ttt=table(ynew,gg1)
ttt
## gg1
## ynew 0 1
## 0 210 36
## 1 57 115
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.222488
This model has an overall 0.223 error in predicting the survival or fatality of a passenger. The model correctly predicts 210/246 fatalities and correctly predicts 114/171 survivals.
The following shows results for a probability of 0.3 or larger coding as a 1 (“Survived”).
gg2=floor(ptest+0.7)
ttt=table(ynew,gg2)
ttt
## gg2
## ynew 0 1
## 0 196 50
## 1 41 131
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.2177033
When the floor is raised to 0.3 the model correctly predicts more survivals as survivals but incorrectly predicts more fatalities as survivals.
gg2=floor(ptest+0.7)
ttt=table(ynew,gg2)
ttt
## gg2
## ynew 0 1
## 0 196 50
## 1 41 131
error=(ttt[1,2]+ttt[2,1])/n2
error
## [1] 0.2177033
When the floor is raised to 0.3 the model correctly predicts more survivals as survivals but incorrectly predicts more fatalities as survivals.
bb=cbind(ptest,ynew)
head(bb)
## ptest ynew
## 1 0.9626943 1
## 2 0.9714365 0
## 3 0.6392746 0
## 4 0.6844087 1
## 5 0.5098368 1
## 6 0.2099026 0
bb1=bb[order(ptest,decreasing=TRUE),]
head(bb1)
## ptest ynew
## 113 0.9995269 1
## 31 0.9886168 1
## 20 0.9865147 1
## 8 0.9856515 1
## 118 0.9818502 1
## 14 0.9803041 1
The following is the overall success probability in the test data set.
xbar=mean(ynew)
xbar
## [1] 0.4114833
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:20,]
## ay ax
## 113 0.9995269 1 1 0.4114833
## 31 0.9886168 1 2 0.8229665
## 20 0.9865147 1 3 1.2344498
## 8 0.9856515 1 4 1.6459330
## 118 0.9818502 1 5 2.0574163
## 14 0.9803041 1 6 2.4688995
## 95 0.9723015 1 7 2.8803828
## 2 0.9714365 0 7 3.2918660
## 1 0.9626943 1 8 3.7033493
## 105 0.9609897 1 9 4.1148325
## 119 0.9496209 1 10 4.5263158
## 109 0.9489939 1 11 4.9377990
## 28 0.9477187 1 12 5.3492823
## 181 0.9452778 1 13 5.7607656
## 64 0.9440584 1 14 6.1722488
## 85 0.9406016 1 15 6.5837321
## 343 0.9344197 1 16 6.9952153
## 108 0.9308297 0 16 7.4066986
## 34 0.9290092 1 17 7.8181818
## 15 0.9226421 1 18 8.2296651
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

This model has some lift on the ROC graph but isn’t and does a generally good job at predicting survival.