library(car)
set.seed(1)
download.file("https://www.biz.uiowa.edu/faculty/jledolter/datamining/LoanData.csv","LoanData.csv")
LD <- read.csv("LoanData.csv")
## code Status as 1 for 'Current' (good loan) and 0 for every other status (bad loan)
LD$Status = recode(LD$Status, "'Current'=1; else=0" )
LD$Status = as.numeric(levels(LD$Status)[LD$Status])
n=length(LD$Status)
n
## [1] 5611
n1=floor(n*(0.6))
n1
## [1] 3366
n2=n-n1
n2
## [1] 2245
train=sample(1:n,n1)
XLD <- model.matrix(Status~.,data=LD)[,-1]
XLD[1:3,]
##   Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1              0             0             1             0             0
## 2              0             0             0             0             0
## 3              0             0             0             0             0
##   Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1              0              0   5000   4         0.150
## 2              1              0   1900   6         0.265
## 3              1              0   1000   3         0.150
##   Debt.To.Income.Ratio
## 1                 0.04
## 2                 0.02
## 3                 0.02
Status is the dependent variable here: the remaining variables are used to predict it. A status of "Current" indicates a good loan, while late or defaulted statuses indicate bad loans, so the recoding above groups all of the bad statuses together as 0 and codes the good status as 1.
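The same grouping can be done without the car package; the following is a minimal sketch (the name LD2 is only illustrative), assuming Status comes in as a character or factor column:
## Sketch: 1 for 'Current' (good loan), 0 for every other status (bad loan)
LD2 <- read.csv("LoanData.csv")
LD2$Status <- ifelse(LD2$Status == "Current", 1, 0)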
xtrain <- XLD[train, ]
xnew <- XLD[-train,]
ytrain <- LD$Status[train]
ynew <- LD$Status[-train]
m1=glm(Status~.,family=binomial,data=data.frame(Status=ytrain,xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1)
##
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain,
## xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2961 0.1453 0.2465 0.3981 2.0036
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.240e+00 5.089e-01 14.225 < 2e-16 ***
## Credit.GradeAA 9.247e-01 6.850e-01 1.350 0.1770
## Credit.GradeB -2.536e-02 4.489e-01 -0.056 0.9550
## Credit.GradeC 5.488e-01 4.575e-01 1.200 0.2303
## Credit.GradeD 4.003e-01 4.548e-01 0.880 0.3788
## Credit.GradeE 6.680e-03 4.640e-01 0.014 0.9885
## Credit.GradeHR -6.536e-01 4.739e-01 -1.379 0.1679
## Credit.GradeNC -7.227e-01 6.522e-01 -1.108 0.2678
## Amount -3.427e-05 2.072e-05 -1.654 0.0981 .
## Age -3.481e-01 2.815e-02 -12.365 < 2e-16 ***
## Borrower.Rate -1.199e+01 1.636e+00 -7.331 2.28e-13 ***
## Debt.To.Income.Ratio 1.724e-03 3.958e-02 0.044 0.9653
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1806.1 on 3365 degrees of freedom
## Residual deviance: 1439.8 on 3354 degrees of freedom
## AIC: 1463.8
##
## Number of Fisher Scoring iterations: 14
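To help interpret the fitted model, the coefficients can be exponentiated into odds ratios; a minimal sketch using the m1 fit above (Wald intervals via confint.default to avoid slow profiling):
## odds ratios with approximate 95% confidence intervals
exp(cbind(OR = coef(m1), confint.default(m1)))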
ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]
## ynew ptest
## 1 1 0.9881787
## 6 1 0.9340965
## 12 1 0.7077864
## 13 1 0.9846826
## 15 1 0.8992271
## 16 1 0.8699288
## 17 0 0.7148193
## 18 1 0.7831417
## 20 1 0.8274410
## 21 1 0.9954536
bb=cbind(ptest,ynew)
bb1=bb[order(ptest,decreasing=TRUE),]
xbar=mean(ynew)
xbar
## [1] 0.9242762
## build the lift-chart vectors case by case (dim(n2) is NULL, so these start empty)
axis=dim(n2)  ## case index 1..n2
ax=dim(n2)    ## expected cumulative successes under a random ordering
ay=dim(n2)    ## actual cumulative successes, sorted by predicted probability
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
plot(axis,ay,xlab="number of cases",ylab="number of successes",
main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")
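The same lift curve can be built more compactly with cumsum(); a minimal sketch, reusing bb1, xbar, and n2 from above (cum_actual and cum_expected are illustrative names):
cum_actual <- cumsum(bb1[,2])        ## cumulative successes, sorted by decreasing ptest
cum_expected <- xbar*seq_len(n2)     ## expected successes under a random ordering
plot(seq_len(n2), cum_actual, xlab="number of cases", ylab="number of successes",
     main="Lift chart (cumsum sketch)")
lines(seq_len(n2), cum_expected)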
titanic <- read.csv("Titanic.csv")  ## adjust the path to where Titanic.csv is stored
## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))
titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)
titanic =titanic[-3]
titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)
titanic = titanic[-7]
NewTitanic = cbind(Survived,
                   Class = titanic$Class,
                   Age = titanic$Age,
                   NumbSibOrSpsAbd = titanic$NumbSibOrSpsAbd,
                   NumbParOrChildAbd = titanic$NumbParOrChildAbd,
                   Fare = titanic$Fare,
                   GenderMale = titanic$GenderMale,
                   EmbarkedAtC = titanic$EmbarkedAtC,
                   EmbarkedAtQ = titanic$EmbarkedAtQ)
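As in the loan example, model.matrix() could build these dummy columns automatically; a minimal sketch, under the assumption that the eight columns left after dropping the last three of Titanic.csv are the variables used above (tit8 and Xtit are illustrative names):
## Sketch: tit8 is the 8-column frame before Gender and EmbarkedAt were dropped
tit8 <- read.csv("Titanic.csv")[-9:-11]
Xtit <- model.matrix(Survived ~ ., data = tit8)[,-1]  ## dummies for Gender and EmbarkedAt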
n = length(titanic$Survived)
n1=floor(n*(0.6))
n2 = n - n1
train = sample(1:n, n1)
m2 = glm(Survived~., family = binomial, data.frame(NewTitanic))  ## note: fit on all rows, not just the training sample
summary(m2)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5726 -0.6846 -0.4058 0.6456 2.5376
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.534836 0.486022 9.331 < 2e-16 ***
## Class -1.009327 0.133464 -7.563 3.95e-14 ***
## Age -0.037687 0.006634 -5.681 1.34e-08 ***
## NumbSibOrSpsAbd -0.348021 0.108435 -3.209 0.00133 **
## NumbParOrChildAbd 0.049846 0.104193 0.478 0.63236
## Fare 0.000463 0.001934 0.239 0.81084
## GenderMale -2.608938 0.179304 -14.550 < 2e-16 ***
## EmbarkedAtC 0.679066 0.211566 3.210 0.00133 **
## EmbarkedAtQ -0.767411 0.406290 -1.889 0.05892 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 954.75 on 1034 degrees of freedom
## AIC: 972.75
##
## Number of Fisher Scoring iterations: 5
NewTitanic = NewTitanic[,-1]
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]
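Because m2 was fit on the full data set, the holdout error below is somewhat optimistic; to mirror the loan example, the model could instead be refit on the training rows before predicting. A minimal sketch (m2train and ptrain_only are illustrative names):
m2train <- glm(Survived~., family=binomial, data=data.frame(Survived=ytrain, Xtrain))
ptrain_only <- predict(m2train, newdata=data.frame(xnew), type="response")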
ptest = predict(m2, newdata=data.frame(xnew), type="response")
hist(ptest)
g=floor(ptest+0.4) ## classify as 1 when the predicted probability is at least 0.6
t=table(ynew,g)
t
##     g
## ynew   0   1
##    0 214  23
##    1  67 114
error=(t[1,2]+t[2,1])/n2
## misclassification rate on the test set (a proportion, not a percent)
error
## [1] 0.215311
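For reference, floor(ptest + 0.4) is the same as classifying at a 0.6 cutoff; a short sketch with an explicit threshold (cutoff and g2 are illustrative names):
cutoff <- 0.6                        ## same rule as floor(ptest + 0.4)
g2 <- ifelse(ptest >= cutoff, 1, 0)  ## predicted class for each holdout case
mean(g2 != ynew)                     ## misclassification rate on the test set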
b=cbind(ptest,ynew)
b[1:10,]
## ptest ynew
## 1 0.6690433 1
## 2 0.3651202 0
## 3 0.9265782 1
## 4 0.9063001 1
## 5 0.9221009 1
## 6 0.7944524 1
## 7 0.6524415 1
## 8 0.9020214 1
## 9 0.5767017 1
## 10 0.8782562 1
b1=b[order(ptest,decreasing=TRUE),]
b1[1:10,]
## ptest ynew
## 28 0.9706331 1
## 21 0.9685112 1
## 37 0.9657294 1
## 51 0.9578416 1
## 90 0.9578213 1
## 77 0.9564087 1
## 65 0.9544449 1
## 54 0.9521481 1
## 57 0.9508543 1
## 165 0.9492367 1
xbar=mean(ynew)
## same lift-chart construction as for the loan data, now using b1 (not bb1 from the loan example)
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=b1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+b1[i,2]
}
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")