Loan Data

library(car)
set.seed(1)
download.file("https://www.biz.uiowa.edu/faculty/jledolter/datamining/LoanData.csv","LoanData.csv")
LD <- read.csv("LoanData.csv")
LD$Status = recode(LD$Status, "'Current'=1; else=0")   ## 1 = good loan ("Current"), 0 = everything else
LD$Status = as.numeric(levels(LD$Status)[LD$Status])   ## convert the recoded factor to numeric 0/1
n=length(LD$Status)
n
## [1] 5611
n1=floor(n*(0.6))
n1
## [1] 3366
n2=n-n1
n2
## [1] 2245
train=sample(1:n,n1)
XLD <- model.matrix(Status~.,data=LD)[,-1]   ## design matrix of predictors; [,-1] drops the intercept column
XLD[1:3,]
##   Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1              0             0             1             0             0
## 2              0             0             0             0             0
## 3              0             0             0             0             0
##   Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1              0              0   5000   4         0.150
## 2              1              0   1900   6         0.265
## 3              1              0   1000   3         0.150
##   Debt.To.Income.Ratio
## 1                 0.04
## 2                 0.02
## 3                 0.02

Status is the dependent variable; the remaining columns (credit grade, loan amount, age, borrower rate, and debt-to-income ratio) are the predictors used to model it. A status of "Current" marks a good loan, while the late and defaulted statuses mark bad loans, so recoding Status to 1 for "Current" and 0 otherwise groups all of the bad statuses together.
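
For reference, the same recoding can be done in base R without the car package. The short sketch below works on a fresh copy of the Status column (re-read from the downloaded file) so that it does not disturb the already-recoded LD, and then checks the resulting class balance:

## base-R equivalent of the recode step, run on a fresh copy of Status
status_raw <- read.csv("LoanData.csv")$Status
status01 <- ifelse(status_raw == "Current", 1, 0)
table(status01)   ## split of good (1) vs. bad (0) loans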

xtrain <- XLD[train, ]
xnew <- XLD[-train,]
ytrain <- LD$Status[train]
ynew <- LD$Status[-train]
m1=glm(Status~.,family=binomial,data=data.frame(Status=ytrain,xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1)
## 
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2961   0.1453   0.2465   0.3981   2.0036  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           7.240e+00  5.089e-01  14.225  < 2e-16 ***
## Credit.GradeAA        9.247e-01  6.850e-01   1.350   0.1770    
## Credit.GradeB        -2.536e-02  4.489e-01  -0.056   0.9550    
## Credit.GradeC         5.488e-01  4.575e-01   1.200   0.2303    
## Credit.GradeD         4.003e-01  4.548e-01   0.880   0.3788    
## Credit.GradeE         6.680e-03  4.640e-01   0.014   0.9885    
## Credit.GradeHR       -6.536e-01  4.739e-01  -1.379   0.1679    
## Credit.GradeNC       -7.227e-01  6.522e-01  -1.108   0.2678    
## Amount               -3.427e-05  2.072e-05  -1.654   0.0981 .  
## Age                  -3.481e-01  2.815e-02 -12.365  < 2e-16 ***
## Borrower.Rate        -1.199e+01  1.636e+00  -7.331 2.28e-13 ***
## Debt.To.Income.Ratio  1.724e-03  3.958e-02   0.044   0.9653    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1806.1  on 3365  degrees of freedom
## Residual deviance: 1439.8  on 3354  degrees of freedom
## AIC: 1463.8
## 
## Number of Fisher Scoring iterations: 14

First 10 predictions for the holdout cases

ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]
##    ynew     ptest
## 1     1 0.9881787
## 6     1 0.9340965
## 12    1 0.7077864
## 13    1 0.9846826
## 15    1 0.8992271
## 16    1 0.8699288
## 17    0 0.7148193
## 18    1 0.7831417
## 20    1 0.8274410
## 21    1 0.9954536
bb=cbind(ptest,ynew)                    ## predicted probability and actual outcome for each holdout case

bb1=bb[order(ptest,decreasing=TRUE),]   ## sort holdout cases by predicted probability, highest first

xbar=mean(ynew)   ## proportion of good loans in the holdout set
xbar
## [1] 0.9242762
## build the lift chart vectors (dim() of a scalar is NULL, so these start
## as empty vectors and grow inside the loop)
axis=dim(n2)   ## case index 1..n2
ax=dim(n2)     ## expected cumulative successes if cases were ordered at random
ay=dim(n2)     ## actual cumulative successes, in predicted-probability order
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

aaa=cbind(bb1[,1],bb1[,2],ay,ax)
plot(axis,ay,xlab="number of cases",ylab="number of successes",
     main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")
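
The same two curves can be computed more compactly with cumsum(); the short sketch below is equivalent to the loop above (the resulting plot would be identical, so it is not repeated here):

## compact equivalent of the lift-chart loop
ay2=cumsum(bb1[,2])   ## cumulative successes in predicted-probability order
ax2=xbar*(1:n2)       ## expected cumulative successes under random ordering
## plot(1:n2, ay2); lines(1:n2, ax2)   ## would reproduce the plot above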

Titanic

titanic <- read.csv("/Users/hannahpeterson/Titanic.csv")
## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))

titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)

titanic = titanic[-3]   ## drop the original Gender column

titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)

titanic = titanic[-7]   ## drop the original EmbarkedAt column

NewTitanic = cbind(Survived,
                   Class = titanic$Class,
                   Age = titanic$Age,
                   NumbSibOrSpsAbd = titanic$NumbSibOrSpsAbd,
                   NumbParOrChildAbd = titanic$NumbParOrChildAbd,
                   Fare = titanic$Fare,
                   GenderMale = titanic$GenderMale,
                   EmbarkedAtC = titanic$EmbarkedAtC,
                   EmbarkedAtQ = titanic$EmbarkedAtQ)
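
This manual dummy coding does by hand what model.matrix() did in the loan example. A one-line sketch of that alternative is shown as a comment only, since the original Gender and EmbarkedAt factors have already been dropped from titanic at this point (and model.matrix() picks its own reference levels, which need not match the manual choices above):

## Xtit <- model.matrix(Survived ~ ., data = titanic)[,-1]   ## automatic dummy coding (not run here)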


n = length(titanic$Survived)

n1=floor(n*(0.6))

n2 = n - n1

train = sample(1:n, n1)

m2 = glm(Survived~., family = binomial, data = data.frame(NewTitanic))
summary(m2)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5726  -0.6846  -0.4058   0.6456   2.5376  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.534836   0.486022   9.331  < 2e-16 ***
## Class             -1.009327   0.133464  -7.563 3.95e-14 ***
## Age               -0.037687   0.006634  -5.681 1.34e-08 ***
## NumbSibOrSpsAbd   -0.348021   0.108435  -3.209  0.00133 ** 
## NumbParOrChildAbd  0.049846   0.104193   0.478  0.63236    
## Fare               0.000463   0.001934   0.239  0.81084    
## GenderMale        -2.608938   0.179304 -14.550  < 2e-16 ***
## EmbarkedAtC        0.679066   0.211566   3.210  0.00133 ** 
## EmbarkedAtQ       -0.767411   0.406290  -1.889  0.05892 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  954.75  on 1034  degrees of freedom
## AIC: 972.75
## 
## Number of Fisher Scoring iterations: 5
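
Note that m2 is fit to every row of NewTitanic, so the evaluation on the holdout rows below is not a strict out-of-sample test. To mirror the loan example one could fit on the 60% training subset instead; a sketch (not evaluated here, and not used in the output that follows):

## sketch: fit the logistic regression on the training rows only, as in the loan example
m2.train = glm(Survived~., family = binomial, data = data.frame(NewTitanic)[train, ])
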
NewTitanic = NewTitanic[,-1]   ## drop Survived so only the predictors remain
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]
ptest = predict(m2, newdata=data.frame(xnew), type="response")
hist(ptest)

g=floor(ptest+0.4) ## classify as 1 when the predicted probability is 0.6 or higher
t=table(ynew,g)
t
##     g
## ynew   0   1
##    0 214  23
##    1  67 114
error=(t[1,2]+t[2,1])/n2
## misclassification error rate on the holdout set
error
## [1] 0.215311
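
An equivalent, more explicit way to apply the same cutoff and error calculation (a short sketch; it should reproduce the table and error rate above):

g2 = ifelse(ptest >= 0.6, 1, 0)   ## same rule as floor(ptest + 0.4)
table(ynew, g2)                   ## same confusion table as t
mean(g2 != ynew)                  ## same misclassification rate as error
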
b=cbind(ptest,ynew)
b[1:10,]
##        ptest ynew
## 1  0.6690433    1
## 2  0.3651202    0
## 3  0.9265782    1
## 4  0.9063001    1
## 5  0.9221009    1
## 6  0.7944524    1
## 7  0.6524415    1
## 8  0.9020214    1
## 9  0.5767017    1
## 10 0.8782562    1
b1=b[order(ptest,decreasing=TRUE),]   ## sort holdout cases by predicted probability, highest first
b1[1:10,]
##         ptest ynew
## 28  0.9706331    1
## 21  0.9685112    1
## 37  0.9657294    1
## 51  0.9578416    1
## 90  0.9578213    1
## 77  0.9564087    1
## 65  0.9544449    1
## 54  0.9521481    1
## 57  0.9508543    1
## 165 0.9492367    1
xbar=mean(ynew)   ## proportion of survivors in the holdout set

## build the lift chart vectors from b1, the Titanic holdout cases sorted by
## predicted probability
axis=dim(n2)   ## case index 1..n2
ax=dim(n2)     ## expected cumulative successes if cases were ordered at random
ay=dim(n2)     ## actual cumulative successes, in predicted-probability order
axis[1]=1
ax[1]=xbar
ay[1]=b1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+b1[i,2]
}
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")