library(car)
set.seed(1)
download.file("https://www.biz.uiowa.edu/faculty/jledolter/datamining/LoanData.csv","LoanData.csv")
LD <- read.csv("LoanData.csv")
## code Status as 1 for 'Current' (good loan) and 0 for every other status (bad loan)
LD$Status = recode(LD$Status, "'Current'=1; else=0" )
LD$Status = as.numeric(levels(LD$Status)[LD$Status])
n=length(LD$Status)
n
## [1] 5611
n1=floor(n*(0.6))
n1
## [1] 3366
n2=n-n1
n2
## [1] 2245
train=sample(1:n,n1)
XLD <- model.matrix(Status~.,data=LD)[,-1]
XLD[1:3,]
##   Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1              0             0             1             0             0
## 2              0             0             0             0             0
## 3              0             0             0             0             0
##   Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1              0              0   5000   4         0.150
## 2              1              0   1900   6         0.265
## 3              1              0   1000   3         0.150
##   Debt.To.Income.Ratio
## 1                 0.04
## 2                 0.02
## 3                 0.02
Status is the dependent variable here: the remaining variables are used to predict it. A status of "Current" indicates a good loan, while late or defaulted statuses indicate bad loans, so the recoding above groups all of the bad statuses together as 0 and codes the good status as 1.
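The same grouping can be done without the car package; the following is a minimal sketch (the name LD2 is only illustrative), assuming Status comes in as a character or factor column:
## Sketch: 1 for 'Current' (good loan), 0 for every other status (bad loan)
LD2 <- read.csv("LoanData.csv")
LD2$Status <- ifelse(LD2$Status == "Current", 1, 0)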
xtrain <- XLD[train, ]
xnew <- XLD[-train,]
ytrain <- LD$Status[train]
ynew <- LD$Status[-train]
m1=glm(Status~.,family=binomial,data=data.frame(Status=ytrain,xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1)
##
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain,
## xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2961 0.1453 0.2465 0.3981 2.0036
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.240e+00 5.089e-01 14.225 < 2e-16 ***
## Credit.GradeAA 9.247e-01 6.850e-01 1.350 0.1770
## Credit.GradeB -2.536e-02 4.489e-01 -0.056 0.9550
## Credit.GradeC 5.488e-01 4.575e-01 1.200 0.2303
## Credit.GradeD 4.003e-01 4.548e-01 0.880 0.3788
## Credit.GradeE 6.680e-03 4.640e-01 0.014 0.9885
## Credit.GradeHR -6.536e-01 4.739e-01 -1.379 0.1679
## Credit.GradeNC -7.227e-01 6.522e-01 -1.108 0.2678
## Amount -3.427e-05 2.072e-05 -1.654 0.0981 .
## Age -3.481e-01 2.815e-02 -12.365 < 2e-16 ***
## Borrower.Rate -1.199e+01 1.636e+00 -7.331 2.28e-13 ***
## Debt.To.Income.Ratio 1.724e-03 3.958e-02 0.044 0.9653
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1806.1 on 3365 degrees of freedom
## Residual deviance: 1439.8 on 3354 degrees of freedom
## AIC: 1463.8
##
## Number of Fisher Scoring iterations: 14
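To help interpret the fitted model, the coefficients can be exponentiated into odds ratios; a minimal sketch using the m1 fit above (Wald intervals via confint.default to avoid slow profiling):
## odds ratios with approximate 95% confidence intervals
exp(cbind(OR = coef(m1), confint.default(m1)))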
ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]
## ynew ptest
## 1 1 0.9881787
## 6 1 0.9340965
## 12 1 0.7077864
## 13 1 0.9846826
## 15 1 0.8992271
## 16 1 0.8699288
## 17 0 0.7148193
## 18 1 0.7831417
## 20 1 0.8274410
## 21 1 0.9954536
bb=cbind(ptest,ynew)
bb1=bb[order(ptest,decreasing=TRUE),]
xbar=mean(ynew)
xbar
## [1] 0.9242762
## build the lift-chart vectors case by case (dim(n2) is NULL, so these start empty)
axis=dim(n2)  ## case index 1..n2
ax=dim(n2)    ## expected cumulative successes under a random ordering
ay=dim(n2)    ## actual cumulative successes, sorted by predicted probability
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
plot(axis,ay,xlab="number of cases",ylab="number of successes",
main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")
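The same lift curve can be built more compactly with cumsum(); a minimal sketch, reusing bb1, xbar, and n2 from above (cum_actual and cum_expected are illustrative names):
cum_actual <- cumsum(bb1[,2])        ## cumulative successes, sorted by decreasing ptest
cum_expected <- xbar*seq_len(n2)     ## expected successes under a random ordering
plot(seq_len(n2), cum_actual, xlab="number of cases", ylab="number of successes",
     main="Lift chart (cumsum sketch)")
lines(seq_len(n2), cum_expected)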
titanic <- read.csv("Titanic.csv")  ## adjust the path to where Titanic.csv is stored
## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))
titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)
titanic =titanic[-3]
titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)
titanic = titanic[-7]
NewTitanic = cbind(Survived,
                   Class = titanic$Class,
                   Age = titanic$Age,
                   NumbSibOrSpsAbd = titanic$NumbSibOrSpsAbd,
                   NumbParOrChildAbd = titanic$NumbParOrChildAbd,
                   Fare = titanic$Fare,
                   GenderMale = titanic$GenderMale,
                   EmbarkedAtC = titanic$EmbarkedAtC,
                   EmbarkedAtQ = titanic$EmbarkedAtQ)
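As in the loan example, model.matrix() could build these dummy columns automatically; a minimal sketch, under the assumption that the eight columns left after dropping the last three of Titanic.csv are the variables used above (tit8 and Xtit are illustrative names):
## Sketch: tit8 is the 8-column frame before Gender and EmbarkedAt were dropped
tit8 <- read.csv("Titanic.csv")[-9:-11]
Xtit <- model.matrix(Survived ~ ., data = tit8)[,-1]  ## dummies for Gender and EmbarkedAt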
n = length(titanic$Survived)
n1=floor(n*(0.6))
n2 = n - n1
train = sample(1:n, n1)
m2 = glm(Survived~., family = binomial, data.frame(NewTitanic))  ## note: fit on all rows, not just the training sample
summary(m2)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5726 -0.6846 -0.4058 0.6456 2.5376
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.534836 0.486022 9.331 < 2e-16 ***
## Class -1.009327 0.133464 -7.563 3.95e-14 ***
## Age -0.037687 0.006634 -5.681 1.34e-08 ***
## NumbSibOrSpsAbd -0.348021 0.108435 -3.209 0.00133 **
## NumbParOrChildAbd 0.049846 0.104193 0.478 0.63236
## Fare 0.000463 0.001934 0.239 0.81084
## GenderMale -2.608938 0.179304 -14.550 < 2e-16 ***
## EmbarkedAtC 0.679066 0.211566 3.210 0.00133 **
## EmbarkedAtQ -0.767411 0.406290 -1.889 0.05892 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 954.75 on 1034 degrees of freedom
## AIC: 972.75
##
## Number of Fisher Scoring iterations: 5
NewTitanic = NewTitanic[,-1]
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]
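Because m2 was fit on the full data set, the holdout error below is somewhat optimistic; to mirror the loan example, the model could instead be refit on the training rows before predicting. A minimal sketch (m2train and ptrain_only are illustrative names):
m2train <- glm(Survived~., family=binomial, data=data.frame(Survived=ytrain, Xtrain))
ptrain_only <- predict(m2train, newdata=data.frame(xnew), type="response")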
ptest = predict(m2, newdata=data.frame(xnew), type="response")
hist(ptest)
g=floor(ptest+0.4) ## classify as 1 when the predicted probability is at least 0.6
t=table(ynew,g)
t
##     g
## ynew   0   1
##    0 214  23
##    1  67 114
error=(t[1,2]+t[2,1])/n2
## misclassification rate on the test set (a proportion, not a percent)
error
## [1] 0.215311
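For reference, floor(ptest + 0.4) is the same as classifying at a 0.6 cutoff; a short sketch with an explicit threshold (cutoff and g2 are illustrative names):
cutoff <- 0.6                        ## same rule as floor(ptest + 0.4)
g2 <- ifelse(ptest >= cutoff, 1, 0)  ## predicted class for each holdout case
mean(g2 != ynew)                     ## misclassification rate on the test set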
b=cbind(ptest,ynew)
b[1:10,]
## ptest ynew
## 1 0.6690433 1
## 2 0.3651202 0
## 3 0.9265782 1
## 4 0.9063001 1
## 5 0.9221009 1
## 6 0.7944524 1
## 7 0.6524415 1
## 8 0.9020214 1
## 9 0.5767017 1
## 10 0.8782562 1
b1=b[order(ptest,decreasing=TRUE),]
b1[1:10,]
## ptest ynew
## 28 0.9706331 1
## 21 0.9685112 1
## 37 0.9657294 1
## 51 0.9578416 1
## 90 0.9578213 1
## 77 0.9564087 1
## 65 0.9544449 1
## 54 0.9521481 1
## 57 0.9508543 1
## 165 0.9492367 1
xbar=mean(ynew)
## same lift-chart construction as for the loan data, now using b1 (not bb1 from the loan example)
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=b1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+b1[i,2]
}
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")