Loan Data Analysis
| Current |
C |
5000 |
4 |
0.1500 |
0.040 |
| Current |
HR |
1900 |
6 |
0.2650 |
0.020 |
| Current |
HR |
1000 |
3 |
0.1500 |
0.020 |
| Current |
HR |
1000 |
5 |
0.2900 |
0.020 |
| Current |
AA |
2550 |
8 |
0.0795 |
0.033 |
| Current |
NC |
1500 |
2 |
0.2600 |
0.030 |
Status shows which loan is bad or good. Current is good, late and bad is bad
loandata$Status = recode(loandata$Status, "'Current'=1; else=0" )
loandata$Status = as.numeric(levels(loandata$Status)[loandata$Status])
##Changing categorical value into numerical value
n = length(loandata$Status)
n1=floor(n*(0.6))
n2 = n - n1
train = sample(1:n, n1)
Xdel <- model.matrix(Status~., data = loandata)[,-1]
Xdel[1:3,]
## Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1 0 0 1 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1 0 0 5000 4 0.150
## 2 1 0 1900 6 0.265
## 3 1 0 1000 3 0.150
## Debt.To.Income.Ratio
## 1 0.04
## 2 0.02
## 3 0.02
Xtrain <- Xdel[train, ]
xnew <- Xdel[-train,]
ytrain <- loandata$Status[train]
ynew <- loandata$Status[-train]
m1 = glm(Status~., family = binomial, data = data.frame(Status=ytrain, Xtrain))
m1
##
## Call: glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain,
## Xtrain))
##
## Coefficients:
## (Intercept) Credit.GradeAA Credit.GradeB
## 7.385e+00 7.563e-01 -2.407e-02
## Credit.GradeC Credit.GradeD Credit.GradeE
## 3.251e-01 -2.363e-01 -5.557e-01
## Credit.GradeHR Credit.GradeNC Amount
## -1.130e+00 -1.804e+00 -5.762e-05
## Age Borrower.Rate Debt.To.Income.Ratio
## -3.558e-01 -1.037e+01 1.998e-01
##
## Degrees of Freedom: 3365 Total (i.e. Null); 3354 Residual
## Null Deviance: 1826
## Residual Deviance: 1452 AIC: 1476
summary(m1)
##
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain,
## Xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1719 0.1419 0.2442 0.3983 2.0635
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.385e+00 5.239e-01 14.096 < 2e-16 ***
## Credit.GradeAA 7.563e-01 6.857e-01 1.103 0.27006
## Credit.GradeB -2.407e-02 4.607e-01 -0.052 0.95834
## Credit.GradeC 3.251e-01 4.638e-01 0.701 0.48335
## Credit.GradeD -2.363e-01 4.442e-01 -0.532 0.59479
## Credit.GradeE -5.557e-01 4.643e-01 -1.197 0.23136
## Credit.GradeHR -1.130e+00 4.755e-01 -2.376 0.01748 *
## Credit.GradeNC -1.804e+00 6.706e-01 -2.690 0.00713 **
## Amount -5.762e-05 2.062e-05 -2.794 0.00520 **
## Age -3.558e-01 2.828e-02 -12.582 < 2e-16 ***
## Borrower.Rate -1.037e+01 1.602e+00 -6.472 9.66e-11 ***
## Debt.To.Income.Ratio 1.998e-01 3.707e-01 0.539 0.58988
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1826.0 on 3365 degrees of freedom
## Residual deviance: 1451.6 on 3354 degrees of freedom
## AIC: 1475.6
##
## Number of Fisher Scoring iterations: 17
First 10 predictions
ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]
## ynew ptest
## 3 1 0.9728342
## 4 1 0.8045158
## 5 1 0.9870006
## 8 1 0.8797900
## 9 1 0.9988177
## 11 1 0.9956890
## 12 1 0.6972082
## 13 1 0.9826625
## 25 1 0.9447024
## 27 1 0.9918913
Making logistic regression graph
bb=cbind(ptest,ynew)
bb1=bb[order(ptest,decreasing=TRUE),]
xbar=mean(ynew)
xbar
## [1] 0.9260579
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
plot(axis,ay,xlab="number of cases",ylab="number of successes",
main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

Titanic Analysis
## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))
titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)
titanic =titanic[-3]
titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)
titanic = titanic[-7]
NewTitanic = cbind(Survived, Class=titanic$Class, Age=titanic$Age,
NumbSibOrSpsAbd =titanic$NumbSibOrSpsAbd, NumbParOrChildAbd =titanic$NumbParOrChildAbd,
Fare = titanic$Fare, GenderMale=titanic$GenderMale, EmbarkedAtC = titanic$EmbarkedAtC,
EmbarkedAtQ = titanic$EmbarkedAtQ)
n = length(titanic$Survived)
n1=floor(n*(0.6))
n2 = n - n1
train = sample(1:n, n1)
Logistic Regression Output
m2 = glm(Survived~., family = binomial, data.frame(NewTitanic))
summary(m2)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5726 -0.6846 -0.4058 0.6456 2.5376
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.534836 0.486022 9.331 < 2e-16 ***
## Class -1.009327 0.133464 -7.563 3.95e-14 ***
## Age -0.037687 0.006634 -5.681 1.34e-08 ***
## NumbSibOrSpsAbd -0.348021 0.108435 -3.209 0.00133 **
## NumbParOrChildAbd 0.049846 0.104193 0.478 0.63236
## Fare 0.000463 0.001934 0.239 0.81084
## GenderMale -2.608938 0.179304 -14.550 < 2e-16 ***
## EmbarkedAtC 0.679066 0.211566 3.210 0.00133 **
## EmbarkedAtQ -0.767411 0.406290 -1.889 0.05892 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 954.75 on 1034 degrees of freedom
## AIC: 972.75
##
## Number of Fisher Scoring iterations: 5
Distribution of probabilities
NewTitanic = NewTitanic[,-1]
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]
ptest = predict(m2, newdata = data.frame(xnew), type = "response")
hist(ptest)

Coding 1 (Survived) if probability is 0.4 or higher than 0.4
g=floor(ptest+0.4) ## prob .4 or higher
t=table(ynew,g)
t
## g
## ynew 0 1
## 0 235 24
## 1 60 99
error=(t[1,2]+t[2,1])/n2
## % of Error
error
## [1] 0.2009569
Showing first 10 prediction
b=cbind(ptest,ynew)
b[1:10,]
## ptest ynew
## 1 0.9262486 1
## 2 0.9634547 0
## 3 0.3651202 0
## 4 0.3969200 0
## 5 0.9230614 1
## 6 0.6524154 1
## 7 0.9386056 1
## 8 0.9265782 1
## 9 0.5860399 1
## 10 0.9601771 1
b1=b[order(ptest,decreasing=TRUE),]
b1[1:10,]
## ptest ynew
## 41 0.9693150 1
## 35 0.9685112 1
## 2 0.9634547 0
## 10 0.9601771 1
## 96 0.9578213 1
## 81 0.9548405 1
## 91 0.9518332 1
## 75 0.9513740 1
## 23 0.9497901 1
## 165 0.9492367 1
xbar=mean(ynew)
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
The grapgh of the regression
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")
