Loan Data Analysis

Status Credit.Grade Amount Age Borrower.Rate Debt.To.Income.Ratio
Current C 5000 4 0.1500 0.040
Current HR 1900 6 0.2650 0.020
Current HR 1000 3 0.1500 0.020
Current HR 1000 5 0.2900 0.020
Current AA 2550 8 0.0795 0.033
Current NC 1500 2 0.2600 0.030

Status shows which loan is bad or good. Current is good, late and bad is bad

loandata$Status = recode(loandata$Status, "'Current'=1; else=0" )
loandata$Status = as.numeric(levels(loandata$Status)[loandata$Status])
##Changing categorical value into numerical value

n = length(loandata$Status)

n1=floor(n*(0.6))

n2 = n - n1

train = sample(1:n, n1)

Xdel <- model.matrix(Status~., data = loandata)[,-1]
Xdel[1:3,]
##   Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1              0             0             1             0             0
## 2              0             0             0             0             0
## 3              0             0             0             0             0
##   Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1              0              0   5000   4         0.150
## 2              1              0   1900   6         0.265
## 3              1              0   1000   3         0.150
##   Debt.To.Income.Ratio
## 1                 0.04
## 2                 0.02
## 3                 0.02
Xtrain <- Xdel[train, ]
xnew <- Xdel[-train,]
ytrain <- loandata$Status[train]
ynew <- loandata$Status[-train]
m1 = glm(Status~., family = binomial, data = data.frame(Status=ytrain, Xtrain))
m1
## 
## Call:  glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     Xtrain))
## 
## Coefficients:
##          (Intercept)        Credit.GradeAA         Credit.GradeB  
##            7.385e+00             7.563e-01            -2.407e-02  
##        Credit.GradeC         Credit.GradeD         Credit.GradeE  
##            3.251e-01            -2.363e-01            -5.557e-01  
##       Credit.GradeHR        Credit.GradeNC                Amount  
##           -1.130e+00            -1.804e+00            -5.762e-05  
##                  Age         Borrower.Rate  Debt.To.Income.Ratio  
##           -3.558e-01            -1.037e+01             1.998e-01  
## 
## Degrees of Freedom: 3365 Total (i.e. Null);  3354 Residual
## Null Deviance:       1826 
## Residual Deviance: 1452  AIC: 1476
summary(m1)
## 
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     Xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1719   0.1419   0.2442   0.3983   2.0635  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           7.385e+00  5.239e-01  14.096  < 2e-16 ***
## Credit.GradeAA        7.563e-01  6.857e-01   1.103  0.27006    
## Credit.GradeB        -2.407e-02  4.607e-01  -0.052  0.95834    
## Credit.GradeC         3.251e-01  4.638e-01   0.701  0.48335    
## Credit.GradeD        -2.363e-01  4.442e-01  -0.532  0.59479    
## Credit.GradeE        -5.557e-01  4.643e-01  -1.197  0.23136    
## Credit.GradeHR       -1.130e+00  4.755e-01  -2.376  0.01748 *  
## Credit.GradeNC       -1.804e+00  6.706e-01  -2.690  0.00713 ** 
## Amount               -5.762e-05  2.062e-05  -2.794  0.00520 ** 
## Age                  -3.558e-01  2.828e-02 -12.582  < 2e-16 ***
## Borrower.Rate        -1.037e+01  1.602e+00  -6.472 9.66e-11 ***
## Debt.To.Income.Ratio  1.998e-01  3.707e-01   0.539  0.58988    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1826.0  on 3365  degrees of freedom
## Residual deviance: 1451.6  on 3354  degrees of freedom
## AIC: 1475.6
## 
## Number of Fisher Scoring iterations: 17

First 10 predictions

ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]
##    ynew     ptest
## 3     1 0.9728342
## 4     1 0.8045158
## 5     1 0.9870006
## 8     1 0.8797900
## 9     1 0.9988177
## 11    1 0.9956890
## 12    1 0.6972082
## 13    1 0.9826625
## 25    1 0.9447024
## 27    1 0.9918913

Making logistic regression graph

bb=cbind(ptest,ynew)

bb1=bb[order(ptest,decreasing=TRUE),]


xbar=mean(ynew)
xbar
## [1] 0.9260579
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

aaa=cbind(bb1[,1],bb1[,2],ay,ax)


plot(axis,ay,xlab="number of cases",ylab="number of successes",
     main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

Titanic Analysis

## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))

titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)

titanic =titanic[-3]

titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)

titanic = titanic[-7]

NewTitanic = cbind(Survived, Class=titanic$Class, Age=titanic$Age,
                   NumbSibOrSpsAbd =titanic$NumbSibOrSpsAbd, NumbParOrChildAbd =titanic$NumbParOrChildAbd,
                   Fare = titanic$Fare, GenderMale=titanic$GenderMale, EmbarkedAtC = titanic$EmbarkedAtC,
                   EmbarkedAtQ = titanic$EmbarkedAtQ)

n = length(titanic$Survived)

n1=floor(n*(0.6))

n2 = n - n1

train = sample(1:n, n1)

Logistic Regression Output

m2 = glm(Survived~., family = binomial, data.frame(NewTitanic))
summary(m2)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5726  -0.6846  -0.4058   0.6456   2.5376  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.534836   0.486022   9.331  < 2e-16 ***
## Class             -1.009327   0.133464  -7.563 3.95e-14 ***
## Age               -0.037687   0.006634  -5.681 1.34e-08 ***
## NumbSibOrSpsAbd   -0.348021   0.108435  -3.209  0.00133 ** 
## NumbParOrChildAbd  0.049846   0.104193   0.478  0.63236    
## Fare               0.000463   0.001934   0.239  0.81084    
## GenderMale        -2.608938   0.179304 -14.550  < 2e-16 ***
## EmbarkedAtC        0.679066   0.211566   3.210  0.00133 ** 
## EmbarkedAtQ       -0.767411   0.406290  -1.889  0.05892 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  954.75  on 1034  degrees of freedom
## AIC: 972.75
## 
## Number of Fisher Scoring iterations: 5

Distribution of probabilities

NewTitanic = NewTitanic[,-1]
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]

ptest = predict(m2, newdata = data.frame(xnew), type = "response")
hist(ptest)

Coding 1 (Survived) if probability is 0.4 or higher than 0.4

g=floor(ptest+0.4) ## prob .4 or higher
t=table(ynew,g)
t
##     g
## ynew   0   1
##    0 235  24
##    1  60  99
error=(t[1,2]+t[2,1])/n2
## % of Error
error
## [1] 0.2009569

Showing first 10 prediction

b=cbind(ptest,ynew)
b[1:10,]
##        ptest ynew
## 1  0.9262486    1
## 2  0.9634547    0
## 3  0.3651202    0
## 4  0.3969200    0
## 5  0.9230614    1
## 6  0.6524154    1
## 7  0.9386056    1
## 8  0.9265782    1
## 9  0.5860399    1
## 10 0.9601771    1
b1=b[order(ptest,decreasing=TRUE),]
b1[1:10,]
##         ptest ynew
## 41  0.9693150    1
## 35  0.9685112    1
## 2   0.9634547    0
## 10  0.9601771    1
## 96  0.9578213    1
## 81  0.9548405    1
## 91  0.9518332    1
## 75  0.9513740    1
## 23  0.9497901    1
## 165 0.9492367    1
xbar=mean(ynew)

axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

The grapgh of the regression

plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")