Problem Definition
Predict Loan default

Setup

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra) 
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(Deducer)
## Loading required package: JGR
## Loading required package: rJava
## Loading required package: JavaGD
## Loading required package: iplots
## 
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Note Non-JGR console detected:
##  Deducer is best used from within JGR (http://jgr.markushelbig.org/).
##  To Bring up GUI dialogs, type deducer().
library(caret)
## Loading required package: lattice
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

Functions

Dataset

setwd("D:/Welingkar/Competitions/Anaholix/Round2/Q2")
dfrModel <- read.csv("./Train_Bank_Default.csv", header=T, stringsAsFactors=F)
head(dfrModel)
##   age ed employ address income debtinc  creddebt  othdebt default
## 1  41  3     17      12    176     9.3 11.359392 5.008608       1
## 2  27  1     10       6     31    17.3  1.362202 4.000798       0
## 3  40  1     15      14     55     5.5  0.856075 2.168925       0
## 4  41  1     15      14    120     2.9  2.658720 0.821280       0
## 5  24  2      2       0     28    17.3  1.787436 3.056564       1
## 6  41  2      5       5     25    10.2  0.392700 2.157300       0

Observation
Only Numeric data is their so no data changes required

Missing Data

#sum(is.na(dfrModel$Age))
lapply(dfrModel, FUN=detect_na)
## $age
## [1] 0
## 
## $ed
## [1] 0
## 
## $employ
## [1] 0
## 
## $address
## [1] 0
## 
## $income
## [1] 0
## 
## $debtinc
## [1] 0
## 
## $creddebt
## [1] 0
## 
## $othdebt
## [1] 0
## 
## $default
## [1] 0

Observation
1. There are no NA records in dataset.

Outliers Data

#detect_outliers(dfrModel$Age)
lapply(dfrModel, FUN=detect_outliers)
## $age
## integer(0)
## 
## $ed
## [1] 5 5 5 5 5
## 
## $employ
## integer(0)
## 
## $address
## integer(0)
## 
## $income
##  [1] 176 135 145 144 159 220 157 446 242 177 221 166 190 249 234 148 186
## [18] 136 253
## 
## $debtinc
## [1] 41.3
## 
## $creddebt
##  [1] 11.359392  6.048900  7.758900  9.876600  6.226794  9.600480 20.561310
##  [8]  9.593400 14.596200  8.166400  6.565583 15.016680  6.113800  6.935916
## [15]  6.948680  7.817144 16.031470 15.791776  6.111369  7.387380  6.911520
## [22]  7.320000  5.896743  6.588540  5.781564 14.231448  9.308376
## 
## $othdebt
##  [1] 16.66813 13.05121 12.42186 14.45273 12.65933 12.07569 17.20380
##  [8] 12.71401 27.03360 14.71932 15.40539 11.87445 12.95853 23.10422
## [15] 18.26913 20.61587 11.66334 15.14916 18.25738 17.79899 11.89352
## [22] 17.18455 11.72398
## 
## $default
## integer(0)

Observations
1. There are few outliers so we are going with Outliers

Outliers Graph

lapply(dfrModel, FUN=Graph_Boxplot)
## $age

## 
## $ed

## 
## $employ

## 
## $address

## 
## $income

## 
## $debtinc

## 
## $creddebt

## 
## $othdebt

## 
## $default

Observations
There are few outliers.
We will go with Outliers

Train Data

dim(dfrModel)
## [1] 700   9

Observations
There are 700 rows in the main data & 9 Variables

Missing Data

lapply(dfrModel, FUN=detect_na)
## $age
## [1] 0
## 
## $ed
## [1] 0
## 
## $employ
## [1] 0
## 
## $address
## [1] 0
## 
## $income
## [1] 0
## 
## $debtinc
## [1] 0
## 
## $creddebt
## [1] 0
## 
## $othdebt
## [1] 0
## 
## $default
## [1] 0

Observation
There is no data in the dataset with values as NA

Summary

#summary(dfrModel)
lapply(dfrModel, FUN=summary)
## $age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   29.00   34.00   34.86   40.00   56.00 
## 
## $ed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.723   2.000   5.000 
## 
## $employ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   7.000   8.389  12.000  31.000 
## 
## $address
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   7.000   8.279  12.000  34.000 
## 
## $income
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.0    24.0    34.0    45.6    55.0   446.0 
## 
## $debtinc
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.40    5.00    8.60   10.26   14.12   41.30 
## 
## $creddebt
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0117  0.3691  0.8549  1.5536  1.9020 20.5613 
## 
## $othdebt
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.04558  1.04418  1.98757  3.05821  3.92306 27.03360 
## 
## $default
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2614  1.0000  1.0000

Correlation

vctCorr = numeric(0)
for (i in names(dfrModel)){
    cor.result <- cor(as.numeric(dfrModel$default), as.numeric(dfrModel[,i]))
    vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr
##         age          ed      employ     address      income     debtinc 
## -0.13765710  0.11467555 -0.28297839 -0.16445116 -0.07096966  0.38957476 
##    creddebt     othdebt     default 
##  0.24473973  0.14571257  1.00000000

Data For Visualization

dfrGraph <- gather(dfrModel, variable, value, -default)
head(dfrGraph)
##   default variable value
## 1       1      age    41
## 2       0      age    27
## 3       0      age    40
## 4       0      age    41
## 5       1      age    24
## 6       0      age    41

Data Visualization

ggplot(dfrGraph) +
    geom_jitter(aes(value,default, colour=variable)) + 
    facet_wrap(~variable, scales="free_x") +
    labs(title="Relation Of Diabetes [atient] With Other Features")

Observation
As per graph, There is some correlation between Outcome variable and other variables.

Summary

lapply(dfrModel, FUN=summary)
## $age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   29.00   34.00   34.86   40.00   56.00 
## 
## $ed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.723   2.000   5.000 
## 
## $employ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   7.000   8.389  12.000  31.000 
## 
## $address
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   7.000   8.279  12.000  34.000 
## 
## $income
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.0    24.0    34.0    45.6    55.0   446.0 
## 
## $debtinc
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.40    5.00    8.60   10.26   14.12   41.30 
## 
## $creddebt
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0117  0.3691  0.8549  1.5536  1.9020 20.5613 
## 
## $othdebt
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.04558  1.04418  1.98757  3.05821  3.92306 27.03360 
## 
## $default
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2614  1.0000  1.0000

Observation
Mean and Median are nearly equal after doing data imputation whicih help to reduce Outliers.
Data summary is looking good we can continue with logistic model

Find Best Multi Logistic Model
Choose the best logistic model by using step().

stpModel=step(glm(data=dfrModel, formula=default~., family=binomial), trace=0, steps=100)
summary(stpModel)
## 
## Call:
## glm(formula = default ~ age + employ + address + debtinc + creddebt, 
##     family = binomial, data = dfrModel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3555  -0.6521  -0.2949   0.2592   2.9132  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.63128    0.51268  -3.182  0.00146 ** 
## age          0.03256    0.01717   1.896  0.05799 .  
## employ      -0.26076    0.03011  -8.662  < 2e-16 ***
## address     -0.10365    0.02309  -4.490 7.13e-06 ***
## debtinc      0.08926    0.01855   4.813 1.49e-06 ***
## creddebt     0.57265    0.08723   6.565 5.20e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 804.36  on 699  degrees of freedom
## Residual deviance: 553.18  on 694  degrees of freedom
## AIC: 565.18
## 
## Number of Fisher Scoring iterations: 6

Observation
Best results given by default ~ age + employ + address + debtinc + creddebt
The degrees of Freedom for Null Variance is 700-1=699 when only outcome is considered so only one parameter
While for Reidual Deviance is 700-6=694 As no of parameters are 6 for residual deviance.

Make Final Multi Linear Model

# make model
mgmModel <- glm(data=dfrModel, formula=default ~ age + employ + address + debtinc + creddebt, family=binomial(link="logit"))
# print summary
summary(mgmModel)
## 
## Call:
## glm(formula = default ~ age + employ + address + debtinc + creddebt, 
##     family = binomial(link = "logit"), data = dfrModel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3555  -0.6521  -0.2949   0.2592   2.9132  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.63128    0.51268  -3.182  0.00146 ** 
## age          0.03256    0.01717   1.896  0.05799 .  
## employ      -0.26076    0.03011  -8.662  < 2e-16 ***
## address     -0.10365    0.02309  -4.490 7.13e-06 ***
## debtinc      0.08926    0.01855   4.813 1.49e-06 ***
## creddebt     0.57265    0.08723   6.565 5.20e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 804.36  on 699  degrees of freedom
## Residual deviance: 553.18  on 694  degrees of freedom
## AIC: 565.18
## 
## Number of Fisher Scoring iterations: 6
pR2 = 1 - mgmModel$deviance / mgmModel$null.deviance
pR2
## [1] 0.3122816
mgmModel_null <- glm(dfrModel$default~1, family = binomial, data = dfrModel)
pR21= 1- logLik(mgmModel)/logLik(mgmModel_null)
pR21
## 'log Lik.' 0.3122816 (df=6)
library(rcompanion)
nagelkerke(mgmModel)
## $Models
##                                                                                                          
## Model: "glm, default ~ age + employ + address + debtinc + creddebt, binomial(link = \"logit\"), dfrModel"
## Null:  "glm, default ~ 1, binomial(link = \"logit\"), dfrModel"                                          
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                             0.312282
## Cox and Snell (ML)                   0.301514
## Nagelkerke (Cragg and Uhler)         0.441407
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq    p.value
##       -5     -125.59 251.19 3.0562e-52
## 
## $Number.of.observations
##           
## Model: 700
## Null:  700
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
pR2(mgmModel)
##          llh      llhNull           G2     McFadden         r2ML 
## -276.5880367 -402.1821024  251.1881315    0.3122816    0.3015140 
##         r2CU 
##    0.4414066

Confusion Matrix

prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dfrModel$default)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 476  89
##   1  41  94
##                                           
##                Accuracy : 0.8143          
##                  95% CI : (0.7835, 0.8424)
##     No Information Rate : 0.7386          
##     P-Value [Acc > NIR] : 1.482e-06       
##                                           
##                   Kappa : 0.4746          
##  Mcnemar's Test P-Value : 3.753e-05       
##                                           
##             Sensitivity : 0.9207          
##             Specificity : 0.5137          
##          Pos Pred Value : 0.8425          
##          Neg Pred Value : 0.6963          
##              Prevalence : 0.7386          
##          Detection Rate : 0.6800          
##    Detection Prevalence : 0.8071          
##       Balanced Accuracy : 0.7172          
##                                           
##        'Positive' Class : 0               
## 

Observations
Accuracy is good which is around 0.8143

Regression Data

dfrPlot <- mutate(dfrModel, PrdVal=prdVal, POutcome=prdBln)
head(dfrPlot)
##   age ed employ address income debtinc  creddebt  othdebt default
## 1  41  3     17      12    176     9.3 11.359392 5.008608       1
## 2  27  1     10       6     31    17.3  1.362202 4.000798       0
## 3  40  1     15      14     55     5.5  0.856075 2.168925       0
## 4  41  1     15      14    120     2.9  2.658720 0.821280       0
## 5  24  2      2       0     28    17.3  1.787436 3.056564       1
## 6  41  2      5       5     25    10.2  0.392700 2.157300       0
##        PrdVal POutcome
## 1 0.796077056        1
## 2 0.160093796        0
## 3 0.008921696        0
## 4 0.020282165        0
## 5 0.767861564        1
## 6 0.272262072        0

Regression Visulaization

#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=POutcome))  + 
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
    labs(title="Binomial Regression Curve") +
    labs(x="") +
    labs(y="")

ROC Visulaization

#rocplot(logistic.model,diag=TRUE,pred.prob.labels=FALSE,prob.label.digits=3,AUC=TRUE)
rocplot(mgmModel)

Observations
The closer the ROC gets to the optimal point of perfect prediction the closer the AUC gets to 1.
We can see that AUC value is around 0.8582 as well as Accuracy of our model is around 0.8143 which are quite equal so Model is good.

Test Data

setwd("D:/Welingkar/Competitions/Anaholix/Round2/Q2")
dfrTests <- read.csv("./Test_Bank_Default.csv", header=T, stringsAsFactors=F)
head(dfrTests)
##   age ed employ address income debtinc creddebt  othdebt
## 1  36  1     16      13     32    10.9 0.544128 2.943872
## 2  50  1      6      27     21    12.9 1.316574 1.392426
## 3  40  1      9       9     33    17.0 4.880700 0.729300
## 4  31  1      5       7     23     2.0 0.046000 0.414000
## 5  29  1      4       0     24     7.8 0.866736 1.005264
## 6  25  2      1       3     14     9.9 0.232848 1.153152

Missing Data

#sum(is.na(dfrModel$Age))
lapply(dfrTests, FUN=detect_na)
## $age
## [1] 0
## 
## $ed
## [1] 0
## 
## $employ
## [1] 0
## 
## $address
## [1] 0
## 
## $income
## [1] 0
## 
## $debtinc
## [1] 0
## 
## $creddebt
## [1] 0
## 
## $othdebt
## [1] 0

Predict

resVal <- predict(mgmModel, dfrTests, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)

Observations
Accuracy is very good, it is around 0.77

Test Data Confusion Matrix

resVal <- predict(mgmModel, dfrTests, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
prdSur <- as.factor(prdSur)
levels(prdSur) <- c("0", "1")
dfrTests <- mutate(dfrTests, Result=resVal, Prd_Outcome=prdSur)
head(dfrTests)
##   age ed employ address income debtinc creddebt  othdebt      Result
## 1  36  1     16      13     32    10.9 0.544128 2.943872 0.009064256
## 2  50  1      6      27     21    12.9 1.316574 1.392426 0.078631691
## 3  40  1      9       9     33    17.0 4.880700 0.729300 0.668995166
## 4  31  1      5       7     23     2.0 0.046000 0.414000 0.079696639
## 5  29  1      4       0     24     7.8 0.866736 1.005264 0.368739528
## 6  25  2      1       3     14     9.9 0.232848 1.153152 0.408039825
##   Prd_Outcome
## 1           0
## 2           0
## 3           1
## 4           0
## 5           0
## 6           0
write.csv(dfrTests, "Test_Prediction_Q2.CSV")

Summary

There was total data set of 700 records of past bank loan default history of a private bank, It contains their details along with who is defaulted or not.
Data Set is divided in two part one is Train data for building a model while other one is Test data to test the model.