LOAD DATA INTO ENVIRONMENT

# Read the loan records from "BANK LOAN.csv" (path is relative to the working
# directory) into a data frame, then preview its first six rows.
bankloan <- read.csv(file = "BANK LOAN.csv")
head(bankloan, n = 6L)
##   SN AGE EMPLOY ADDRESS DEBTINC CREDDEBT OTHDEBT DEFAULTER
## 1  1   3     17      12     9.3    11.36    5.01         1
## 2  2   1     10       6    17.3     1.36    4.00         0
## 3  3   2     15      14     5.5     0.86    2.17         0
## 4  4   3     15      14     2.9     2.66    0.82         0
## 5  5   1      2       0    17.3     1.79    3.06         1
## 6  6   3      5       5    10.2     0.39    2.16         0
# Show the storage type of every column in the data frame as loaded.
str(object = bankloan)
## 'data.frame':    700 obs. of  8 variables:
##  $ SN       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ AGE      : int  3 1 2 3 1 3 2 3 1 2 ...
##  $ EMPLOY   : int  17 10 15 15 2 5 20 12 3 0 ...
##  $ ADDRESS  : int  12 6 14 14 0 5 9 11 4 13 ...
##  $ DEBTINC  : num  9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
##  $ CREDDEBT : num  11.36 1.36 0.86 2.66 1.79 ...
##  $ OTHDEBT  : num  5.01 4 2.17 0.82 3.06 ...
##  $ DEFAULTER: int  1 0 0 0 1 0 0 0 1 0 ...

TIDY UP THE DATA

# AGE and DEFAULTER are stored as integer codes; convert both to factors so
# they are handled as categorical variables in the steps that follow.
bankloan[c("AGE", "DEFAULTER")] <- lapply(
  bankloan[c("AGE", "DEFAULTER")],
  as.factor
)
# SN is only a serial number, so remove it from the data frame.
bankloan[["SN"]] <- NULL
# Confirm the new column types and that SN is gone.
str(object = bankloan)
## 'data.frame':    700 obs. of  7 variables:
##  $ AGE      : Factor w/ 3 levels "1","2","3": 3 1 2 3 1 3 2 3 1 2 ...
##  $ EMPLOY   : int  17 10 15 15 2 5 20 12 3 0 ...
##  $ ADDRESS  : int  12 6 14 14 0 5 9 11 4 13 ...
##  $ DEBTINC  : num  9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
##  $ CREDDEBT : num  11.36 1.36 0.86 2.66 1.79 ...
##  $ OTHDEBT  : num  5.01 4 2.17 0.82 3.06 ...
##  $ DEFAULTER: Factor w/ 2 levels "0","1": 2 1 1 1 2 1 1 1 2 1 ...

GENERATE FULL MODEL

# Fit a binomial GLM (logistic regression) of DEFAULTER on every remaining
# column of the data frame.
riskmodel <- glm(
  DEFAULTER ~ AGE + EMPLOY + ADDRESS + DEBTINC + CREDDEBT + OTHDEBT,
  family = binomial,
  data = bankloan
)
# Coefficient table, deviances and AIC for the fitted model.
summary(riskmodel)
## 
## Call:
## glm(formula = DEFAULTER ~ AGE + EMPLOY + ADDRESS + DEBTINC + 
##     CREDDEBT + OTHDEBT, family = binomial, data = bankloan)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3495  -0.6601  -0.2974   0.2509   2.8583  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.78821    0.26407  -2.985  0.00284 ** 
## AGE2         0.25202    0.26651   0.946  0.34433    
## AGE3         0.62707    0.36056   1.739  0.08201 .  
## EMPLOY      -0.26172    0.03188  -8.211  < 2e-16 ***
## ADDRESS     -0.09964    0.02234  -4.459 8.22e-06 ***
## DEBTINC      0.08506    0.02212   3.845  0.00012 ***
## CREDDEBT     0.56336    0.08877   6.347 2.20e-10 ***
## OTHDEBT      0.02315    0.05709   0.405  0.68517    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 804.36  on 699  degrees of freedom
## Residual deviance: 553.41  on 692  degrees of freedom
## AIC: 569.41
## 
## Number of Fisher Scoring iterations: 6

ANALYSIS OF DEVIANCE - NULL VS FULL MODEL (LIKELIHOOD RATIO TEST)

# Intercept-only baseline fitted to the same data as the full model.
null <- glm(
  DEFAULTER ~ 1,
  family = binomial,
  data = bankloan
)
# Chi-square test on the reduction in residual deviance achieved by adding the
# predictors of the full model to the baseline.
anova(null, riskmodel, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: DEFAULTER ~ 1
## Model 2: DEFAULTER ~ AGE + EMPLOY + ADDRESS + DEBTINC + CREDDEBT + OTHDEBT
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       699     804.36                          
## 2       692     553.41  7   250.96 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

APPEND PREDICTION PROBABILITY VECTOR TO DATAFRAME

# Store the model's fitted probabilities for the training rows, rounded to two
# decimal places, as a new column.
# NOTE(review): every later step that reads `predprob` sees the rounded value,
# not the full-precision probability -- confirm that this loss of resolution is
# intended.
bankloan$predprob <- round(fitted(riskmodel), digits = 2)
head(bankloan, n = 6L)
##   AGE EMPLOY ADDRESS DEBTINC CREDDEBT OTHDEBT DEFAULTER predprob
## 1   3     17      12     9.3    11.36    5.01         1     0.82
## 2   1     10       6    17.3     1.36    4.00         0     0.16
## 3   2     15      14     5.5     0.86    2.17         0     0.01
## 4   3     15      14     2.9     2.66    0.82         0     0.02
## 5   1      2       0    17.3     1.79    3.06         1     0.78
## 6   3      5       5    10.2     0.39    2.16         0     0.30

ROC CURVE - TRUE POSITIVE RATE VS FALSE POSITIVE RATE

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(gplots)
# Pair the stored probabilities with the observed classes. prediction() here is
# the ROCR function.
ROCRPRED <- prediction(
  predictions = bankloan$predprob,
  labels = bankloan$DEFAULTER
)
# True positive rate (y) against false positive rate (x) at each cutoff.
ROCEPERF <- performance(ROCRPRED, measure = "tpr", x.measure = "fpr")
# Print the performance object: x values, y values and the cutoffs.
ROCEPERF
## An object of class "performance"
## Slot "x.name":
## [1] "False positive rate"
## 
## Slot "y.name":
## [1] "True positive rate"
## 
## Slot "alpha.name":
## [1] "Cutoff"
## 
## Slot "x.values":
## [[1]]
##  [1] 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
##  [6] 0.000000000 0.000000000 0.001934236 0.001934236 0.001934236
## [11] 0.001934236 0.001934236 0.001934236 0.003868472 0.003868472
## [16] 0.003868472 0.005802708 0.007736944 0.007736944 0.007736944
## [21] 0.009671180 0.009671180 0.011605416 0.013539652 0.013539652
## [26] 0.013539652 0.015473888 0.021276596 0.021276596 0.025145068
## [31] 0.027079304 0.029013540 0.034816248 0.034816248 0.034816248
## [36] 0.036750484 0.036750484 0.038684720 0.040618956 0.042553191
## [41] 0.044487427 0.046421663 0.052224371 0.056092843 0.063829787
## [46] 0.073500967 0.083172147 0.087040619 0.088974855 0.092843327
## [51] 0.094777563 0.100580271 0.110251451 0.116054159 0.117988395
## [56] 0.123791103 0.131528046 0.141199226 0.154738878 0.160541586
## [61] 0.168278530 0.185686654 0.199226306 0.203094778 0.206963250
## [66] 0.206963250 0.212765957 0.218568665 0.226305609 0.235976789
## [71] 0.253384913 0.264990329 0.286266925 0.294003868 0.307543520
## [76] 0.324951644 0.342359768 0.359767892 0.371373308 0.379110251
## [81] 0.394584139 0.417794971 0.433268859 0.454545455 0.479690522
## [86] 0.504835590 0.533849130 0.545454545 0.570599613 0.605415861
## [91] 0.636363636 0.671179884 0.723404255 0.756286267 0.831721470
## [96] 0.916827853 1.000000000
## 
## 
## Slot "y.values":
## [[1]]
##  [1] 0.000000000 0.005464481 0.016393443 0.021857923 0.043715847
##  [6] 0.049180328 0.054644809 0.065573770 0.081967213 0.087431694
## [11] 0.098360656 0.103825137 0.109289617 0.109289617 0.114754098
## [16] 0.120218579 0.125683060 0.142076503 0.153005464 0.163934426
## [21] 0.180327869 0.202185792 0.207650273 0.229508197 0.245901639
## [26] 0.262295082 0.278688525 0.295081967 0.306010929 0.316939891
## [31] 0.333333333 0.338797814 0.344262295 0.355191257 0.360655738
## [36] 0.366120219 0.377049180 0.387978142 0.404371585 0.437158470
## [41] 0.442622951 0.442622951 0.480874317 0.486338798 0.491803279
## [46] 0.519125683 0.519125683 0.519125683 0.530054645 0.540983607
## [51] 0.557377049 0.568306011 0.590163934 0.595628415 0.606557377
## [56] 0.617486339 0.633879781 0.655737705 0.666666667 0.683060109
## [61] 0.688524590 0.699453552 0.710382514 0.726775956 0.743169399
## [66] 0.754098361 0.765027322 0.770491803 0.792349727 0.808743169
## [71] 0.814207650 0.825136612 0.825136612 0.836065574 0.846994536
## [76] 0.863387978 0.868852459 0.879781421 0.896174863 0.907103825
## [81] 0.923497268 0.923497268 0.923497268 0.923497268 0.923497268
## [86] 0.934426230 0.939890710 0.945355191 0.956284153 0.961748634
## [91] 0.972677596 0.978142077 0.983606557 0.994535519 1.000000000
## [96] 1.000000000 1.000000000
## 
## 
## Slot "alpha.values":
## [[1]]
##  [1]  Inf 1.00 0.99 0.98 0.97 0.96 0.95 0.94 0.93 0.91 0.90 0.88 0.87 0.86
## [15] 0.85 0.84 0.82 0.81 0.80 0.78 0.77 0.76 0.75 0.74 0.73 0.72 0.71 0.70
## [29] 0.69 0.68 0.67 0.66 0.65 0.64 0.63 0.62 0.61 0.60 0.59 0.57 0.56 0.55
## [43] 0.54 0.53 0.52 0.51 0.50 0.49 0.48 0.47 0.46 0.45 0.44 0.43 0.42 0.41
## [57] 0.40 0.39 0.38 0.37 0.36 0.35 0.34 0.33 0.32 0.31 0.30 0.29 0.28 0.27
## [71] 0.26 0.25 0.24 0.23 0.22 0.21 0.20 0.19 0.18 0.17 0.16 0.15 0.14 0.13
## [85] 0.12 0.11 0.10 0.09 0.08 0.07 0.06 0.05 0.04 0.03 0.02 0.01 0.00

PLOT THE ROC CURVE WITH CUTOFF LABELS

# Draw the ROC curve, colouring the line by cutoff and printing the cutoffs
# 0.1, 0.2, ..., 1.0 along it.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The upper bound of the sequence is written explicitly instead of
# relying on seq()'s default `to = 1`; the resulting cutoffs are unchanged.
plot(ROCEPERF, colorize = TRUE, print.cutoffs.at = seq(0.1, 1, by = 0.1))

CLASSIFICATION OF PREDICTION OUTCOME

# Label a row as a predicted defaulter (1) when its stored probability is
# strictly greater than 0.3, and as a non-defaulter (0) otherwise.
bankloan$pred_defaulters <- ifelse(
  test = bankloan$predprob > 0.3,
  yes = 1,
  no = 0
)
head(bankloan, n = 6L)
##   AGE EMPLOY ADDRESS DEBTINC CREDDEBT OTHDEBT DEFAULTER predprob
## 1   3     17      12     9.3    11.36    5.01         1     0.82
## 2   1     10       6    17.3     1.36    4.00         0     0.16
## 3   2     15      14     5.5     0.86    2.17         0     0.01
## 4   3     15      14     2.9     2.66    0.82         0     0.02
## 5   1      2       0    17.3     1.79    3.06         1     0.78
## 6   3      5       5    10.2     0.39    2.16         0     0.30
##   pred_defaulters
## 1               1
## 2               0
## 3               0
## 4               0
## 5               1
## 6               0

BUILD CONFUSION MATRIX

# Cross-tabulate the predicted class (rows) against the observed class
# (columns). The two dimensions are named so the printed matrix states which
# axis is which; the counts and the row/column levels are unchanged.
cMatrix <- table(
  Predicted = bankloan$pred_defaulters,
  Actual = bankloan$DEFAULTER
)
cMatrix
##          Actual
## Predicted   0   1
##         0 410  45
##         1 107 138

CALCULATE SENSITIVITY & SPECIFICITY

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
# Defaulters (level "1") are the event of interest, so they are named as the
# positive class explicitly. Left to its default, caret treats the first row
# level ("0", non-defaulter) as positive, which reports these two rates under
# swapped labels.
# Sensitivity: share of actual defaulters predicted as defaulters, 138 / 183.
sensitivity(cMatrix, positive = "1")
## [1] 0.7540984
# Specificity: share of actual non-defaulters predicted as such, 410 / 517.
specificity(cMatrix, negative = "0")
## [1] 0.7930368