LOAD DATA INTO ENVIRONMENT
# Read the loan records from the CSV file (path relative to the working
# directory) and preview the first six rows.
bankloan <- read.csv(file = "BANK LOAN.csv")
head(bankloan, n = 6L)
## SN AGE EMPLOY ADDRESS DEBTINC CREDDEBT OTHDEBT DEFAULTER
## 1 1 3 17 12 9.3 11.36 5.01 1
## 2 2 1 10 6 17.3 1.36 4.00 0
## 3 3 2 15 14 5.5 0.86 2.17 0
## 4 4 3 15 14 2.9 2.66 0.82 0
## 5 5 1 2 0 17.3 1.79 3.06 1
## 6 6 3 5 5 10.2 0.39 2.16 0
# Show each column's type and its first few values.
str(bankloan)
## 'data.frame': 700 obs. of 8 variables:
## $ SN : int 1 2 3 4 5 6 7 8 9 10 ...
## $ AGE : int 3 1 2 3 1 3 2 3 1 2 ...
## $ EMPLOY : int 17 10 15 15 2 5 20 12 3 0 ...
## $ ADDRESS : int 12 6 14 14 0 5 9 11 4 13 ...
## $ DEBTINC : num 9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
## $ CREDDEBT : num 11.36 1.36 0.86 2.66 1.79 ...
## $ OTHDEBT : num 5.01 4 2.17 0.82 3.06 ...
## $ DEFAULTER: int 1 0 0 0 1 0 0 0 1 0 ...
TIDY UP THE DATA
# AGE and DEFAULTER are read in as integer codes; convert both to factors so
# the model treats them as categorical.
bankloan[c("AGE", "DEFAULTER")] <- lapply(
  bankloan[c("AGE", "DEFAULTER")],
  as.factor
)
# SN appears to be just a row serial number, so drop it before modelling.
bankloan[["SN"]] <- NULL
str(bankloan)
## 'data.frame': 700 obs. of 7 variables:
## $ AGE : Factor w/ 3 levels "1","2","3": 3 1 2 3 1 3 2 3 1 2 ...
## $ EMPLOY : int 17 10 15 15 2 5 20 12 3 0 ...
## $ ADDRESS : int 12 6 14 14 0 5 9 11 4 13 ...
## $ DEBTINC : num 9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
## $ CREDDEBT : num 11.36 1.36 0.86 2.66 1.79 ...
## $ OTHDEBT : num 5.01 4 2.17 0.82 3.06 ...
## $ DEFAULTER: Factor w/ 2 levels "0","1": 2 1 1 1 2 1 1 1 2 1 ...
GENERATE FULL MODEL
# Fit a binomial-family GLM of DEFAULTER on all six predictors, then print
# the coefficient table and fit statistics.
riskmodel <- glm(
  DEFAULTER ~ AGE + EMPLOY + ADDRESS + DEBTINC + CREDDEBT + OTHDEBT,
  family = binomial,
  data = bankloan
)
summary(riskmodel)
##
## Call:
## glm(formula = DEFAULTER ~ AGE + EMPLOY + ADDRESS + DEBTINC +
## CREDDEBT + OTHDEBT, family = binomial, data = bankloan)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3495 -0.6601 -0.2974 0.2509 2.8583
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.78821 0.26407 -2.985 0.00284 **
## AGE2 0.25202 0.26651 0.946 0.34433
## AGE3 0.62707 0.36056 1.739 0.08201 .
## EMPLOY -0.26172 0.03188 -8.211 < 2e-16 ***
## ADDRESS -0.09964 0.02234 -4.459 8.22e-06 ***
## DEBTINC 0.08506 0.02212 3.845 0.00012 ***
## CREDDEBT 0.56336 0.08877 6.347 2.20e-10 ***
## OTHDEBT 0.02315 0.05709 0.405 0.68517
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 804.36 on 699 degrees of freedom
## Residual deviance: 553.41 on 692 degrees of freedom
## AIC: 569.41
##
## Number of Fisher Scoring iterations: 6
ANALYSIS OF DEVIANCE - NULL VS FULL MODEL
# Intercept-only baseline model; the chi-square test on the deviance
# difference checks whether the predictors jointly improve on it.
null <- glm(
  DEFAULTER ~ 1,
  family = binomial,
  data = bankloan
)
anova(null, riskmodel, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: DEFAULTER ~ 1
## Model 2: DEFAULTER ~ AGE + EMPLOY + ADDRESS + DEBTINC + CREDDEBT + OTHDEBT
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 699 804.36
## 2 692 553.41 7 250.96 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
APPEND PREDICTION PROBABILITY VECTOR TO DATAFRAME
# Store the fitted probabilities, rounded to two decimals, as a new column
# and preview the result.
# NOTE(review): predprob is reused later for the ROC curve and the 0.3
# cutoff, so rounding here restricts the ROC to 0.01-wide cutoffs and can
# move borderline cases across the threshold; consider rounding for display
# only and re-running the downstream steps.
bankloan$predprob <- round(fitted(riskmodel), digits = 2)
head(bankloan, n = 6L)
## AGE EMPLOY ADDRESS DEBTINC CREDDEBT OTHDEBT DEFAULTER predprob
## 1 3 17 12 9.3 11.36 5.01 1 0.82
## 2 1 10 6 17.3 1.36 4.00 0 0.16
## 3 2 15 14 5.5 0.86 2.17 0 0.01
## 4 3 15 14 2.9 2.66 0.82 0 0.02
## 5 1 2 0 17.3 1.79 3.06 1 0.78
## 6 3 5 5 10.2 0.39 2.16 0 0.30
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(gplots)
# Build the ROCR prediction object from the predicted probabilities and the
# observed labels, derive the true-positive vs false-positive rate at each
# cutoff, and print the resulting performance object.
ROCRPRED <- prediction(
  predictions = bankloan$predprob,
  labels = bankloan$DEFAULTER
) # note we are using prediction() from ROCR
ROCEPERF <- performance(
  prediction.obj = ROCRPRED,
  measure = "tpr",
  x.measure = "fpr"
)
ROCEPERF
## An object of class "performance"
## Slot "x.name":
## [1] "False positive rate"
##
## Slot "y.name":
## [1] "True positive rate"
##
## Slot "alpha.name":
## [1] "Cutoff"
##
## Slot "x.values":
## [[1]]
## [1] 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## [6] 0.000000000 0.000000000 0.001934236 0.001934236 0.001934236
## [11] 0.001934236 0.001934236 0.001934236 0.003868472 0.003868472
## [16] 0.003868472 0.005802708 0.007736944 0.007736944 0.007736944
## [21] 0.009671180 0.009671180 0.011605416 0.013539652 0.013539652
## [26] 0.013539652 0.015473888 0.021276596 0.021276596 0.025145068
## [31] 0.027079304 0.029013540 0.034816248 0.034816248 0.034816248
## [36] 0.036750484 0.036750484 0.038684720 0.040618956 0.042553191
## [41] 0.044487427 0.046421663 0.052224371 0.056092843 0.063829787
## [46] 0.073500967 0.083172147 0.087040619 0.088974855 0.092843327
## [51] 0.094777563 0.100580271 0.110251451 0.116054159 0.117988395
## [56] 0.123791103 0.131528046 0.141199226 0.154738878 0.160541586
## [61] 0.168278530 0.185686654 0.199226306 0.203094778 0.206963250
## [66] 0.206963250 0.212765957 0.218568665 0.226305609 0.235976789
## [71] 0.253384913 0.264990329 0.286266925 0.294003868 0.307543520
## [76] 0.324951644 0.342359768 0.359767892 0.371373308 0.379110251
## [81] 0.394584139 0.417794971 0.433268859 0.454545455 0.479690522
## [86] 0.504835590 0.533849130 0.545454545 0.570599613 0.605415861
## [91] 0.636363636 0.671179884 0.723404255 0.756286267 0.831721470
## [96] 0.916827853 1.000000000
##
##
## Slot "y.values":
## [[1]]
## [1] 0.000000000 0.005464481 0.016393443 0.021857923 0.043715847
## [6] 0.049180328 0.054644809 0.065573770 0.081967213 0.087431694
## [11] 0.098360656 0.103825137 0.109289617 0.109289617 0.114754098
## [16] 0.120218579 0.125683060 0.142076503 0.153005464 0.163934426
## [21] 0.180327869 0.202185792 0.207650273 0.229508197 0.245901639
## [26] 0.262295082 0.278688525 0.295081967 0.306010929 0.316939891
## [31] 0.333333333 0.338797814 0.344262295 0.355191257 0.360655738
## [36] 0.366120219 0.377049180 0.387978142 0.404371585 0.437158470
## [41] 0.442622951 0.442622951 0.480874317 0.486338798 0.491803279
## [46] 0.519125683 0.519125683 0.519125683 0.530054645 0.540983607
## [51] 0.557377049 0.568306011 0.590163934 0.595628415 0.606557377
## [56] 0.617486339 0.633879781 0.655737705 0.666666667 0.683060109
## [61] 0.688524590 0.699453552 0.710382514 0.726775956 0.743169399
## [66] 0.754098361 0.765027322 0.770491803 0.792349727 0.808743169
## [71] 0.814207650 0.825136612 0.825136612 0.836065574 0.846994536
## [76] 0.863387978 0.868852459 0.879781421 0.896174863 0.907103825
## [81] 0.923497268 0.923497268 0.923497268 0.923497268 0.923497268
## [86] 0.934426230 0.939890710 0.945355191 0.956284153 0.961748634
## [91] 0.972677596 0.978142077 0.983606557 0.994535519 1.000000000
## [96] 1.000000000 1.000000000
##
##
## Slot "alpha.values":
## [[1]]
## [1] Inf 1.00 0.99 0.98 0.97 0.96 0.95 0.94 0.93 0.91 0.90 0.88 0.87 0.86
## [15] 0.85 0.84 0.82 0.81 0.80 0.78 0.77 0.76 0.75 0.74 0.73 0.72 0.71 0.70
## [29] 0.69 0.68 0.67 0.66 0.65 0.64 0.63 0.62 0.61 0.60 0.59 0.57 0.56 0.55
## [43] 0.54 0.53 0.52 0.51 0.50 0.49 0.48 0.47 0.46 0.45 0.44 0.43 0.42 0.41
## [57] 0.40 0.39 0.38 0.37 0.36 0.35 0.34 0.33 0.32 0.31 0.30 0.29 0.28 0.27
## [71] 0.26 0.25 0.24 0.23 0.22 0.21 0.20 0.19 0.18 0.17 0.16 0.15 0.14 0.13
## [85] 0.12 0.11 0.10 0.09 0.08 0.07 0.06 0.05 0.04 0.03 0.02 0.01 0.00
GENERATE ROC CURVE PLOT
# Plot the ROC curve, coloured by cutoff, and label the cutoffs
# 0.1, 0.2, ..., 1.0 along it. TRUE is spelled out because T can be
# reassigned, and the upper bound of the sequence is given explicitly rather
# than relying on seq()'s default `to = 1`.
plot(
  ROCEPERF,
  colorize = TRUE,
  print.cutoffs.at = seq(0.1, 1, by = 0.1)
)
CLASSIFICATION OF PREDICTION OUTCOME
# Flag a case as a predicted defaulter (1) when its predicted probability is
# strictly greater than the 0.3 cutoff, otherwise 0; then preview the column.
bankloan$pred_defaulters <- with(
  bankloan,
  ifelse(predprob > 0.3, 1, 0)
)
head(bankloan, n = 6L)
## AGE EMPLOY ADDRESS DEBTINC CREDDEBT OTHDEBT DEFAULTER predprob
## 1 3 17 12 9.3 11.36 5.01 1 0.82
## 2 1 10 6 17.3 1.36 4.00 0 0.16
## 3 2 15 14 5.5 0.86 2.17 0 0.01
## 4 3 15 14 2.9 2.66 0.82 0 0.02
## 5 1 2 0 17.3 1.79 3.06 1 0.78
## 6 3 5 5 10.2 0.39 2.16 0 0.30
## pred_defaulters
## 1 1
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
BUILD CONFUSION MATRIX
# Cross-tabulate predicted class (rows) against observed DEFAULTER (columns).
# The predictions are coerced to a factor with both levels so the table stays
# 2 x 2 even if one class is never predicted; metric functions that expect a
# square table would otherwise fail.
cMatrix <- table(
  factor(bankloan$pred_defaulters, levels = c(0, 1)),
  bankloan$DEFAULTER
)
cMatrix
##
## 0 1
## 0 410 45
## 1 107 138
CALCULATE SENSITIVITY & SPECIFICITY
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
# Treat DEFAULTER "1" as the positive class, matching the true-positive rate
# used for the ROC curve. Without these arguments caret takes the first level
# ("0") as positive, which swaps the two metrics.
# sensitivity = 138 / (45 + 138); specificity = 410 / (410 + 107).
sensitivity(cMatrix, positive = "1")
## [1] 0.7540984
specificity(cMatrix, negative = "0")
## [1] 0.7930368