# reading the data
df <- read.csv("AutoFinanaceData.csv")
# attach the data frame
attach(df)
# Number of rows and columns
dim(df)## [1] 28906 21
## [1] "Agmt.No" "ContractStatus" "StartDate" "AGE"
## [5] "NOOFDEPE" "MTHINCTH" "SALDATFR" "TENORYR"
## [9] "DWNPMFR" "PROFBUS" "QUALHSC" "QUAL_PG"
## [13] "SEXCODE" "FULLPDC" "FRICODE" "WASHCODE"
## [17] "Region" "Branch" "DefaulterFlag" "DefaulterType"
## [21] "DATASET"
1. Defaulter Flag
1: Customer has delayed paying at least once
0: Otherwise
1. Gender
SEXCODE = 1 (Male)
SEXCODE = 0 (Female)
2. Age
3. Education
QUALHSC
QUAL_PG
4. Income
Monthly Income in Thousands (MTHINCTH)
Owns a Fridge (FRICODE)
Owns a Washing Machine (WASHCODE)
5. Profession
6. No.of Dependents
7. Region
## 'data.frame': 28906 obs. of 21 variables:
## $ Agmt.No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
## $ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
## $ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
## $ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
## $ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
## $ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
## $ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
## $ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
## $ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
## $ PROFBUS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QUALHSC : int 0 0 0 0 0 0 1 0 0 0 ...
## $ QUAL_PG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SEXCODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FULLPDC : int 1 1 1 1 1 0 0 1 1 1 ...
## $ FRICODE : int 0 1 1 1 1 0 0 0 0 0 ...
## $ WASHCODE : int 0 0 1 1 0 0 0 0 0 0 ...
## $ Region : chr "AP2" "AP2" "AP2" "AP2" ...
## $ Branch : chr "Vizag" "Vizag" "Vizag" "Vizag" ...
## $ DefaulterFlag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DefaulterType : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DATASET : chr " " "BUILD" "BUILD" "BUILD" ...
factor## 'data.frame': 28906 obs. of 21 variables:
## $ Agmt.No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
## $ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
## $ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
## $ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
## $ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
## $ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
## $ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
## $ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
## $ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
## $ PROFBUS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ QUALHSC : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ QUAL_PG : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SEXCODE : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ FULLPDC : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 1 2 2 2 ...
## $ FRICODE : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 1 1 ...
## $ WASHCODE : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 1 1 ...
## $ Region : Factor w/ 8 levels "AP1","AP2","Chennai",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Branch : Factor w/ 14 levels "Bangalore","Chennai",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ DefaulterFlag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DefaulterType : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ DATASET : chr " " "BUILD" "BUILD" "BUILD" ...
## Warning: package 'caTools' was built under R version 4.0.4
glm() on the training data, the output is as follows.Model1 <- glm(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ NOOFDEPE
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region,data = trainingSet, family = binomial())
summary(Model1)##
## Call:
## glm(formula = DefaulterFlag ~ AGE + NOOFDEPE + MTHINCTH + NOOFDEPE +
## SALDATFR + TENORYR + DWNPMFR + PROFBUS + QUALHSC + QUAL_PG +
## SEXCODE + FULLPDC + FRICODE + WASHCODE + Region, family = binomial(),
## data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7470 -1.0215 0.5716 0.7801 2.0874
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.2291732 0.1890284 11.793 < 2e-16 ***
## AGE -0.0144968 0.0016680 -8.691 < 2e-16 ***
## NOOFDEPE 0.0566860 0.0107923 5.252 1.50e-07 ***
## MTHINCTH -0.0004025 0.0035613 -0.113 0.910023
## SALDATFR -0.3833870 0.0420223 -9.123 < 2e-16 ***
## TENORYR 0.7727065 0.0456475 16.928 < 2e-16 ***
## DWNPMFR -1.3074501 0.1274734 -10.257 < 2e-16 ***
## PROFBUS1 0.1966576 0.0487903 4.031 5.56e-05 ***
## QUALHSC1 0.1853120 0.0401652 4.614 3.95e-06 ***
## QUAL_PG1 -0.2990904 0.0787907 -3.796 0.000147 ***
## SEXCODE1 0.2339445 0.0600322 3.897 9.74e-05 ***
## FULLPDC1 -1.2365885 0.0368674 -33.541 < 2e-16 ***
## FRICODE1 -0.1761473 0.0377247 -4.669 3.02e-06 ***
## WASHCODE1 -0.2644245 0.0476814 -5.546 2.93e-08 ***
## RegionAP2 -0.5788864 0.1796029 -3.223 0.001268 **
## RegionChennai -1.4136987 0.1408192 -10.039 < 2e-16 ***
## RegionKA1 -0.6529787 0.1411987 -4.625 3.75e-06 ***
## RegionKE2 -0.5753874 0.1450753 -3.966 7.30e-05 ***
## RegionTN1 -0.8084619 0.1362745 -5.933 2.98e-09 ***
## RegionTN2 -0.6142186 0.1458691 -4.211 2.55e-05 ***
## RegionVellore -0.6570233 0.1595604 -4.118 3.83e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26040 on 21678 degrees of freedom
## Residual deviance: 23025 on 21658 degrees of freedom
## AIC: 23067
##
## Number of Fisher Scoring iterations: 4
glm() on the training data, the expected output should be as follows.Model2 <- glm(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ NOOFDEPE
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region
+ TENORYR*FULLPDC
,data = trainingSet, family = binomial())
summary(Model2)##
## Call:
## glm(formula = DefaulterFlag ~ AGE + NOOFDEPE + MTHINCTH + NOOFDEPE +
## SALDATFR + TENORYR + DWNPMFR + PROFBUS + QUALHSC + QUAL_PG +
## SEXCODE + FULLPDC + FRICODE + WASHCODE + Region + TENORYR *
## FULLPDC, family = binomial(), data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5410 -1.0043 0.5721 0.7620 2.1539
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.4938627 0.1945079 12.821 < 2e-16 ***
## AGE -0.0144308 0.0016723 -8.629 < 2e-16 ***
## NOOFDEPE 0.0569949 0.0108145 5.270 1.36e-07 ***
## MTHINCTH 0.0006478 0.0035839 0.181 0.856559
## SALDATFR -0.3900738 0.0422177 -9.240 < 2e-16 ***
## TENORYR 0.5723502 0.0557250 10.271 < 2e-16 ***
## DWNPMFR -1.3175535 0.1275771 -10.328 < 2e-16 ***
## PROFBUS1 0.2100458 0.0491541 4.273 1.93e-05 ***
## QUALHSC1 0.1835106 0.0402820 4.556 5.22e-06 ***
## QUAL_PG1 -0.2972520 0.0789155 -3.767 0.000165 ***
## SEXCODE1 0.2352736 0.0600159 3.920 8.85e-05 ***
## FULLPDC1 -1.7550203 0.0952821 -18.419 < 2e-16 ***
## FRICODE1 -0.1719774 0.0378111 -4.548 5.41e-06 ***
## WASHCODE1 -0.2585977 0.0477647 -5.414 6.16e-08 ***
## RegionAP2 -0.5817435 0.1797634 -3.236 0.001211 **
## RegionChennai -1.4436197 0.1414897 -10.203 < 2e-16 ***
## RegionKA1 -0.6604584 0.1417572 -4.659 3.18e-06 ***
## RegionKE2 -0.5973660 0.1458215 -4.097 4.19e-05 ***
## RegionTN1 -0.8355176 0.1369630 -6.100 1.06e-09 ***
## RegionTN2 -0.6171896 0.1464390 -4.215 2.50e-05 ***
## RegionVellore -0.7037441 0.1604099 -4.387 1.15e-05 ***
## TENORYR:FULLPDC1 0.4227544 0.0714356 5.918 3.26e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26040 on 21678 degrees of freedom
## Residual deviance: 22990 on 21657 degrees of freedom
## AIC: 23034
##
## Number of Fisher Scoring iterations: 4
attach(trainingSet)
# Defaulters based on whether they gave post-dated cheques
round(prop.table(table(DefaulterFlag, FULLPDC), 1)*100, 2)[, 1:2]## FULLPDC
## DefaulterFlag 0 1
## 0 38.55 61.45
## 1 69.96 30.04
library(data.table)
# average TENORYR of the borrower by Defaulter Flag
trainingSet <- data.table(trainingSet)
trainingSet[, .(AvgTENORYR = round(mean(TENORYR), 2)),
by=list(DefaulterFlag)]## DefaulterFlag AvgTENORYR
## 1: 0 1.15
## 2: 1 1.34
AGE = mean(AGE),
Male,
Education = UG,
MTHINCTH = mean(MTHINCTH)
NoOfDepe = mean(NOOFDEPE),
Owns a Fridge,
Owns a Washing Machine
Working Professional,
SALDATFR = mean(SALDATFR),
Lives in TN1
Down Payment = mean(DWNPMFR) %
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 1,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "1",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred1 <- predict(Model2, Xdata, type = "response")
pred1## 1
## 0.420266
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 2,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "1",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred2 <- predict(Model2, Xdata, type = "response")
pred2## 1
## 0.6622693
## 1
## 0.2420032
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 1,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "0",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred3 <- predict(Model2, Xdata, type = "response")
pred3## 1
## 0.7331345
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 2,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "0",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred4 <- predict(Model2, Xdata, type = "response")
pred4## 1
## 0.8296197
## 1
## 0.09648523
## Loading required package: lattice
## Loading required package: ggplot2
# confusion matrix using caret package
yPred <- ifelse(predProbTest > 0.5, "Yes", "No")
predY <- as.factor(yPred)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 635 417
## Yes 1448 4727
##
## Accuracy : 0.7419
## 95% CI : (0.7317, 0.752)
## No Information Rate : 0.7118
## P-Value [Acc > NIR] : 5.682e-09
##
## Kappa : 0.2624
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9189
## Specificity : 0.3048
## Pos Pred Value : 0.7655
## Neg Pred Value : 0.6036
## Prevalence : 0.7118
## Detection Rate : 0.6541
## Detection Prevalence : 0.8544
## Balanced Accuracy : 0.6119
##
## 'Positive' Class : Yes
##
## Warning: package 'ROCR' was built under R version 4.0.4
# predicted probabilities
predProbTest <- predict(Model1, testSet, type = "response")
lgPredObj <- prediction(predProbTest,testSet$DefaulterFlag)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")# auc for logit
aucLogit <- performance(lgPredObj, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit## [1] 0.7263962
set.seed(2345)
dTree <- train(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region,
data = trainingSet,
method = "rpart",
parms = list(split = "gini"),
trControl = trainControl(method = "cv"))
dTree## CART
##
## 21679 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 19511, 19511, 19512, 19511, 19511, 19511, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01024164 0.7292763 0.2129099
## 0.01152184 0.7269239 0.2189725
## 0.01408225 0.7186214 0.1326211
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01024164.
## Loading required package: rpart
library(caret)
# confusion matrix using caret package
yPred <- ifelse(predProbTest[2] > 0.5, "Yes", "No")
predY <- as.factor(yPred)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 434 297
## Yes 1649 4847
##
## Accuracy : 0.7307
## 95% CI : (0.7203, 0.7409)
## No Information Rate : 0.7118
## P-Value [Acc > NIR] : 0.0001805
##
## Kappa : 0.1867
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9423
## Specificity : 0.2084
## Pos Pred Value : 0.7462
## Neg Pred Value : 0.5937
## Prevalence : 0.7118
## Detection Rate : 0.6707
## Detection Prevalence : 0.8989
## Balanced Accuracy : 0.5753
##
## 'Positive' Class : Yes
##
library(ROCR)
# predicted probabilities
predProbTest <- predict(dTree, testSet, type = "prob")
lgPredObj <- prediction(predProbTest[2],testSet$DefaulterFlag)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")# auc for logit
aucTree <- performance(lgPredObj, measure = "auc")
aucTree <- aucTree@y.values[[1]]
aucTree## [1] 0.6829464