# reading the data
df <- read.csv("AutoFinanaceData.csv")
# attach the data frame
attach(df)
# Number of rows and columns
dim(df)## [1] 28906 21
## [1] "Agmt.No" "ContractStatus" "StartDate" "AGE"
## [5] "NOOFDEPE" "MTHINCTH" "SALDATFR" "TENORYR"
## [9] "DWNPMFR" "PROFBUS" "QUALHSC" "QUAL_PG"
## [13] "SEXCODE" "FULLPDC" "FRICODE" "WASHCODE"
## [17] "Region" "Branch" "DefaulterFlag" "DefaulterType"
## [21] "DATASET"
1. Defaulter Flag
1: Customer has delayed paying at least once
0: Otherwise
1. Gender
SEXCODE = 1 (Male)
SEXCODE = 0 (Female)
2. Age
3. Education
QUALHSC
QUAL_PG
4. Income
Monthly Income in Thousands (MTHINCTH)
Owns a Fridge (FRICODE)
Owns a Washing Machine (WASHCODE)
5. Profession
6. No.of Dependents
7. Region
## 'data.frame': 28906 obs. of 21 variables:
## $ Agmt.No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
## $ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
## $ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
## $ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
## $ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
## $ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
## $ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
## $ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
## $ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
## $ PROFBUS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QUALHSC : int 0 0 0 0 0 0 1 0 0 0 ...
## $ QUAL_PG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SEXCODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FULLPDC : int 1 1 1 1 1 0 0 1 1 1 ...
## $ FRICODE : int 0 1 1 1 1 0 0 0 0 0 ...
## $ WASHCODE : int 0 0 1 1 0 0 0 0 0 0 ...
## $ Region : chr "AP2" "AP2" "AP2" "AP2" ...
## $ Branch : chr "Vizag" "Vizag" "Vizag" "Vizag" ...
## $ DefaulterFlag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DefaulterType : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DATASET : chr " " "BUILD" "BUILD" "BUILD" ...
factor## 'data.frame': 28906 obs. of 21 variables:
## $ Agmt.No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
## $ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
## $ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
## $ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
## $ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
## $ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
## $ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
## $ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
## $ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
## $ PROFBUS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ QUALHSC : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ QUAL_PG : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SEXCODE : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ FULLPDC : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 1 2 2 2 ...
## $ FRICODE : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 1 1 ...
## $ WASHCODE : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 1 1 ...
## $ Region : Factor w/ 8 levels "AP1","AP2","Chennai",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Branch : Factor w/ 14 levels "Bangalore","Chennai",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ DefaulterFlag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DefaulterType : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ DATASET : chr " " "BUILD" "BUILD" "BUILD" ...
attach(df)
# Defaulters based on whether they gave post-dated cheques
round(prop.table(table(DefaulterFlag, FULLPDC), 2)*100, 2)[, 1:2]## FULLPDC
## DefaulterFlag 0 1
## 0 18.17 45.42
## 1 81.83 54.58
As you can see above, among the people who are not paying post dated cheques, 81.8% defaulted.
library(data.table)
# average TENORYR of the borrower by Defaulter Flag
df <- data.table(df)
df[, .(AvgTENORYR = round(mean(TENORYR), 2)),
by=list(DefaulterFlag)]## DefaulterFlag AvgTENORYR
## 1: 0 1.15
## 2: 1 1.34
As you can see above, the defaulters have greater average tenor.
## Warning: package 'caTools' was built under R version 4.0.4
glm() on the training data, the output is as follows.Model1 <- glm(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ NOOFDEPE
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region,data = trainingSet, family = binomial())
summary(Model1)##
## Call:
## glm(formula = DefaulterFlag ~ AGE + NOOFDEPE + MTHINCTH + NOOFDEPE +
## SALDATFR + TENORYR + DWNPMFR + PROFBUS + QUALHSC + QUAL_PG +
## SEXCODE + FULLPDC + FRICODE + WASHCODE + Region, family = binomial(),
## data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7470 -1.0215 0.5716 0.7801 2.0874
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.2291732 0.1890284 11.793 < 2e-16 ***
## AGE -0.0144968 0.0016680 -8.691 < 2e-16 ***
## NOOFDEPE 0.0566860 0.0107923 5.252 1.50e-07 ***
## MTHINCTH -0.0004025 0.0035613 -0.113 0.910023
## SALDATFR -0.3833870 0.0420223 -9.123 < 2e-16 ***
## TENORYR 0.7727065 0.0456475 16.928 < 2e-16 ***
## DWNPMFR -1.3074501 0.1274734 -10.257 < 2e-16 ***
## PROFBUS1 0.1966576 0.0487903 4.031 5.56e-05 ***
## QUALHSC1 0.1853120 0.0401652 4.614 3.95e-06 ***
## QUAL_PG1 -0.2990904 0.0787907 -3.796 0.000147 ***
## SEXCODE1 0.2339445 0.0600322 3.897 9.74e-05 ***
## FULLPDC1 -1.2365885 0.0368674 -33.541 < 2e-16 ***
## FRICODE1 -0.1761473 0.0377247 -4.669 3.02e-06 ***
## WASHCODE1 -0.2644245 0.0476814 -5.546 2.93e-08 ***
## RegionAP2 -0.5788864 0.1796029 -3.223 0.001268 **
## RegionChennai -1.4136987 0.1408192 -10.039 < 2e-16 ***
## RegionKA1 -0.6529787 0.1411987 -4.625 3.75e-06 ***
## RegionKE2 -0.5753874 0.1450753 -3.966 7.30e-05 ***
## RegionTN1 -0.8084619 0.1362745 -5.933 2.98e-09 ***
## RegionTN2 -0.6142186 0.1458691 -4.211 2.55e-05 ***
## RegionVellore -0.6570233 0.1595604 -4.118 3.83e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26040 on 21678 degrees of freedom
## Residual deviance: 23025 on 21658 degrees of freedom
## AIC: 23067
##
## Number of Fisher Scoring iterations: 4
We want to compare the impact of loan tenor on the probability of load default, for consumers who pay using post dated cheques and remaining consumers who do not pay using post dated cheques, for this purpose we ran a model that gives the following output.
glm() on the training data, the expected output should be as follows.Model2 <- glm(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ NOOFDEPE
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region
+ TENORYR*FULLPDC
,data = trainingSet, family = binomial())
summary(Model2)##
## Call:
## glm(formula = DefaulterFlag ~ AGE + NOOFDEPE + MTHINCTH + NOOFDEPE +
## SALDATFR + TENORYR + DWNPMFR + PROFBUS + QUALHSC + QUAL_PG +
## SEXCODE + FULLPDC + FRICODE + WASHCODE + Region + TENORYR *
## FULLPDC, family = binomial(), data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5410 -1.0043 0.5721 0.7620 2.1539
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.4938627 0.1945079 12.821 < 2e-16 ***
## AGE -0.0144308 0.0016723 -8.629 < 2e-16 ***
## NOOFDEPE 0.0569949 0.0108145 5.270 1.36e-07 ***
## MTHINCTH 0.0006478 0.0035839 0.181 0.856559
## SALDATFR -0.3900738 0.0422177 -9.240 < 2e-16 ***
## TENORYR 0.5723502 0.0557250 10.271 < 2e-16 ***
## DWNPMFR -1.3175535 0.1275771 -10.328 < 2e-16 ***
## PROFBUS1 0.2100458 0.0491541 4.273 1.93e-05 ***
## QUALHSC1 0.1835106 0.0402820 4.556 5.22e-06 ***
## QUAL_PG1 -0.2972520 0.0789155 -3.767 0.000165 ***
## SEXCODE1 0.2352736 0.0600159 3.920 8.85e-05 ***
## FULLPDC1 -1.7550203 0.0952821 -18.419 < 2e-16 ***
## FRICODE1 -0.1719774 0.0378111 -4.548 5.41e-06 ***
## WASHCODE1 -0.2585977 0.0477647 -5.414 6.16e-08 ***
## RegionAP2 -0.5817435 0.1797634 -3.236 0.001211 **
## RegionChennai -1.4436197 0.1414897 -10.203 < 2e-16 ***
## RegionKA1 -0.6604584 0.1417572 -4.659 3.18e-06 ***
## RegionKE2 -0.5973660 0.1458215 -4.097 4.19e-05 ***
## RegionTN1 -0.8355176 0.1369630 -6.100 1.06e-09 ***
## RegionTN2 -0.6171896 0.1464390 -4.215 2.50e-05 ***
## RegionVellore -0.7037441 0.1604099 -4.387 1.15e-05 ***
## TENORYR:FULLPDC1 0.4227544 0.0714356 5.918 3.26e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26040 on 21678 degrees of freedom
## Residual deviance: 22990 on 21657 degrees of freedom
## AIC: 23034
##
## Number of Fisher Scoring iterations: 4
AGE = mean(AGE),
Male,
Education = UG,
MTHINCTH = mean(MTHINCTH)
NoOfDepe = mean(NOOFDEPE),
Owns a Fridge,
Owns a Washing Machine
Working Professional,
SALDATFR = mean(SALDATFR),
Lives in TN1
Down Payment = mean(DWNPMFR) %
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 1,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "0",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred1 <- predict(Model2, Xdata, type = "response")
pred1## 1
## 0.7331136
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 2,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "0",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred2 <- predict(Model2, Xdata, type = "response")
pred2## 1
## 0.8296046
## 1
## 0.09649103
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 1,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "1",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred3 <- predict(Model2, Xdata, type = "response")
pred3## 1
## 0.42024
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = 2,
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "1",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred4 <- predict(Model2, Xdata, type = "response")
pred4## 1
## 0.6622454
## 1
## 0.2420054
## Loading required package: lattice
## Loading required package: ggplot2
# confusion matrix using caret package
yPred <- ifelse(predProbTestLogit > 0.5, "Yes", "No")
predY <- as.factor(yPred)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 635 417
## Yes 1448 4727
##
## Accuracy : 0.7419
## 95% CI : (0.7317, 0.752)
## No Information Rate : 0.7118
## P-Value [Acc > NIR] : 5.682e-09
##
## Kappa : 0.2624
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9189
## Specificity : 0.3048
## Pos Pred Value : 0.7655
## Neg Pred Value : 0.6036
## Prevalence : 0.7118
## Detection Rate : 0.6541
## Detection Prevalence : 0.8544
## Balanced Accuracy : 0.6119
##
## 'Positive' Class : Yes
##
## Warning: package 'ROCR' was built under R version 4.0.4
# predicted probabilities
predProbTest <- predict(Model1, testSet, type = "response")
lgPredObj <- prediction(predProbTest,testSet$DefaulterFlag)
lgPerfObjlogit <- performance(lgPredObj, "tpr","fpr")
plot(lgPerfObjlogit,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")# auc for logit
aucLogit <- performance(lgPredObj, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit## [1] 0.7263962
set.seed(2345)
dTree <- train(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region,
data = trainingSet,
method = "rpart",
parms = list(split = "gini"),
trControl = trainControl(method = "cv"))
dTree## CART
##
## 21679 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 19511, 19511, 19512, 19511, 19511, 19511, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01024164 0.7292763 0.2129099
## 0.01152184 0.7269239 0.2189725
## 0.01408225 0.7186214 0.1326211
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01024164.
## Loading required package: rpart
AGE = mean(AGE),
Male,
Education = UG,
MTHINCTH = mean(MTHINCTH)
NoOfDepe = mean(NOOFDEPE),
Owns a Fridge,
Owns a Washing Machine
Working Professional,
SALDATFR = mean(SALDATFR),
Lives in TN1
Down Payment = mean(DWNPMFR) %
TENORYR = mean(TENORYR)
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = mean(TENORYR),
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "0",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred1 <- predict(dTree, Xdata, type = "raw")
pred1## [1] 1
## Levels: 0 1
Xdata <- data.frame(
AGE = mean(AGE),
NOOFDEPE = mean(NOOFDEPE),
SALDATFR = mean(SALDATFR),
TENORYR = mean(TENORYR),
DWNPMFR = mean(DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "1",
FULLPDC = "1",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1",
MTHINCTH = mean(MTHINCTH))
pred1 <- predict(dTree, Xdata, type = "raw")
pred1## [1] 1
## Levels: 0 1
library(caret)
# confusion matrix using caret package
yPred <- ifelse(predProbTestTree[2] > 0.5, "Yes", "No")
predY <- as.factor(yPred)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 434 297
## Yes 1649 4847
##
## Accuracy : 0.7307
## 95% CI : (0.7203, 0.7409)
## No Information Rate : 0.7118
## P-Value [Acc > NIR] : 0.0001805
##
## Kappa : 0.1867
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9423
## Specificity : 0.2084
## Pos Pred Value : 0.7462
## Neg Pred Value : 0.5937
## Prevalence : 0.7118
## Detection Rate : 0.6707
## Detection Prevalence : 0.8989
## Balanced Accuracy : 0.5753
##
## 'Positive' Class : Yes
##
library(ROCR)
# predicted probabilities
predProbTest <- predict(dTree, testSet, type = "prob")
lgPredObj <- prediction(predProbTest[2],testSet$DefaulterFlag)
lgPerfObjTree <- performance(lgPredObj, "tpr","fpr")
# List of predictions
predList <- list(predProbTestLogit, predProbTestTree[2])
# List of actual values (same for all)
m <- length(predList)
# ROC curves (logit and tree)
plot(lgPerfObjlogit, col = "red", lwd = 2,
main = "ROC Curves \n (Logit and Tree)")
plot(lgPerfObjTree, add = TRUE, col = "blue", lwd = 3,
main = "ROC Curve for CC Default \n (Logit and Tree)")
legend(x = "bottomright",
legend = c("Tree", "Logit"),
fill = 1:m)# auc for logit
aucTree <- performance(lgPredObj, measure = "auc")
aucTree <- aucTree@y.values[[1]]
aucTree## [1] 0.6829464