# Read AutoFinanaceData.csv into a data frame
df <- read.csv(file = "AutoFinanaceData.csv")
# Put the data frame's columns on the search path
# NOTE(review): attach() lets columns mask / be masked by other objects;
# prefer df$column once nothing else depends on the attached names.
attach(df)
# Row count followed by column count
c(nrow(df), ncol(df))
## [1] 28906 21
## [1] "Agmt.No" "ContractStatus" "StartDate" "AGE"
## [5] "NOOFDEPE" "MTHINCTH" "SALDATFR" "TENORYR"
## [9] "DWNPMFR" "PROFBUS" "QUALHSC" "QUAL_PG"
## [13] "SEXCODE" "FULLPDC" "FRICODE" "WASHCODE"
## [17] "Region" "Branch" "DefaulterFlag" "DefaulterType"
## [21] "DATASET"
1. Defaulter Flag
1: Customer has delayed paying at least once
0: Otherwise
1. Gender
SEXCODE = 1 (Male)
SEXCODE = 0 (Female)
2. Age
3. Education
QUALHSC
QUAL_PG
4. Income
Monthly Income in Thousands (MTHINCTH)
Owns a Fridge (FRICODE)
Owns a Washing Machine (WASHCODE)
5. Profession
6. No. of Dependents
7. Region
## 'data.frame': 28906 obs. of 21 variables:
## $ Agmt.No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
## $ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
## $ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
## $ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
## $ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
## $ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
## $ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
## $ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
## $ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
## $ PROFBUS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ QUALHSC : int 0 0 0 0 0 0 1 0 0 0 ...
## $ QUAL_PG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SEXCODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FULLPDC : int 1 1 1 1 1 0 0 1 1 1 ...
## $ FRICODE : int 0 1 1 1 1 0 0 0 0 0 ...
## $ WASHCODE : int 0 0 1 1 0 0 0 0 0 0 ...
## $ Region : chr "AP2" "AP2" "AP2" "AP2" ...
## $ Branch : chr "Vizag" "Vizag" "Vizag" "Vizag" ...
## $ DefaulterFlag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DefaulterType : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DATASET : chr " " "BUILD" "BUILD" "BUILD" ...
factor## 'data.frame': 28906 obs. of 21 variables:
## $ Agmt.No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
## $ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
## $ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
## $ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
## $ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
## $ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
## $ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
## $ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
## $ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
## $ PROFBUS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ QUALHSC : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ QUAL_PG : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SEXCODE : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ FULLPDC : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 1 2 2 2 ...
## $ FRICODE : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 1 1 ...
## $ WASHCODE : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 1 1 ...
## $ Region : Factor w/ 8 levels "AP1","AP2","Chennai",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Branch : Factor w/ 14 levels "Bangalore","Chennai",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ DefaulterFlag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DefaulterType : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ DATASET : chr " " "BUILD" "BUILD" "BUILD" ...
## Warning: package 'caTools' was built under R version 4.0.4
## Loading required package: lattice
## Loading required package: ggplot2
# Seed the RNG before the cross-validated fit
set.seed(2345)
# Predictor columns used by the classification tree
treePredictors <- c("AGE", "NOOFDEPE", "MTHINCTH", "SALDATFR", "TENORYR",
                    "DWNPMFR", "PROFBUS", "QUALHSC", "QUAL_PG", "SEXCODE",
                    "FULLPDC", "FRICODE", "WASHCODE", "Region")
# Resampling scheme: cross-validation with trainControl's default settings
treeControl <- trainControl(method = "cv")
# Fit an rpart tree through caret, splitting on the Gini index;
# the formula is DefaulterFlag ~ AGE + NOOFDEPE + ... + Region
dTree <- train(reformulate(treePredictors, response = "DefaulterFlag"),
               data = trainingSet,
               method = "rpart",
               parms = list(split = "gini"),
               trControl = treeControl)
# Print the resampling summary
dTree
## CART
##
## 21679 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 19511, 19511, 19512, 19511, 19511, 19511, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01024164 0.7292763 0.2129099
## 0.01152184 0.7269239 0.2189725
## 0.01408225 0.7186214 0.1326211
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01024164.
## Loading required package: rpart
library(caret)
# predicted class probabilities on the test set (one column per class)
predProbTestTree <- predict(dTree, testSet, type = "prob")
# Label a case "Yes" when the second probability column exceeds 0.5.
# `[[2]]` extracts a plain numeric vector (the original `[2]` produced a
# one-column matrix), and the explicit levels keep both classes present, in
# the same order as the reference, even if every case lands on one side of
# the cutoff.
yPred <- ifelse(predProbTestTree[[2]] > 0.5, "Yes", "No")
predY <- factor(yPred, levels = c("No", "Yes"))
# Relabel the observed outcome so it uses the same labels as the predictions
# (modifies testSet in place)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
# confusion matrix using caret package, with "Yes" as the positive class
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 434 297
## Yes 1649 4847
##
## Accuracy : 0.7307
## 95% CI : (0.7203, 0.7409)
## No Information Rate : 0.7118
## P-Value [Acc > NIR] : 0.0001805
##
## Kappa : 0.1867
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9423
## Specificity : 0.2084
## Pos Pred Value : 0.7462
## Neg Pred Value : 0.5937
## Prevalence : 0.7118
## Detection Rate : 0.6707
## Detection Prevalence : 0.8989
## Balanced Accuracy : 0.5753
##
## 'Positive' Class : Yes
##
## Warning: package 'ROCR' was built under R version 4.0.4
# loading the package that provides prediction() and performance()
library(ROCR)
# predicted class probabilities on the test set
DTProb <- predict(dTree, testSet, type = "prob")
# pair the second probability column with the observed labels
DTPrediction <- prediction(DTProb[2], testSet$DefaulterFlag)
# true-positive rate against false-positive rate
DTperformance <- performance(DTPrediction, "tpr", "fpr")
# plotting ROC curve
plot(DTperformance, main = "ROC Curve", col = 2, lwd = 2)
# dotted black diagonal (intercept 0, slope 1) as a reference line
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# this call was fused onto the abline() line, which does not parse
library(ROCR)
# Area under the ROC curve for the decision tree
DTPrediction <- prediction(predProbTestTree[2], testSet$DefaulterFlag)
# performance() returns an S4 object; take the first element of its y.values slot
aucDT <- slot(performance(DTPrediction, measure = "auc"), "y.values")[[1]]
aucDT
## [1] 0.6829464
library(caret)
# control parameters: fit one model without resampling; classProbs = TRUE so
# class probabilities can be requested from predict() afterwards
trctrl <- trainControl(method = "none", classProbs = TRUE)
## setting levels as "Yes" and "No"
# Recode only while the flag still carries its "0"/"1" coding. The previous
# ifelse(... == "1", "Yes", "No") turned every label into "No" when the chunk
# was run a second time. The result is a factor with levels No < Yes.
flagValues <- as.character(trainingSet$DefaulterFlag)
if (any(flagValues %in% c("0", "1"))) {
  trainingSet$DefaulterFlag <- factor(flagValues,
                                      levels = c("0", "1"),
                                      labels = c("No", "Yes"))
}
# Random forest on the same predictors as the other models.
# `nbagg` and `parms` were removed: they are the bagging / rpart settings used
# by the other models in this file and are not randomForest arguments, so they
# were silently ignored here.
# NOTE(review): if a specific number of trees was intended, set `ntree`.
RFModel <- train(DefaulterFlag ~ AGE
                 + NOOFDEPE
                 + MTHINCTH
                 + SALDATFR
                 + TENORYR
                 + DWNPMFR
                 + PROFBUS
                 + QUALHSC
                 + QUAL_PG
                 + SEXCODE
                 + FULLPDC
                 + FRICODE
                 + WASHCODE
                 + Region,
                 data = trainingSet,
                 method = "rf",
                 trControl = trctrl,
                 importance = TRUE)
# model summary
RFModel
## Random Forest
##
## 21679 samples
## 14 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
library(caret)
# predicted class probabilities on the test set (one column per class)
predProbTestRF <- predict(RFModel, testSet, type = "prob")
# Label a case "Yes" when the second probability column exceeds 0.5.
# `[[2]]` extracts a plain numeric vector (the original `[2]` produced a
# one-column matrix), and the explicit levels keep both classes present, in
# the same order as the reference, even if every case lands on one side of
# the cutoff.
yPred <- ifelse(predProbTestRF[[2]] > 0.5, "Yes", "No")
predY <- factor(yPred, levels = c("No", "Yes"))
# Relabel the observed outcome so it uses the same labels as the predictions
# (modifies testSet in place)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
# confusion matrix using caret package, with "Yes" as the positive class
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 710 552
## Yes 1373 4592
##
## Accuracy : 0.7336
## 95% CI : (0.7233, 0.7438)
## No Information Rate : 0.7118
## P-Value [Acc > NIR] : 1.89e-05
##
## Kappa : 0.2646
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8927
## Specificity : 0.3409
## Pos Pred Value : 0.7698
## Neg Pred Value : 0.5626
## Prevalence : 0.7118
## Detection Rate : 0.6354
## Detection Prevalence : 0.8254
## Balanced Accuracy : 0.6168
##
## 'Positive' Class : Yes
##
# loading the package that provides prediction() and performance()
library(ROCR)
# predicted class probabilities on the test set
RFProb <- predict(RFModel, testSet, type = "prob")
# pair the second probability column with the observed labels
RFPrediction <- prediction(RFProb[2], testSet$DefaulterFlag)
# true-positive rate against false-positive rate
RFperformance <- performance(RFPrediction, "tpr", "fpr")
# plotting ROC curve
plot(RFperformance, main = "ROC Curve", col = 2, lwd = 2)
# dotted black diagonal (intercept 0, slope 1) as a reference line
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# this call was fused onto the abline() line, which does not parse
library(ROCR)
# Area under the ROC curve for the random forest
RFPrediction <- prediction(predProbTestRF[2], testSet$DefaulterFlag)
# performance() returns an S4 object; take the first element of its y.values slot
aucRF <- slot(performance(RFPrediction, measure = "auc"), "y.values")[[1]]
aucRF
## [1] 0.7237563
library(caret)
# Seed the RNG before fitting the bagged trees
set.seed(123)
# Control object: a single fit with no resampling, class probabilities enabled
trctrl <- trainControl(method = "none", classProbs = TRUE)
## The outcome is expected to be labelled "Yes"/"No" already, so the recode
## stays disabled:
#trainingSet$DefaulterFlag <- ifelse(trainingSet$DefaulterFlag == "1","Yes","No")
# Predictor columns used by the bagged model
baggPredictors <- c("AGE", "NOOFDEPE", "MTHINCTH", "SALDATFR", "TENORYR",
                    "DWNPMFR", "PROFBUS", "QUALHSC", "QUAL_PG", "SEXCODE",
                    "FULLPDC", "FRICODE", "WASHCODE", "Region")
# Bagged CART through caret with nbagg = 50;
# the formula is DefaulterFlag ~ AGE + NOOFDEPE + ... + Region
BaggingModel <- train(reformulate(baggPredictors, response = "DefaulterFlag"),
                      data = trainingSet,
                      method = "treebag",
                      nbagg = 50,
                      trControl = trctrl,
                      importance = TRUE)
# Print the fitted model's summary
BaggingModel
## Bagged CART
##
## 21679 samples
## 14 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
library(caret)
# predicted class probabilities on the test set (one column per class)
predProbTestBagg <- predict(BaggingModel, testSet, type = "prob")
# Label a case "Yes" when the second probability column exceeds 0.5.
# `[[2]]` extracts a plain numeric vector (the original `[2]` produced a
# one-column matrix), and the explicit levels keep both classes present, in
# the same order as the reference, even if every case lands on one side of
# the cutoff.
yPred <- ifelse(predProbTestBagg[[2]] > 0.5, "Yes", "No")
predY <- factor(yPred, levels = c("No", "Yes"))
# Relabel the observed outcome so it uses the same labels as the predictions
# (modifies testSet in place)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
# confusion matrix using caret package, with "Yes" as the positive class
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 744 706
## Yes 1339 4438
##
## Accuracy : 0.717
## 95% CI : (0.7065, 0.7274)
## No Information Rate : 0.7118
## P-Value [Acc > NIR] : 0.1651
##
## Kappa : 0.2418
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8628
## Specificity : 0.3572
## Pos Pred Value : 0.7682
## Neg Pred Value : 0.5131
## Prevalence : 0.7118
## Detection Rate : 0.6141
## Detection Prevalence : 0.7994
## Balanced Accuracy : 0.6100
##
## 'Positive' Class : Yes
##
# loading the package that provides prediction() and performance()
library(ROCR)
# predicted class probabilities on the test set
BaggProb <- predict(BaggingModel, testSet, type = "prob")
# pair the second probability column with the observed labels
BaggPrediction <- prediction(BaggProb[2], testSet$DefaulterFlag)
# true-positive rate against false-positive rate
Baggperformance <- performance(BaggPrediction, "tpr", "fpr")
# plotting ROC curve
plot(Baggperformance, main = "ROC Curve", col = 2, lwd = 2)
# dotted black diagonal (intercept 0, slope 1) as a reference line
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# this call was fused onto the abline() line, which does not parse
library(ROCR)
# Area under the ROC curve for the bagged model
BaggPrediction <- prediction(predProbTestBagg[2], testSet$DefaulterFlag)
# performance() returns an S4 object; take the first element of its y.values slot
aucBagg <- slot(performance(BaggPrediction, measure = "auc"), "y.values")[[1]]
aucBagg
## [1] 0.7010126
# Second probability column of each model's test-set predictions
predList <- list(predProbTestTree[2], predProbTestRF[2], predProbTestBagg[2])
# Number of models being compared
m <- length(predList)
# One colour per model, shared by the curves and the legend. The legend used
# to take `fill = 1:m`, i.e. whatever the active palette holds, which need not
# match the named colours the curves are drawn with.
rocColours <- c("black", "red", "green")
# ROC curves (decision tree, random forest, bagging) overlaid on one plot
plot(DTperformance, col = rocColours[1], lwd = 2)
plot(RFperformance, add = TRUE, col = rocColours[2], lwd = 3)
plot(Baggperformance, add = TRUE, col = rocColours[3], lwd = 4)
legend(x = "bottomright",
       legend = c("Decision Tree", "Random Forest", "Bagging"),
       fill = rocColours)