Presented By - Group 6
Models Used - Logistic (Logit) Regression and Decision Tree
Course - Machine Learning in Marketing
library(data.table)
# reading data
# NOTE(review): the path below is absolute and machine-specific; the script
# only runs where the CSV exists at exactly this location.
AutoFinanaceData <- fread("C:/Users/lenovo/Downloads/MLM Project/AutoFinanaceData.csv")
# Put the columns on the search path so they can be referenced by bare name.
# NOTE(review): attach() exposes the columns as they are at this point; later
# `AutoFinanaceData$col <- ...` assignments are not reflected in the attached
# copy, so bare column names would still see the unconverted values.
attach(AutoFinanaceData)
# number of rows and columns
dim(AutoFinanaceData)
# column names, types and first few values
str(AutoFinanaceData)
[1] 28906 21
Classes 'data.table' and 'data.frame': 28906 obs. of 21 variables:
$ Agmt No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
$ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
$ StartDate : chr "19-01-2001" "10-05-2001" "05-08-2001" "03-09-2001" ...
$ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
$ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
$ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
$ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
$ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
$ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
$ PROFBUS : int 0 0 0 0 0 0 0 0 0 0 ...
$ QUALHSC : int 0 0 0 0 0 0 1 0 0 0 ...
$ QUAL_PG : int 0 0 0 0 0 0 0 0 0 0 ...
$ SEXCODE : int 1 1 1 1 1 1 1 1 1 1 ...
$ FULLPDC : int 1 1 1 1 1 0 0 1 1 1 ...
$ FRICODE : int 0 1 1 1 1 0 0 0 0 0 ...
$ WASHCODE : int 0 0 1 1 0 0 0 0 0 0 ...
$ Region : chr "AP2" "AP2" "AP2" "AP2" ...
$ Branch : chr "Vizag" "Vizag" "Vizag" "Vizag" ...
$ DefaulterFlag : int 0 0 0 0 0 0 0 0 0 0 ...
$ DefaulterType : int 0 0 0 0 0 0 0 0 0 0 ...
$ DATASET : chr "BUILD" "BUILD" "BUILD" "BUILD" ...
- attr(*, ".internal.selfref")=<externalptr>
# Convert the indicator columns, the two default outcomes and the two
# location columns to factors in a single data.table assignment
factorCols <- c(
  "PROFBUS", "QUALHSC", "QUAL_PG", "SEXCODE", "FULLPDC", "FRICODE",
  "WASHCODE", "DefaulterFlag", "DefaulterType", "Region", "Branch"
)
AutoFinanaceData[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]
# verify conversion
str(AutoFinanaceData)
Classes 'data.table' and 'data.frame': 28906 obs. of 21 variables:
$ Agmt No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
$ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
$ StartDate : chr "19-01-2001" "10-05-2001" "05-08-2001" "03-09-2001" ...
$ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
$ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
$ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
$ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
$ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
$ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
$ PROFBUS : Factor w/ 2 levels "Professional",..: 1 1 1 1 1 1 1 1 1 1 ...
$ QUALHSC : Factor w/ 2 levels "No HSC","HSC": 1 1 1 1 1 1 2 1 1 1 ...
$ QUAL_PG : Factor w/ 2 levels "No PG","PG": 1 1 1 1 1 1 1 1 1 1 ...
$ SEXCODE : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
$ FULLPDC : Factor w/ 2 levels "NO PDC","PDC": 2 2 2 2 2 1 1 2 2 2 ...
$ FRICODE : Factor w/ 2 levels "No Fridge","Fridge": 1 2 2 2 2 1 1 1 1 1 ...
$ WASHCODE : Factor w/ 2 levels "No Washing Machine",..: 1 1 2 2 1 1 1 1 1 1 ...
$ Region : Factor w/ 9 levels "AP1","AP2","Chennai",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Branch : Factor w/ 14 levels "Bangalore","Chennai",..: 14 14 14 14 14 14 14 14 14 14 ...
$ DefaulterFlag : Factor w/ 2 levels "No Default","Default": 1 1 1 1 1 1 1 1 1 1 ...
$ DefaulterType : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ DATASET : chr "BUILD" "BUILD" "BUILD" "BUILD" ...
- attr(*, ".internal.selfref")=<externalptr>
No Default Default
28.82 71.18
Gender
No Default Default
Female 0.3447653 0.6552347
Male 0.2835519 0.7164481
No Default Default
Female 0.09169467 0.07057451
Male 0.90830533 0.92942549
Age
Dependent
No Default Default
0 0.2495499 0.7504501
1 0.3370006 0.6629994
2 0.3105651 0.6894349
3 0.3031399 0.6968601
4 0.2802617 0.7197383
5 0.2418033 0.7581967
6 0.2135356 0.7864644
7 0.2107023 0.7892977
8 0.2344828 0.7655172
9 0.2264151 0.7735849
10 0.1818182 0.8181818
Profession
No Default Default
Professional 0.2863178 0.7136822
Business 0.2993007 0.7006993
Highest Qualification - HSC
No Default Default
No HSC 0.2966182 0.7033818
HSC 0.2604866 0.7395134
Highest Qualification - Post Graduate
No Default Default
No PG 0.2837882 0.7162118
PG 0.3941731 0.6058269
Income
No Default Default
9.503609 8.704872
Salary Day After
No Default Default
0.4251908 0.4460295
Have Fridge
No Default Default
No Fridge 0.2464755 0.7535245
Fridge 0.3457176 0.6542824
Have Washing Machine
No Default Default
No Washing Machine 0.2661115 0.7338885
Washing Machine 0.3826261 0.6173739
Post Date Cheques
No Default Default
NO PDC 0.1817459 0.8182541
PDC 0.4541995 0.5458005
Loan Tenure
library(caTools)
# fix the random seed so the partition below is the same on every run
set.seed(123)
# logical mask built from DefaulterFlag: TRUE marks the 75% of rows used
# for training
split <- sample.split(AutoFinanaceData$DefaulterFlag, SplitRatio = 0.75)
# training rows (mask is TRUE) and their dimensions
trainData <- subset(AutoFinanaceData, split)
dim(trainData)
# remaining 25% of rows (mask is FALSE) form the test set
testData <- subset(AutoFinanaceData, !split)
dim(testData)
# fit logistic classifier 1: binomial GLM of DefaulterFlag on applicant, loan
# and location variables, estimated on the training rows.
# MTHINCTH is part of the formula so that the fit reproduces the recorded
# summary (its Call, the MTHINCTH coefficient row and the 21651 residual
# degrees of freedom all correspond to a model that includes it).
logitClassifier1 <- glm(
  DefaulterFlag ~ AGE + NOOFDEPE + MTHINCTH + SALDATFR + TENORYR + DWNPMFR +
    PROFBUS + QUALHSC + QUAL_PG + SEXCODE + FULLPDC + FRICODE + WASHCODE +
    Region + Branch,
  data = trainData,
  family = binomial()
)
# NOTE(review): the recorded summary reports 6 coefficients "not defined
# because of singularities", all of them Branch levels; Region and Branch
# appear to carry overlapping information - consider keeping only one.
# summary of the classifier 1
summary(logitClassifier1)
Call:
glm(formula = DefaulterFlag ~ AGE + NOOFDEPE + MTHINCTH + SALDATFR +
TENORYR + DWNPMFR + PROFBUS + QUALHSC + QUAL_PG + SEXCODE +
FULLPDC + FRICODE + WASHCODE + Region + Branch, family = binomial(),
data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6232 -1.0064 0.5589 0.7787 2.0975
Coefficients: (6 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.161379 0.198283 10.900 < 2e-16 ***
AGE -0.014544 0.001677 -8.672 < 2e-16 ***
NOOFDEPE 0.044418 0.011091 4.005 6.20e-05 ***
MTHINCTH 0.004724 0.003708 1.274 0.202671
SALDATFR -0.405922 0.043001 -9.440 < 2e-16 ***
TENORYR 0.792481 0.046271 17.127 < 2e-16 ***
DWNPMFR -1.275723 0.128294 -9.944 < 2e-16 ***
PROFBUSBusiness 0.270346 0.050867 5.315 1.07e-07 ***
QUALHSCHSC 0.180068 0.040573 4.438 9.07e-06 ***
QUAL_PGPG -0.315133 0.079130 -3.982 6.82e-05 ***
SEXCODEMale 0.214698 0.060264 3.563 0.000367 ***
FULLPDCPDC -1.232346 0.037419 -32.934 < 2e-16 ***
FRICODEFridge -0.123879 0.038595 -3.210 0.001329 **
WASHCODEWashing Machine -0.273548 0.048079 -5.690 1.27e-08 ***
RegionAP2 -0.209198 0.204014 -1.025 0.305169
RegionChennai -1.405321 0.150659 -9.328 < 2e-16 ***
RegionKA1 -0.674705 0.151403 -4.456 8.34e-06 ***
RegionKA2 -0.584042 0.154532 -3.779 0.000157 ***
RegionKE2 -0.792185 0.160746 -4.928 8.30e-07 ***
RegionTN1 -0.586235 0.155692 -3.765 0.000166 ***
RegionTN2 -0.637983 0.168516 -3.786 0.000153 ***
BranchChennai NA NA NA NA
BranchCoimbatore NA NA NA NA
BranchErnakulam NA NA NA NA
BranchKumbakonam -1.065906 0.200347 -5.320 1.04e-07 ***
BranchMadurai 0.284617 0.081437 3.495 0.000474 ***
BranchPondy 0.451926 0.116978 3.863 0.000112 ***
BranchSalem -0.330527 0.093718 -3.527 0.000421 ***
BranchTrichy -0.174639 0.079292 -2.202 0.027632 *
BranchTirunelveli NA NA NA NA
BranchTirupathi -0.334214 0.376931 -0.887 0.375256
BranchVellore NA NA NA NA
BranchVijayawada -1.476902 0.277246 -5.327 9.98e-08 ***
BranchVizag NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 26040 on 21678 degrees of freedom
Residual deviance: 22850 on 21651 degrees of freedom
AIC: 22906
Number of Fisher Scoring iterations: 4
# predicted probabilities from classifier 1 on the test set
predProbClass1 <- predict(logitClassifier1, type = 'response', newdata = testData)
# distribution of the predicted probabilities (six-number summary)
summary(predProbClass1)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1696 0.5836 0.7611 0.7103 0.8500 0.9812
# classify the test rows with classifier 1: probabilities above the 0.71
# cut-off are labelled 1, everything else 0
predProbClass1 <- predict(
  logitClassifier1,
  newdata = testData,
  type = "response"
)
yPred1 <- ifelse(predProbClass1 > 0.71, yes = 1, no = 0)
# number of test rows in each predicted class
table(yPred1)
yPred1
0 1
2918 4309
# Akaike Information Criterion (AIC)
# AIC of classifier 1 (the same quantity summary() reports on its "AIC:" line)
AIC(logitClassifier1)
[1] 22906.36
# cross-tabulate the actual test outcomes (rows) against the classifier 1
# predictions (columns)
confMatrix1 <- table(
  yActual = testData$DefaulterFlag,
  yPred1 = yPred1
)
confMatrix1
yPred1
yActual 0 1
0 1384 699
1 1534 3610
# test-set performance metrics for classifier 1 come from MLmetrics
library(MLmetrics)
# accuracy: share of test rows whose predicted class equals the actual flag
Accuracy(
  y_pred = yPred1,
  y_true = testData$DefaulterFlag
)
[1] 0.6910198
# sensitivity of classifier 1, treating class 1 as the positive class
Sensitivity(
  y_pred = yPred1,
  y_true = testData$DefaulterFlag,
  positive = 1
)
[1] 0.7017885
# specificity of classifier 1, treating class 1 as the positive class
Specificity(
  y_pred = yPred1,
  y_true = testData$DefaulterFlag,
  positive = 1
)
[1] 0.6644263
library(ROCR)
# pair the test-set probabilities of classifier 1 with the actual outcomes
PredictObject1 <- prediction(
  predictions = predProbClass1,
  labels = testData$DefaulterFlag
)
# true positive rate (y axis) against false positive rate (x axis)
PerformObject1 <- performance(
  PredictObject1,
  measure = "tpr",
  x.measure = "fpr"
)
plot(PerformObject1, main = "ROC Curve for Default", col = "black", lwd = 2)
# dotted diagonal from (0, 0) with slope 1 as a visual reference line
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
library(caret)
# decision tree (rpart through caret) with information gain as the split
# criterion; cp is tuned by 10-fold cross-validation on the training rows
dTreeInfoGain <- train(
  DefaulterFlag ~ AGE + NOOFDEPE + SALDATFR + TENORYR + DWNPMFR +
    PROFBUS + QUALHSC + QUAL_PG + SEXCODE + FULLPDC + FRICODE + WASHCODE +
    Region + Branch,
  data = trainData,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  parms = list(split = "information")
)
# print the resampling results and the selected cp
dTreeInfoGain
CART
21679 samples
14 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 19512, 19511, 19511, 19511, 19511, 19511, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01024164 0.7309841 0.2080844
0.01152184 0.7286317 0.2176073
0.01408225 0.7193133 0.1408632
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.01024164.
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.3500 0.5899 0.8176 0.7122 0.8176 0.8176
# predicted class on the test data (decision tree); columns 4:18 are the
# candidate predictor columns AGE through Branch
predClassInfoGain <- predict(
  dTreeInfoGain,
  newdata = testData[, 4:18],
  type = "raw"
)
# number of test rows in each predicted class
table(predClassInfoGain)
predClassInfoGain
0 1
731 6496
# confusion matrix and derived statistics for the decision tree on the test
# set, with class "1" as the positive class
confusionMatrix(
  data = predClassInfoGain,
  reference = testData$DefaulterFlag,
  positive = "1"
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 434 297
1 1649 4847
Accuracy : 0.7307
95% CI : (0.7203, 0.7409)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 0.0001805
Kappa : 0.1867
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9423
Specificity : 0.2084
Pos Pred Value : 0.7462
Neg Pred Value : 0.5937
Prevalence : 0.7118
Detection Rate : 0.6707
Detection Prevalence : 0.8989
Balanced Accuracy : 0.5753
'Positive' Class : 1
library(ROCR)
# class probabilities from the decision tree on the test data; computed here
# because predTestProbInfoGain is not assigned anywhere in the visible script
predTestProbInfoGain <- predict(
  dTreeInfoGain,
  newdata = testData[, 4:18],
  type = "prob"
)
# prediction
# the second column holds the probability of the second class ("1"); it is
# extracted as a plain vector so it lines up one-to-one with the labels
PredictObjectTree <- prediction(predTestProbInfoGain[[2]], testData$DefaulterFlag)
# performance
# true positive rate against false positive rate (ROC curve coordinates)
PerformObjectTree <- performance(PredictObjectTree, "tpr", "fpr")