Sameer Mathur
Logistic Regression Using Caret Package
library(data.table)
autoF.dt <- fread("AutoFinanaceData.csv")
attach(autoF.dt)
# dimension
dim(autoF.dt)
[1] 28906 21
“1” if customer has delayed even once ,“0” otherwise
addmargins(table(autoF.dt$DefaulterFlag),1)
0 1 Sum
8332 20574 28906
round(prop.table(table(autoF.dt$DefaulterFlag))*100,2)
0 1
28.82 71.18
71.18 % of 28,906 customers have defaulted on their Auto Loan
AGE
NOOFDEPE
MTHINCTH
SALDATFR
PROFBUS
QUALHSC
QUAL_PG
SEXCODE
FRICODE
WASHCODE
TENORYR
DWNPMFR
FULLPDC
# data columns
colnames(autoF.dt)
[1] "Agmt No" "ContractStatus" "StartDate" "AGE"
[5] "NOOFDEPE" "MTHINCTH" "SALDATFR" "TENORYR"
[9] "DWNPMFR" "PROFBUS" "QUALHSC" "QUAL_PG"
[13] "SEXCODE" "FULLPDC" "FRICODE" "WASHCODE"
[17] "Region" "Branch" "DefaulterFlag" "DefaulterType"
[21] "DATASET"
# structure of the data table
str(autoF.dt)
Classes 'data.table' and 'data.frame': 28906 obs. of 21 variables:
$ Agmt No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
$ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
$ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
$ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
$ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
$ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
$ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
$ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
$ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
$ PROFBUS : int 0 0 0 0 0 0 0 0 0 0 ...
$ QUALHSC : int 0 0 0 0 0 0 1 0 0 0 ...
$ QUAL_PG : int 0 0 0 0 0 0 0 0 0 0 ...
$ SEXCODE : int 1 1 1 1 1 1 1 1 1 1 ...
$ FULLPDC : int 1 1 1 1 1 0 0 1 1 1 ...
$ FRICODE : int 0 1 1 1 1 0 0 0 0 0 ...
$ WASHCODE : int 0 0 1 1 0 0 0 0 0 0 ...
$ Region : chr "AP2" "AP2" "AP2" "AP2" ...
$ Branch : chr "Vizag" "Vizag" "Vizag" "Vizag" ...
$ DefaulterFlag : int 0 0 0 0 0 0 0 0 0 0 ...
$ DefaulterType : int 0 0 0 0 0 0 0 0 0 0 ...
$ DATASET : chr "" "BUILD" "BUILD" "BUILD" ...
- attr(*, ".internal.selfref")=<externalptr>
# convert 'Contract Status' to a factor
autoF.dt[, ContractStatus := factor(ContractStatus)]
# convert 'PROFBUS' to a factor
autoF.dt[, PROFBUS := factor(PROFBUS)]
# convert 'QUALHSC' to a factor
autoF.dt[, QUALHSC := factor(QUALHSC)]
# convert 'QUAL_PG' to a factor
autoF.dt[, QUAL_PG := factor(QUAL_PG)]
# convert 'SEXCODE' to a factor
autoF.dt[, SEXCODE := factor(SEXCODE)]
# convert 'FULLPDC' to a factor
autoF.dt[, FULLPDC := factor(FULLPDC)]
# convert 'FRICODE' to a factor
autoF.dt[, FRICODE := factor(FRICODE)]
# convert 'WASHCODE' to a factor
autoF.dt[, WASHCODE := factor(WASHCODE)]
# convert 'DefaulterFlag' to a factor
autoF.dt[, DefaulterFlag := factor(DefaulterFlag)]
# convert 'DefaulterType' to a factor
autoF.dt[, DefaulterType := factor(DefaulterType)]
# convert 'Region' to a factor
autoF.dt[, Region := factor(Region)]
# convert 'Branch' to a factor
autoF.dt[, Branch := factor(Branch)]
# convert 'DATASET' to a factor
autoF.dt[, DATASET := factor(DATASET)]
# verify conversion
str(autoF.dt)
Classes 'data.table' and 'data.frame': 28906 obs. of 21 variables:
$ Agmt No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
$ ContractStatus: Factor w/ 4 levels "Closed","Foreclosed",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
$ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
$ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
$ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
$ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
$ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
$ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
$ PROFBUS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ QUALHSC : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
$ QUAL_PG : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ SEXCODE : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
$ FULLPDC : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 1 2 2 2 ...
$ FRICODE : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 1 1 ...
$ WASHCODE : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 1 1 ...
$ Region : Factor w/ 8 levels "AP1","AP2","Chennai",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Branch : Factor w/ 14 levels "Bangalore","Chennai",..: 14 14 14 14 14 14 14 14 14 14 ...
$ DefaulterFlag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ DefaulterType : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ DATASET : Factor w/ 3 levels "","BUILD","VALIDATE": 1 2 2 2 2 2 2 2 2 2 ...
- attr(*, ".internal.selfref")=<externalptr>
# descriptive statistics of the dataframe
library(psych)
describe(autoF.dt)[, c(1:5)]
vars n mean sd median
Agmt No* 1 28906 NaN NA NA
ContractStatus* 2 28906 1.33 0.77 1.00
StartDate* 3 28906 NaN NA NA
AGE 4 28906 36.44 9.82 35.00
NOOFDEPE 5 28906 2.85 1.61 3.00
MTHINCTH 6 28906 8.94 4.81 8.00
SALDATFR 7 28906 0.44 0.46 0.17
TENORYR 8 28906 1.28 0.52 1.00
DWNPMFR 9 28906 0.38 0.16 0.38
PROFBUS* 10 28906 1.15 0.36 1.00
QUALHSC* 11 28906 1.23 0.42 1.00
QUAL_PG* 12 28906 1.04 0.20 1.00
SEXCODE* 13 28906 1.92 0.27 2.00
FULLPDC* 14 28906 1.39 0.49 1.00
FRICODE* 15 28906 1.42 0.49 1.00
WASHCODE* 16 28906 1.19 0.39 1.00
Region* 17 28906 5.33 1.51 6.00
Branch* 18 28906 5.93 3.47 6.00
DefaulterFlag* 19 28906 1.71 0.45 2.00
DefaulterType* 20 28906 1.85 0.63 2.00
DATASET* 21 28906 2.52 0.50 3.00
Reserve 80% for training and 20% of test
# loading the package
library(caTools)
# fixing the observations
set.seed(123)
# splitting the data
split = sample.split(autoF.dt$DefaulterFlag, SplitRatio = 0.75)
# creating the training set
trainingSet = subset(autoF.dt, split == TRUE)
# creating the test set
testSet = subset(autoF.dt, split == FALSE)
# dimensions of the full data
dim(autoF.dt)
[1] 28906 21
# dimensions of the training data
dim(trainingSet)
[1] 21679 21
# dimensions of the Testing data
dim(testSet)
[1] 7227 21
# percentage defaulter in full data
round(prop.table(table(autoF.dt$DefaulterFlag))*100,2)
0 1
28.82 71.18
# percentage defaulter in train data
round(prop.table(table(trainingSet$DefaulterFlag))*100,2)
0 1
28.83 71.17
# percentage defaulter in test data
round(prop.table(table(testSet$DefaulterFlag))*100,2)
0 1
28.82 71.18
library(caret)
# control parameters
objControl <- trainControl(method = "boot",
number = 2,
returnResamp = 'none',
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
set.seed(766)
# model building using caret package
caretLogitModel <- train(trainingSet[,c(4:17)],
trainingSet$DefaulterFlag,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",verbose = FALSE)
# list of all variables
colnames(trainingSet)
[1] "Agmt No" "ContractStatus" "StartDate" "AGE"
[5] "NOOFDEPE" "MTHINCTH" "SALDATFR" "TENORYR"
[9] "DWNPMFR" "PROFBUS" "QUALHSC" "QUAL_PG"
[13] "SEXCODE" "FULLPDC" "FRICODE" "WASHCODE"
[17] "Region" "Branch" "DefaulterFlag" "DefaulterType"
[21] "DATASET"
# summary of the model
caretLogitModelAIC <- caretLogitModel$finalModel
summary(caretLogitModelAIC)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7471 -1.0217 0.5715 0.7800 2.0890
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.224189 0.183812 12.100 < 2e-16 ***
AGE -0.014513 0.001662 -8.732 < 2e-16 ***
NOOFDEPE 0.056682 0.010792 5.252 1.50e-07 ***
SALDATFR -0.382527 0.041326 -9.256 < 2e-16 ***
TENORYR 0.773227 0.045417 17.025 < 2e-16 ***
DWNPMFR -1.305815 0.126651 -10.310 < 2e-16 ***
PROFBUS1 0.196425 0.048748 4.029 5.59e-05 ***
QUALHSC1 0.185719 0.040003 4.643 3.44e-06 ***
QUAL_PG1 -0.299497 0.078709 -3.805 0.000142 ***
SEXCODE1 0.234107 0.060015 3.901 9.59e-05 ***
FULLPDC1 -1.237006 0.036683 -33.721 < 2e-16 ***
FRICODE1 -0.176777 0.037311 -4.738 2.16e-06 ***
WASHCODE1 -0.264658 0.047637 -5.556 2.76e-08 ***
RegionAP2 -0.578538 0.179581 -3.222 0.001275 **
RegionChennai -1.413050 0.140703 -10.043 < 2e-16 ***
RegionKA1 -0.652726 0.141183 -4.623 3.78e-06 ***
RegionKE2 -0.574337 0.144780 -3.967 7.28e-05 ***
RegionTN1 -0.807934 0.136196 -5.932 2.99e-09 ***
RegionTN2 -0.613207 0.145596 -4.212 2.53e-05 ***
RegionVellore -0.656089 0.159347 -4.117 3.83e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 26040 on 21678 degrees of freedom
Residual deviance: 23025 on 21659 degrees of freedom
AIC: 23065
Number of Fisher Scoring iterations: 4
Demographic Variables
Loan-Related Variables
TENORYR
DWNPMFR
FULLPDC
# important variables
varImp(caretLogitModelAIC)
Overall
AGE 8.732406
NOOFDEPE 5.252202
SALDATFR 9.256242
TENORYR 17.025141
DWNPMFR 10.310381
PROFBUS1 4.029424
QUALHSC1 4.642604
QUAL_PG1 3.805121
SEXCODE1 3.900838
FULLPDC1 33.721322
FRICODE1 4.737867
WASHCODE1 5.555757
RegionAP2 3.221608
RegionChennai 10.042751
RegionKA1 4.623264
RegionKE2 3.966970
RegionTN1 5.932138
RegionTN2 4.211713
RegionVellore 4.117365
Age = meanAge,
Male,
Education = UG,
NoOfDepe = mean(NOOFDEPE),
Owns a Fridge,
Owns a Washing Machine,
Working Professional,
SALDATFR = mean(SALDATFR),
Lives in TN1
Tenure = mean(TENORYR)
Down Payment = mean(DWNPMFR) %
Did not submit FULLPDC
# creating single value data frame
newdata <- data.frame(
AGE = mean(trainingSet$AGE),
NOOFDEPE = mean(trainingSet$NOOFDEPE),
SALDATFR = mean(trainingSet$SALDATFR),
TENORYR = mean(trainingSet$TENORYR),
DWNPMFR = mean(trainingSet$DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "0",
FULLPDC = "0",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1")
# predicting probability
pred1 <- predict(caretLogitModelAIC, newdata, type = "response")
pred1
1
0.7195374
The probability of Mr. A. Kumar defaulting = 71.95%
Did not submit FULLPDC
Age = meanAge,
Male,
Education = UG,
NoOfDepe = mean(NOOFDEPE),
Owns a Fridge,
Owns a Washing Machine,
Working Professional,
SALDATFR = mean(SALDATFR),
Lives in TN1
Tenure = mean(TENORYR)
Down Payment = mean(DWNPMFR) %
submitted FULLPDC
# creating single value data frame
newdata <- data.frame(
AGE = mean(trainingSet$AGE),
NOOFDEPE = mean(trainingSet$NOOFDEPE),
SALDATFR = mean(trainingSet$SALDATFR),
TENORYR = mean(trainingSet$TENORYR),
DWNPMFR = mean(trainingSet$DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "0",
FULLPDC = "1",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1")
# predicting probability
pred2 <- predict(caretLogitModelAIC, newdata, type = "response")
pred2
1
0.42682
The probability of Mr. B. Kumar defaulting = 42.68%
Mr. B. Kumar is similar to A. Kumar, except that he submitted FULLPDC
# table for mean and sd
library(psych)
describe(trainingSet)[c(4:5,7:9),c(3,4)]
mean sd
AGE 36.45 9.82
NOOFDEPE 2.85 1.62
SALDATFR 0.44 0.46
TENORYR 1.29 0.52
DWNPMFR 0.38 0.16
Mr. C. Kumar is similar to A. Kumar, expect his loan Tenure is (Mean + SD of Loan Tenure) Years
# creating single value data frame
newdataX <- data.frame(
AGE = mean(trainingSet$AGE),
NOOFDEPE = mean(trainingSet$NOOFDEPE),
SALDATFR = mean(trainingSet$SALDATFR),
TENORYR = mean(trainingSet$TENORYR) + sd(trainingSet$TENORYR),
DWNPMFR = mean(trainingSet$DWNPMFR),
PROFBUS = "0",
QUALHSC = "0",
QUAL_PG = "0",
SEXCODE = "0",
FULLPDC = "0",
FRICODE = "1",
WASHCODE = "1",
Region = "TN1")
# predicting probability
probX <- predict(caretLogitModelAIC, newdataX, type = "response")
probX
1
0.7934194
TENORYR PROB
1 "mean-sd" "0.76" "0.63"
1 "mean" "1.28" "0.72"
1 "mean+sd" "1.8" "0.79"
Prob(Default) when TENORYR is low (e.g. mean - SD) = 63%
Prob(Default) when TENORYR is average = 72%
Prob(Default) when TENORYR is high (e.g. mean + SD) = 79%
Thus, an increase in TENORYR Increases Prob(Default)
Classification is done based on the predicted probability of default.
We could assume the threshold probability above which customers will default to be 50%.
But that leads to a meaningless answer.
# predicted probabilities
predProbTest <- predict(caretLogitModelAIC, testSet, type = "response")
# plot of probabilities
plot(predProbTest,
main = "Scatterplot of Probabilities of default (test data)",
xlab = "Customer ID",
ylab = "Predicted Probability of default")
# minimum value of predicted probability
min(predProbTest)
[1] 0.1672974
# maximum value of predicted probability
max(predProbTest)
[1] 0.977689
# mean value of predicted probability
mean(predProbTest)
[1] 0.7101482
library(caret)
# confusion matrix using caret package
yPred <- ifelse(predProbTest > 0.5, "Yes", "No")
predY <- as.factor(yPred)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 634 417
Yes 1449 4727
Accuracy : 0.7418
95% CI : (0.7315, 0.7519)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 6.639e-09
Kappa : 0.2619
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9189
Specificity : 0.3044
Pos Pred Value : 0.7654
Neg Pred Value : 0.6032
Prevalence : 0.7118
Detection Rate : 0.6541
Detection Prevalence : 0.8546
Balanced Accuracy : 0.6117
'Positive' Class : Yes
library(caret)
# confusion matrix using caret package
yPred <- ifelse(predProbTest > 0.7101482, "Yes", "No")
predY <- as.factor(yPred)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1372 1566
Yes 711 3578
Accuracy : 0.6849
95% CI : (0.6741, 0.6956)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 1
Kappa : 0.3157
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.6956
Specificity : 0.6587
Pos Pred Value : 0.8342
Neg Pred Value : 0.4670
Prevalence : 0.7118
Detection Rate : 0.4951
Detection Prevalence : 0.5935
Balanced Accuracy : 0.6771
'Positive' Class : Yes
predProbTest > 0.5, “Yes”
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 634 417
Yes 1449 4727
Accuracy : 0.7418
95% CI : (0.7315, 0.7519)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 6.639e-09
Kappa : 0.2619
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9189
Specificity : 0.3044
Pos Pred Value : 0.7654
Neg Pred Value : 0.6032
Prevalence : 0.7118
Detection Rate : 0.6541
Detection Prevalence : 0.8546
Balanced Accuracy : 0.6117
'Positive' Class : Yes
predProbTest > 0.7101482(mean), “Yes”
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1372 1566
Yes 711 3578
Accuracy : 0.6849
95% CI : (0.6741, 0.6956)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 1
Kappa : 0.3157
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.6956
Specificity : 0.6587
Pos Pred Value : 0.8342
Neg Pred Value : 0.4670
Prevalence : 0.7118
Detection Rate : 0.4951
Detection Prevalence : 0.5935
Balanced Accuracy : 0.6771
'Positive' Class : Yes
In some applications of ROC curves, you want the point closest to the TPR of (1) and FPR of (0). This cut point is “optimal” in the sense it weighs both sensitivity and specificity equally. To deterimine this cutoff, you can use the code below. The code takes in BOTH the performance object and prediction object and gives the optimal cutoff value of your predictions.
library(ROCR)
# predicted prob on test data
predProbTest <- predict(caretLogitModelAIC, testSet, type = "response")
# prediction
lgPredObj <- prediction(predProbTest, testSet$DefaulterFlag)
# performance
lgPerfObj <- performance(lgPredObj, "tpr",x.measure = "fpr")
# function for optimal cut point
opt.cut = function(lgPerfObj, lgPredObj){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, lgPerfObj@x.values, lgPerfObj@y.values, lgPredObj@cutoffs)
}
print(opt.cut(lgPerfObj, lgPredObj))
[,1]
sensitivity 0.6940124
specificity 0.6615458
cutoff 0.7121607
library(caret)
# confusion matrix using caret package
yPred <- ifelse(predProbTest > 0.7121607, "Yes", "No")
predY <- as.factor(yPred)
levels(testSet$DefaulterFlag) <- c("No", "Yes")
confusionMatrix(data = predY, reference = testSet$DefaulterFlag, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1378 1574
Yes 705 3570
Accuracy : 0.6847
95% CI : (0.6738, 0.6954)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 1
Kappa : 0.3163
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.6940
Specificity : 0.6615
Pos Pred Value : 0.8351
Neg Pred Value : 0.4668
Prevalence : 0.7118
Detection Rate : 0.4940
Detection Prevalence : 0.5915
Balanced Accuracy : 0.6778
'Positive' Class : Yes
predProbTest > 0.7101482(mean), “Yes”
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1372 1566
Yes 711 3578
Accuracy : 0.6849
95% CI : (0.6741, 0.6956)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 1
Kappa : 0.3157
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.6956
Specificity : 0.6587
Pos Pred Value : 0.8342
Neg Pred Value : 0.4670
Prevalence : 0.7118
Detection Rate : 0.4951
Detection Prevalence : 0.5935
Balanced Accuracy : 0.6771
'Positive' Class : Yes
predProbTest > 0.7121607(optimum)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1378 1574
Yes 705 3570
Accuracy : 0.6847
95% CI : (0.6738, 0.6954)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 1
Kappa : 0.3163
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.6940
Specificity : 0.6615
Pos Pred Value : 0.8351
Neg Pred Value : 0.4668
Prevalence : 0.7118
Detection Rate : 0.4940
Detection Prevalence : 0.5915
Balanced Accuracy : 0.6778
'Positive' Class : Yes
library(ROCR)
# predicted probabilities
predProbTest <- predict(caretLogitModelAIC, testSet, type = "response")
lgPredObj <- prediction(predProbTest,testSet$DefaulterFlag)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")
# auc for logit
aucLogit <- performance(lgPredObj, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit
[1] 0.7263816
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7471 -1.0217 0.5715 0.7800 2.0890
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.224189 0.183812 12.100 < 2e-16 ***
AGE -0.014513 0.001662 -8.732 < 2e-16 ***
NOOFDEPE 0.056682 0.010792 5.252 1.50e-07 ***
SALDATFR -0.382527 0.041326 -9.256 < 2e-16 ***
TENORYR 0.773227 0.045417 17.025 < 2e-16 ***
DWNPMFR -1.305815 0.126651 -10.310 < 2e-16 ***
PROFBUS1 0.196425 0.048748 4.029 5.59e-05 ***
QUALHSC1 0.185719 0.040003 4.643 3.44e-06 ***
QUAL_PG1 -0.299497 0.078709 -3.805 0.000142 ***
SEXCODE1 0.234107 0.060015 3.901 9.59e-05 ***
FULLPDC1 -1.237006 0.036683 -33.721 < 2e-16 ***
FRICODE1 -0.176777 0.037311 -4.738 2.16e-06 ***
WASHCODE1 -0.264658 0.047637 -5.556 2.76e-08 ***
RegionAP2 -0.578538 0.179581 -3.222 0.001275 **
RegionChennai -1.413050 0.140703 -10.043 < 2e-16 ***
RegionKA1 -0.652726 0.141183 -4.623 3.78e-06 ***
RegionKE2 -0.574337 0.144780 -3.967 7.28e-05 ***
RegionTN1 -0.807934 0.136196 -5.932 2.99e-09 ***
RegionTN2 -0.613207 0.145596 -4.212 2.53e-05 ***
RegionVellore -0.656089 0.159347 -4.117 3.83e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 26040 on 21678 degrees of freedom
Residual deviance: 23025 on 21659 degrees of freedom
AIC: 23065
Number of Fisher Scoring iterations: 4
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1378 1574
Yes 705 3570
Accuracy : 0.6847
95% CI : (0.6738, 0.6954)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 1
Kappa : 0.3163
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.6940
Specificity : 0.6615
Pos Pred Value : 0.8351
Neg Pred Value : 0.4668
Prevalence : 0.7118
Detection Rate : 0.4940
Detection Prevalence : 0.5915
Balanced Accuracy : 0.6778
'Positive' Class : Yes
# predicted probability on whole data
predProbW <- predict(caretLogitModelAIC, autoF.dt, type = "response")
# creating a table with consumer ID and pred probability
tab1 <- cbind(ID = autoF.dt$`Agmt No`,predProbW)
# converting as data frame
tab2 <- as.data.frame(tab1)
predProbW1<- round(as.numeric(predProbW)*100,2)
# ordering the table
tab3 <- tab2[with(tab2, order(- predProbW1)),]
# few rows of the table
head(tab3,13)
ID predProbW
12641 KA02201217 0.977689003767456
12700 KA02201218 0.97737022872726
2972 TN17100419 0.97702167957542
7779 KA02201297 0.976724012868744
5577 KA02200459 0.976324628602939
11587 AP18100009 0.973341881883995
4816 KA02200700 0.972678056191383
11406 KA02200099 0.97266595289672
10235 KA02201299 0.972491525598801
10744 KA02201231 0.972381407526044
11701 KA02201298 0.971704299085467
9089 KA02201116 0.971427460239132
12320 KA02201114 0.971426280754097