Sameer Mathur
Credit Card Default Data
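The analysis below assumes that the Default data from the ISLR package has already been split into trainingSet (8,000 observations, per the degrees of freedom reported later) and testSet (2,000 observations). One way to create such a split is sketched here; the seed and sampling method are assumptions, not the original code:
library(ISLR)
set.seed(123)                              # assumed seed, for reproducibility only
trainIndex <- sample(nrow(Default), 8000)  # 8,000 of the 10,000 customers
trainingSet <- Default[trainIndex, ]
testSet <- Default[-trainIndex, ]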
# recoding the target variable in the training set: Yes = 1, No = 0
trainingSet$default <- ifelse(trainingSet$default == "Yes",1,0)
# fit logistic regression model
logitModel <- glm(default ~
balance
+ income
+ student,
data = trainingSet,
family = binomial())
# summary of the logistic regression model
summary(logitModel)
Call:
glm(formula = default ~ balance + income + student, family = binomial(),
data = trainingSet)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4563 -0.1434 -0.0560 -0.0203 3.6795
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.088e+01 5.560e-01 -19.576 <2e-16 ***
balance 5.708e-03 2.583e-04 22.095 <2e-16 ***
income 4.017e-06 9.319e-06 0.431 0.6664
studentYes -5.865e-01 2.638e-01 -2.223 0.0262 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2333.8 on 7999 degrees of freedom
Residual deviance: 1263.7 on 7996 degrees of freedom
AIC: 1271.7
Number of Fisher Scoring iterations: 8
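Because the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to interpret; balance and student status are significant predictors, while income is not (p = 0.6664). A quick check on the fitted model:
# odds ratios implied by the fitted coefficients
exp(coef(logitModel))
# e.g. each extra dollar of balance multiplies the odds of default
# by exp(0.005708), roughly 1.0057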
# predicting the test set observations
logitModelPred <- predict(logitModel, testSet, type = "response")
# plot of probabilities
plot(logitModelPred,
main = "Scatterplot of Probabilities of Default (test data)",
xlab = "Customer ID", ylab = "Predicted Probability of Default")
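Optionally, a horizontal reference line at the 0.5 cut-off used below makes the scatterplot easier to read:
# optional: mark the 0.5 cut-off on the scatterplot
abline(h = 0.5, col = "red", lty = 2)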
# setting the cut-off probability
classify50 <- ifelse(logitModelPred > 0.5,"Yes","No")
# ordering the levels so that "Yes" is treated as the positive class
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testSet$default <- ordered(testSet$default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify50, Actual = testSet$default)
cm
Actual
Predicted Yes No
Yes 24 4
No 43 1929
library(caret)
confusionMatrix(cm)
Confusion Matrix and Statistics
Actual
Predicted Yes No
Yes 24 4
No 43 1929
Accuracy : 0.9765
95% CI : (0.9689, 0.9827)
No Information Rate : 0.9665
P-Value [Acc > NIR] : 0.005658
Kappa : 0.4953
Mcnemar's Test P-Value : 2.976e-08
Sensitivity : 0.3582
Specificity : 0.9979
Pos Pred Value : 0.8571
Neg Pred Value : 0.9782
Prevalence : 0.0335
Detection Rate : 0.0120
Detection Prevalence : 0.0140
Balanced Accuracy : 0.6781
'Positive' Class : Yes
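These headline statistics can be verified by hand from the table above; with "Yes" as the positive class, TP = 24, FN = 43, FP = 4, and TN = 1929:
# checking the caret statistics by hand
TP <- cm["Yes", "Yes"]; FN <- cm["No", "Yes"]
FP <- cm["Yes", "No"];  TN <- cm["No", "No"]
(TP + TN) / sum(cm)   # accuracy:    1953 / 2000 = 0.9765
TP / (TP + FN)        # sensitivity:   24 /   67 = 0.3582
TN / (TN + FP)        # specificity: 1929 / 1933 = 0.9979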
# function to print confusion matrices for different cut-off levels of probability
CmFn <- function(cutoff) {
  # predicting the test set results
  logitModelPred <- predict(logitModel, testSet, type = "response")
  C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
  C2 <- testSet$default
  predY <- ordered(C1, levels = c("Yes", "No"))
  actualY <- ordered(C2, levels = c("Yes", "No"))
  # use confusionMatrix() from the caret package
  cm1 <- confusionMatrix(table(predY, actualY))
  # extracting accuracy, sensitivity, specificity, and kappa
  Accuracy <- cm1$overall[1]
  Sensitivity <- cm1$byClass[1]
  Specificity <- cm1$byClass[2]
  Kappa <- cm1$overall[2]
  # combined one-row table
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
# sequence of cut-off probabilities
cutoff1 <- seq(0.10, 0.90, by = 0.05)
# applying the function to every cut-off with "lapply"
tab2 <- lapply(cutoff1, CmFn)
# stacking the 17 one-row results into a single table, with the cut-off as the first column
tab3 <- do.call(rbind, tab2)
rownames(tab3) <- NULL
tab3 <- data.frame(cutoff = cutoff1, tab3)
tab3
cutoff Accuracy Sensitivity Specificity Kappa
1 0.10 0.9340 0.73134328 0.9410243 0.39747762
2 0.15 0.9510 0.64179104 0.9617175 0.44369071
3 0.20 0.9605 0.58208955 0.9736161 0.47671723
4 0.25 0.9685 0.53731343 0.9834454 0.51703413
5 0.30 0.9705 0.52238806 0.9860321 0.52741778
6 0.35 0.9745 0.50746269 0.9906880 0.55850272
7 0.40 0.9755 0.46268657 0.9932747 0.54651464
8 0.45 0.9765 0.38805970 0.9968960 0.51474354
9 0.50 0.9765 0.35820896 0.9979307 0.49529659
10 0.55 0.9770 0.34328358 0.9989653 0.49072793
11 0.60 0.9755 0.29850746 0.9989653 0.44016635
12 0.65 0.9735 0.23880597 0.9989653 0.36749648
13 0.70 0.9725 0.19402985 0.9994827 0.31303240
14 0.75 0.9700 0.11940299 0.9994827 0.20421237
15 0.80 0.9695 0.10447761 0.9994827 0.18081220
16 0.85 0.9680 0.05970149 0.9994827 0.10695598
17 0.90 0.9680 0.04477612 1.0000000 0.08308142
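The best-accuracy row can also be picked out programmatically from tab3:
# row of tab3 with the highest accuracy
tab3[which.max(tab3$Accuracy), ]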
The table shows that accuracy peaks at 0.9770 when the cut-off probability is 0.55; note, however, that sensitivity at that cut-off is only 0.3433, so raising the cut-off trades sensitivity for specificity rather than improving both.
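The full confusionMatrix() output for any single cut-off can be reproduced in the same way as for 0.50; for example, for a cut-off of 0.10 (classify10 is named here by analogy with classify50):
# detailed statistics at the 0.10 cut-off
classify10 <- ordered(ifelse(logitModelPred > 0.10, "Yes", "No"),
                      levels = c("Yes", "No"))
confusionMatrix(table(Predicted = classify10, Actual = testSet$default))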
Cut-off probability = 0.10
Confusion Matrix and Statistics
Actual
Predicted Yes No
Yes 49 114
No 18 1819
Accuracy : 0.934
95% CI : (0.9222, 0.9445)
No Information Rate : 0.9665
P-Value [Acc > NIR] : 1
Kappa : 0.3975
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.7313
Specificity : 0.9410
Pos Pred Value : 0.3006
Neg Pred Value : 0.9902
Prevalence : 0.0335
Detection Rate : 0.0245
Detection Prevalence : 0.0815
Balanced Accuracy : 0.8362
'Positive' Class : Yes
Cut-off probability = 0.50
Confusion Matrix and Statistics
Actual
Predicted Yes No
Yes 24 4
No 43 1929
Accuracy : 0.9765
95% CI : (0.9689, 0.9827)
No Information Rate : 0.9665
P-Value [Acc > NIR] : 0.005658
Kappa : 0.4953
Mcnemar's Test P-Value : 2.976e-08
Sensitivity : 0.3582
Specificity : 0.9979
Pos Pred Value : 0.8571
Neg Pred Value : 0.9782
Prevalence : 0.0335
Detection Rate : 0.0120
Detection Prevalence : 0.0140
Balanced Accuracy : 0.6781
'Positive' Class : Yes
# loading the package
library(ROCR)
PredLR <- predict(logitModel, testSet, type = "response")
# "Yes" is the positive class, so it is listed second in label.ordering
lgPredObj <- prediction(PredLR, testSet$default, label.ordering = c("No", "Yes"))
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting the ROC curve and the chance diagonal
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
[1] 0.9485604
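As a sketch beyond the original analysis, the same ROCR objects can also suggest a cut-off, for example the one maximizing Youden's J statistic (TPR - FPR):
# cut-off maximizing Youden's J = TPR - FPR
J <- lgPerfObj@y.values[[1]] - lgPerfObj@x.values[[1]]
lgPerfObj@alpha.values[[1]][which.max(J)]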