library(data.table)
# reading the data as a data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attach the data table
attach(CCdefault.dt)
# dimensions of the data table
dim(CCdefault.dt)
## [1] 29601 9
# first six rows of the data table
head(CCdefault.dt)
## Id CreditLimit Male Education MaritalStatus Age BillOutstanding LastPayment
## 1: 1 20000 0 2 1 24 3913 0
## 2: 2 120000 0 2 2 26 2682 0
## 3: 3 90000 0 2 2 34 29239 1518
## 4: 4 50000 0 2 1 37 46990 2000
## 5: 5 50000 1 2 1 57 8617 2000
## 6: 6 50000 1 1 2 37 64400 2500
## Default
## 1: 1
## 2: 1
## 3: 0
## 4: 0
## 5: 0
## 6: 0
# structure of the data table
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# convert 'Id' to a factor
CCdefault.dt[, Id := as.factor(Id)]
# convert 'Male' to a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' to a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' to a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' to a factor
CCdefault.dt[, Default := as.factor(Default)]
levels(CCdefault.dt$Default)
## [1] "0" "1"
# changing the levels of the 'Default' variable
levels(CCdefault.dt$Default) <- c("No","Yes")
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
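The five conversions above can also be written as a single data.table step; a brief sketch of that equivalent alternative (not in the original code, same column names as above):
# optional one-step alternative: convert several columns to factor at once
factorCols <- c("Id", "Male", "Education", "MaritalStatus", "Default")
CCdefault.dt[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]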
library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]
dim(trainData.dt)
## [1] 23681 9
# 20% test data
testData.dt <- CCdefault.dt[-trainIndex, ]
dim(testData.dt)
## [1] 5920 9
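createDataPartition() samples within each level of the outcome, so the proportion of defaulters should be roughly the same in the training and test sets. A quick check (not part of the original code):
# compare the default rate in the two partitions
prop.table(table(trainData.dt$Default))
prop.table(table(testData.dt$Default))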
set.seed(766)
# train control: classProbs and twoClassSummary are required for the "ROC" metric
# (the resampling settings below are an assumed, typical setup)
objControl <- trainControl(method = "cv", number = 10,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)
# model building using the caret package
LRModel <- train(Default ~ CreditLimit
                 + Male
                 + Education
                 + MaritalStatus
                 + BillOutstanding
                 + LastPayment,
                 data = trainData.dt,
                 method = 'glm',
                 trControl = objControl,
                 metric = "ROC")
# summary of the model
summary(LRModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9767 -0.7751 -0.6468 -0.3778 4.1637
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.626e-01 4.867e-02 -13.614 < 2e-16 ***
## CreditLimit -3.315e-06 1.615e-07 -20.523 < 2e-16 ***
## Male1 1.732e-01 3.226e-02 5.369 7.90e-08 ***
## Education2 1.606e-02 3.781e-02 0.425 0.67108
## Education3 5.350e-03 4.933e-02 0.108 0.91363
## Education4 -1.135e+00 3.952e-01 -2.872 0.00407 **
## MaritalStatus2 -2.464e-01 3.308e-02 -7.448 9.45e-14 ***
## MaritalStatus3 -6.862e-02 1.450e-01 -0.473 0.63608
## BillOutstanding 1.400e-06 2.647e-07 5.292 1.21e-07 ***
## LastPayment -2.302e-05 2.739e-06 -8.404 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24266 on 23671 degrees of freedom
## AIC: 24286
##
## Number of Fisher Scoring iterations: 6
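Because the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to read. A minimal sketch using the glm object stored by caret:
# odds ratios for the fitted logistic regression
exp(coef(LRModel$finalModel))
# e.g. exp(0.1732) is about 1.19: roughly 19% higher odds of default when Male = 1,
# holding the other predictors fixed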
# predicted probabilities of default on the test data
PredLR <- predict(LRModel, testData.dt, type = "prob")
# plot of probabilities
plot(PredLR$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# range of the predicted probabilities (smallest and largest value)
## [1] "0.00001234883" "0.37989222693"
# choosing cut-off probability
Predicted <- ifelse(PredLR$Yes > 0.22, "Yes", "No")
# actual and predicted data columns
Predicted <- as.factor(Predicted)
Actual <- as.factor(testData.dt$Default)
# making confusion matrix
cm <- confusionMatrix(data = Predicted, reference = Actual,
                      positive = "Yes")
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2333 416
## Yes 2266 905
##
## Accuracy : 0.547
## 95% CI : (0.5342, 0.5597)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1283
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6851
## Specificity : 0.5073
## Pos Pred Value : 0.2854
## Neg Pred Value : 0.8487
## Prevalence : 0.2231
## Detection Rate : 0.1529
## Detection Prevalence : 0.5356
## Balanced Accuracy : 0.5962
##
## 'Positive' Class : Yes
##
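The reported sensitivity and specificity can be verified directly from the counts in the confusion matrix (positive class = "Yes"):
# sensitivity = TP / (TP + FN)
905 / (905 + 416)    # 0.6851
# specificity = TN / (TN + FP)
2333 / (2333 + 2266) # 0.5073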
# function that returns confusion-matrix metrics for different cut-off levels of probability
CmFn <- function(cutoff) {
  # predicting the test set results
  Pred.LR <- predict(LRModel, testData.dt, type = "prob")
  C1 <- ifelse(Pred.LR$Yes > cutoff, "Yes", "No")
  C2 <- testData.dt$Default
  predY <- as.factor(C1)
  actualY <- as.factor(C2)
  # confusionMatrix() from the caret package
  cm1 <- confusionMatrix(data = predY, reference = actualY, positive = "Yes")
  # extracting accuracy
  Accuracy <- cm1$overall[1]
  # extracting sensitivity
  Sensitivity <- cm1$byClass[1]
  # extracting specificity
  Specificity <- cm1$byClass[2]
  # extracting kappa
  Kappa <- cm1$overall[2]
  # combining the metrics into a single row
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
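As a quick sanity check (not in the original code), calling the function at the 0.22 cut-off used earlier should reproduce the accuracy, sensitivity, specificity and kappa reported above:
# single call at the previously chosen cut-off
CmFn(0.22)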
# sequence of cut-off probability
cutoff1 <- seq( .01, .4, by = .03 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# combining the list of results into a single table
tab3 <- do.call(rbind, tab2)
tab4 <- as.data.frame(tab3)
# adding the cut-off values as the first column
tab5 <- cbind(cutoff1, tab4$Accuracy, tab4$Sensitivity, tab4$Specificity, tab4$Kappa)
tab6 <- as.data.frame(tab5)
# renaming the columns
pm <- setnames(tab6, c("cutoff1", "V2", "V3", "V4", "V5"),
               c("cutoff", "Accuracy", "Sensitivity", "Specificity", "Kappa"))
# printing the table
pm
## cutoff Accuracy Sensitivity Specificity Kappa
## 1 0.01 0.2256757 0.999242998 0.003479017 0.001217283
## 2 0.04 0.2331081 0.998485995 0.013263753 0.005283801
## 3 0.07 0.2498311 0.994700984 0.035877365 0.013935201
## 4 0.10 0.2856419 0.975018925 0.087627745 0.029507143
## 5 0.13 0.3315878 0.936411809 0.157860404 0.046618561
## 6 0.16 0.3866554 0.863739591 0.249619482 0.060218697
## 7 0.19 0.4641892 0.785768357 0.371819961 0.092532914
## 8 0.22 0.5469595 0.685087055 0.507284192 0.128324799
## 9 0.25 0.6238176 0.557153671 0.642965862 0.155715728
## 10 0.28 0.6881757 0.355791067 0.783648619 0.134225226
## 11 0.31 0.7467905 0.193792581 0.905631659 0.119821784
## 12 0.34 0.7684122 0.073429220 0.968036530 0.058448157
## 13 0.37 0.7770270 0.006056018 0.998477930 0.007000454
## 14 0.40 0.7768581 0.000000000 1.000000000 0.000000000
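If a single cut-off has to be picked automatically, one simple option (a sketch, not part of the original analysis) is the grid value that maximises kappa:
# cut-off with the highest kappa in the grid; for the table above this is 0.25 (kappa of about 0.156)
pm$cutoff[which.max(pm$Kappa)]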
# load the ROCR package for the ROC curve and AUC
library(ROCR)
PredLR <- predict(LRModel, testData.dt, type = "prob")
lgPredObj <- prediction(PredLR[2], testData.dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under the curve (AUC)
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.6303996
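The ROCR objects also allow a cut-off to be read directly off the ROC curve, for example the one maximising Youden's J (TPR - FPR); a minimal sketch using the objects created above:
# cut-offs and rates stored in the performance object
cutoffs <- lgPerfObj@alpha.values[[1]]
tpr <- lgPerfObj@y.values[[1]]
fpr <- lgPerfObj@x.values[[1]]
# cut-off that maximises Youden's J statistic
cutoffs[which.max(tpr - fpr)]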