library(data.table)
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attach the data table
attach(CCdefault.dt)
# dimension of the data table
dim(CCdefault.dt)
## [1] 29601 9
# first few rows of the data table
head(CCdefault.dt)
## Id CreditLimit Male Education MaritalStatus Age BillOutstanding LastPayment
## 1: 1 20000 0 2 1 24 3913 0
## 2: 2 120000 0 2 2 26 2682 0
## 3: 3 90000 0 2 2 34 29239 1518
## 4: 4 50000 0 2 1 37 46990 2000
## 5: 5 50000 1 2 1 57 8617 2000
## 6: 6 50000 1 1 2 37 64400 2500
## Default
## 1: 1
## 2: 1
## 3: 0
## 4: 0
## 5: 0
## 6: 0
# structure of the data table
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# convert 'Id' as a factor
CCdefault.dt[, Id := as.factor(Id)]
# convert 'Male' as a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault.dt[, Default := as.factor(Default)]
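The five conversions above can also be collapsed into a single data.table call; a minimal equivalent sketch using the same column names:
# one-step conversion of all categorical columns to factors
# (equivalent to the individual := conversions above)
factorCols <- c("Id", "Male", "Education", "MaritalStatus", "Default")
CCdefault.dt[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]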
# Changing the levels of the 'Default' variable
levels(CCdefault.dt$Default) <- c("No","Yes")
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
## [1] "No" "Yes"
# ordering the levels
CCdefault.dt$Default <- ordered(CCdefault.dt$Default, levels = c("Yes", "No"))
# verifying the new order of levels
levels(CCdefault.dt$Default)
## [1] "Yes" "No"
library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]
# remaining 20% as test data
testData.dt <- CCdefault.dt[-trainIndex, ]
dim(trainData.dt)
## [1] 23681 9
dim(testData.dt)
## [1] 5920 9
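Since createDataPartition samples within each class, a quick optional check (not part of the original output) is to compare the Default proportions across the full, training, and test sets:
# class proportions should be roughly equal in all three sets
prop.table(table(CCdefault.dt$Default))
prop.table(table(trainData.dt$Default))
prop.table(table(testData.dt$Default))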
# trainControl object used by caret::train
# (its original definition is not shown; assumed here to be 10-fold
#  cross-validation with class probabilities so that metric = "ROC" works)
objControl <- trainControl(method = "cv", number = 10,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)
set.seed(766)
# model building using caret package
LRModel <- train(Default ~ CreditLimit + Male + Education
                 + MaritalStatus + BillOutstanding + LastPayment,
                 data = trainData.dt,
                 method = 'glm',
                 trControl = objControl,
                 metric = "ROC")
# summary of the model
summary(LRModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.5635 0.3625 0.6510 0.7766 0.9881
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.795e-01 4.847e-02 14.019 < 2e-16 ***
## CreditLimit 3.161e-06 1.603e-07 19.719 < 2e-16 ***
## Male1 -1.606e-01 3.228e-02 -4.977 6.46e-07 ***
## Education2 -1.539e-02 3.767e-02 -0.409 0.68289
## Education3 4.191e-03 4.948e-02 0.085 0.93250
## Education4 1.473e+00 4.618e-01 3.190 0.00142 **
## MaritalStatus2 2.335e-01 3.302e-02 7.071 1.54e-12 ***
## MaritalStatus3 1.342e-01 1.479e-01 0.907 0.36449
## BillOutstanding -1.930e-06 2.596e-07 -7.434 1.05e-13 ***
## LastPayment 2.993e-05 3.112e-06 9.615 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24262 on 23671 degrees of freedom
## AIC: 24282
##
## Number of Fisher Scoring iterations: 6
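The coefficients are on the log-odds scale; exponentiating them gives odds ratios (a post-hoc check, not part of the original output). Note that with the levels reordered to c("Yes", "No"), the underlying glm models the log-odds of the second level, "No", i.e. of not defaulting.
# odds ratios for the fitted logistic regression
# (interpretation is relative to the odds of the second factor level, "No")
exp(coef(LRModel$finalModel))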
# predicting probabilities of default on the test data
PredLR <- predict(LRModel, testData.dt, type = "prob")
# plot of probabilities
plot(PredLR$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
## [1] "0.0000276088" "0.3716211045"
# choosing cut-off probability
pred.LR <- ifelse(PredLR$Yes > 0.20, "Yes", "No")
Predicted <- ordered(pred.LR, levels = c("Yes", "No"))
# actual and predicted data columns
Predicted <- as.factor(Predicted)
Actual <- as.factor(testData.dt$Default)
# making confusion matrix
cm <- confusionMatrix(data = Predicted, reference = Actual,
                      positive = "Yes")
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 1053 2806
## No 268 1793
##
## Accuracy : 0.4807
## 95% CI : (0.4679, 0.4936)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.111
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7971
## Specificity : 0.3899
## Pos Pred Value : 0.2729
## Neg Pred Value : 0.8700
## Prevalence : 0.2231
## Detection Rate : 0.1779
## Detection Prevalence : 0.6519
## Balanced Accuracy : 0.5935
##
## 'Positive' Class : Yes
##
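To see where these figures come from, sensitivity, specificity and accuracy can be recomputed directly from the counts in the matrix above:
# counts taken from the confusion matrix above
TP <- 1053; FN <- 268; FP <- 2806; TN <- 1793
TP / (TP + FN)                    # sensitivity = 0.7971
TN / (TN + FP)                    # specificity = 0.3899
(TP + TN) / (TP + TN + FP + FN)   # accuracy    = 0.4807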
# function to print confusion matrices for different cut-off levels of probability
CmFn <- function(cutoff) {
  # predicting the test set results
  Pred.LR <- predict(LRModel, testData.dt, type = "prob")
  C1 <- ifelse(Pred.LR$Yes > cutoff, "Yes", "No")
  C2 <- testData.dt$Default
  predY <- as.factor(C1)
  actualY <- as.factor(C2)
  # ordering the levels of the predicted variable
  predY <- ordered(predY, levels = c("Yes", "No"))
  # use confusionMatrix from the caret package
  cm1 <- confusionMatrix(data = predY, reference = actualY, positive = "Yes")
  # extracting accuracy
  Accuracy <- cm1$overall[1]
  # extracting sensitivity
  Sensitivity <- cm1$byClass[1]
  # extracting specificity
  Specificity <- cm1$byClass[2]
  # extracting the value of kappa
  Kappa <- cm1$overall[2]
  # combined table
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
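For a single cut-off the function returns a one-row table of the four metrics; for example, the 0.20 cut-off used earlier reproduces the values reported by the confusion matrix (output not repeated here):
# metrics at the cut-off used in the earlier confusion matrix
CmFn(0.20)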
# sequence of cut-off probability
cutoff1 <- seq( .01, .4, by = .03 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# combining the list of per-cutoff results into a single table
tab3 <- do.call(rbind, tab2)
# assembling the results with the cut-off column and converting to a data frame
tab4 <- as.data.frame(tab3)
tab5 <- cbind(cutoff1, tab4$Accuracy, tab4$Sensitivity, tab4$Specificity, tab4$Kappa)
tab6 <- as.data.frame(tab5)
pm <- setnames(tab6, "cutoff1", "cutoff")
pm <- setnames(pm, "V2", "Accuracy")
pm <- setnames(pm, "V3", "Senstivity")
pm <- setnames(pm, "V4", "Specificity")
pm <- setnames(pm, "V5", "kappa")
# printing the table
pm
## cutoff Accuracy Sensitivity Specificity kappa
## 1 0.01 0.2271959 0.997728993 0.005870841 0.001612372
## 2 0.04 0.2344595 0.993943982 0.016307893 0.004621428
## 3 0.07 0.2486486 0.984859955 0.037181996 0.010068503
## 4 0.10 0.2800676 0.970476911 0.081756904 0.024537119
## 5 0.13 0.3170608 0.947009841 0.136116547 0.040491021
## 6 0.16 0.3751689 0.894776684 0.225918678 0.062766589
## 7 0.19 0.4496622 0.820590462 0.343118069 0.093490427
## 8 0.22 0.5334459 0.718395155 0.480321809 0.128667716
## 9 0.25 0.6125000 0.576835731 0.622744075 0.151510926
## 10 0.28 0.6964527 0.394398183 0.783213742 0.168648746
## 11 0.31 0.7532095 0.199848600 0.912154816 0.135954120
## 12 0.34 0.7706081 0.062074186 0.974124810 0.051872595
## 13 0.37 0.7768581 0.001514005 0.999565123 0.001673851
## 14 0.40 0.7768581 0.000000000 1.000000000 0.000000000
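A cut-off can then be chosen programmatically, for instance the one that maximises Kappa in the table above (0.28 here):
# cut-off with the highest Kappa among the candidates
pm$cutoff[which.max(pm$kappa)]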
# load ROCR for the ROC curve and AUC
library(ROCR)
PredLR <- predict(LRModel, testData.dt, type = "prob")
lgPredObj <- prediction(PredLR[2], testData.dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under the curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.6412909