library(data.table)
# reading the data as a data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attach the data table
attach(CCdefault.dt)
# dimensions of the data table
dim(CCdefault.dt)
## [1] 29601 9
# first six rows of the data table
head(CCdefault.dt)
## Id CreditLimit Male Education MaritalStatus Age BillOutstanding LastPayment
## 1: 1 20000 0 2 1 24 3913 0
## 2: 2 120000 0 2 2 26 2682 0
## 3: 3 90000 0 2 2 34 29239 1518
## 4: 4 50000 0 2 1 37 46990 2000
## 5: 5 50000 1 2 1 57 8617 2000
## 6: 6 50000 1 1 2 37 64400 2500
## Default
## 1: 1
## 2: 1
## 3: 0
## 4: 0
## 5: 0
## 6: 0
# structure of the data table
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# convert 'Id' to a factor
CCdefault.dt[, Id := as.factor(Id)]
# convert 'Male' to a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' to a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' to a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' to a factor
CCdefault.dt[, Default := as.factor(Default)]
levels(CCdefault.dt$Default)
## [1] "0" "1"
# changing the levels of the 'Default' variable
levels(CCdefault.dt$Default) <- c("No","Yes")
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
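The five conversions above can also be written as a single data.table step; a brief sketch of that equivalent alternative (not in the original code, same column names as above):
# optional one-step alternative: convert several columns to factor at once
factorCols <- c("Id", "Male", "Education", "MaritalStatus", "Default")
CCdefault.dt[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]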
library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]
dim(trainData.dt)
## [1] 23681 9
# 20% test data
testData.dt <- CCdefault.dt[-trainIndex, ]
dim(testData.dt)
## [1] 5920 9
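createDataPartition() samples within each level of the outcome, so the proportion of defaulters should be roughly the same in the training and test sets. A quick check (not part of the original code):
# compare the default rate in the two partitions
prop.table(table(trainData.dt$Default))
prop.table(table(testData.dt$Default))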
set.seed(766)
# train control: classProbs and twoClassSummary are required for the "ROC" metric
# (the resampling settings below are an assumed, typical setup)
objControl <- trainControl(method = "cv", number = 10,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)
# model building using the caret package
LRModel <- train(Default ~ CreditLimit
                 + Male
                 + Education
                 + MaritalStatus
                 + BillOutstanding
                 + LastPayment,
                 data = trainData.dt,
                 method = 'glm',
                 trControl = objControl,
                 metric = "ROC")
# summary of the model
summary(LRModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9767 -0.7751 -0.6468 -0.3778 4.1637
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.626e-01 4.867e-02 -13.614 < 2e-16 ***
## CreditLimit -3.315e-06 1.615e-07 -20.523 < 2e-16 ***
## Male1 1.732e-01 3.226e-02 5.369 7.90e-08 ***
## Education2 1.606e-02 3.781e-02 0.425 0.67108
## Education3 5.350e-03 4.933e-02 0.108 0.91363
## Education4 -1.135e+00 3.952e-01 -2.872 0.00407 **
## MaritalStatus2 -2.464e-01 3.308e-02 -7.448 9.45e-14 ***
## MaritalStatus3 -6.862e-02 1.450e-01 -0.473 0.63608
## BillOutstanding 1.400e-06 2.647e-07 5.292 1.21e-07 ***
## LastPayment -2.302e-05 2.739e-06 -8.404 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24266 on 23671 degrees of freedom
## AIC: 24286
##
## Number of Fisher Scoring iterations: 6
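Because the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to read. A minimal sketch using the glm object stored by caret:
# odds ratios for the fitted logistic regression
exp(coef(LRModel$finalModel))
# e.g. exp(0.1732) is about 1.19: roughly 19% higher odds of default when Male = 1,
# holding the other predictors fixed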
# predicted probabilities of default on the test data
PredLR <- predict(LRModel, testData.dt, type = "prob")
# plot of probabilities
plot(PredLR$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# range of the predicted probabilities (smallest and largest value)
## [1] "0.00001234883" "0.37989222693"
# choosing cut-off probability
Predicted <- ifelse(PredLR$Yes > 0.22, "Yes", "No")
# actual and predicted data columns
Predicted <- as.factor(Predicted)
Actual <- as.factor(testData.dt$Default)
# making confusion matrix
cm <- confusionMatrix(data = Predicted, reference = Actual,
                      positive = "Yes")
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2333 416
## Yes 2266 905
##
## Accuracy : 0.547
## 95% CI : (0.5342, 0.5597)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1283
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6851
## Specificity : 0.5073
## Pos Pred Value : 0.2854
## Neg Pred Value : 0.8487
## Prevalence : 0.2231
## Detection Rate : 0.1529
## Detection Prevalence : 0.5356
## Balanced Accuracy : 0.5962
##
## 'Positive' Class : Yes
##
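The reported sensitivity and specificity can be verified directly from the counts in the confusion matrix (positive class = "Yes"):
# sensitivity = TP / (TP + FN)
905 / (905 + 416)    # 0.6851
# specificity = TN / (TN + FP)
2333 / (2333 + 2266) # 0.5073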
# function that returns confusion-matrix metrics for different cut-off levels of probability
CmFn <- function(cutoff) {
  # predicting the test set results
  Pred.LR <- predict(LRModel, testData.dt, type = "prob")
  C1 <- ifelse(Pred.LR$Yes > cutoff, "Yes", "No")
  C2 <- testData.dt$Default
  predY <- as.factor(C1)
  actualY <- as.factor(C2)
  # confusionMatrix() from the caret package
  cm1 <- confusionMatrix(data = predY, reference = actualY, positive = "Yes")
  # extracting accuracy
  Accuracy <- cm1$overall[1]
  # extracting sensitivity
  Sensitivity <- cm1$byClass[1]
  # extracting specificity
  Specificity <- cm1$byClass[2]
  # extracting kappa
  Kappa <- cm1$overall[2]
  # combining the metrics into a single row
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
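As a quick sanity check (not in the original code), calling the function at the 0.22 cut-off used earlier should reproduce the accuracy, sensitivity, specificity and kappa reported above:
# single call at the previously chosen cut-off
CmFn(0.22)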
# sequence of cut-off probability
cutoff1 <- seq( .01, .4, by = .03 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# combining the list of results into a single table
tab3 <- do.call(rbind, tab2)
tab4 <- as.data.frame(tab3)
# adding the cut-off values as the first column
tab5 <- cbind(cutoff1, tab4$Accuracy, tab4$Sensitivity, tab4$Specificity, tab4$Kappa)
tab6 <- as.data.frame(tab5)
# renaming the columns
pm <- setnames(tab6, c("cutoff1", "V2", "V3", "V4", "V5"),
               c("cutoff", "Accuracy", "Sensitivity", "Specificity", "Kappa"))
# printing the table
pm
## cutoff Accuracy Sensitivity Specificity Kappa
## 1 0.01 0.2256757 0.999242998 0.003479017 0.001217283
## 2 0.04 0.2331081 0.998485995 0.013263753 0.005283801
## 3 0.07 0.2498311 0.994700984 0.035877365 0.013935201
## 4 0.10 0.2856419 0.975018925 0.087627745 0.029507143
## 5 0.13 0.3315878 0.936411809 0.157860404 0.046618561
## 6 0.16 0.3866554 0.863739591 0.249619482 0.060218697
## 7 0.19 0.4641892 0.785768357 0.371819961 0.092532914
## 8 0.22 0.5469595 0.685087055 0.507284192 0.128324799
## 9 0.25 0.6238176 0.557153671 0.642965862 0.155715728
## 10 0.28 0.6881757 0.355791067 0.783648619 0.134225226
## 11 0.31 0.7467905 0.193792581 0.905631659 0.119821784
## 12 0.34 0.7684122 0.073429220 0.968036530 0.058448157
## 13 0.37 0.7770270 0.006056018 0.998477930 0.007000454
## 14 0.40 0.7768581 0.000000000 1.000000000 0.000000000
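If a single cut-off has to be picked automatically, one simple option (a sketch, not part of the original analysis) is the grid value that maximises kappa:
# cut-off with the highest kappa in the grid; for the table above this is 0.25 (kappa of about 0.156)
pm$cutoff[which.max(pm$Kappa)]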
# load the ROCR package for the ROC curve and AUC
library(ROCR)
PredLR <- predict(LRModel, testData.dt, type = "prob")
lgPredObj <- prediction(PredLR[2], testData.dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under the curve (AUC)
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.6303996
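The ROCR objects also allow a cut-off to be read directly off the ROC curve, for example the one maximising Youden's J (TPR - FPR); a minimal sketch using the objects created above:
# cut-offs and rates stored in the performance object
cutoffs <- lgPerfObj@alpha.values[[1]]
tpr <- lgPerfObj@y.values[[1]]
fpr <- lgPerfObj@x.values[[1]]
# cut-off that maximises Youden's J statistic
cutoffs[which.max(tpr - fpr)]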