READING AND EXPLORING DATA

First Few Rows of the Data Table

##    Id CreditLimit Male Education MaritalStatus Age BillOutstanding LastPayment
## 1:  1       20000    0         2             1  24            3913           0
## 2:  2      120000    0         2             2  26            2682           0
## 3:  3       90000    0         2             2  34           29239        1518
## 4:  4       50000    0         2             1  37           46990        2000
## 5:  5       50000    1         2             1  57            8617        2000
## 6:  6       50000    1         1             2  37           64400        2500
##    Default
## 1:       1
## 2:       1
## 3:       0
## 4:       0
## 5:       0
## 6:       0
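
The code that produced this preview is not echoed in the report; the following is a minimal sketch, assuming the raw file is named credit_default.csv (a hypothetical name), the object name creditData.dt, and that the data are read with data.table::fread().

library(data.table)

# read the raw data into a data.table (the file name is an assumption)
creditData.dt <- fread("credit_default.csv")

# preview the first six rows
head(creditData.dt)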

Data Types of the Data Columns

## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Converting the Column Data Types

## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
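
The conversion step itself is not echoed; a minimal sketch, assuming the creditData.dt object from the reading sketch above and data.table's := assignment:

# categorical columns stored as integers become factors
creditData.dt[, Id            := as.factor(Id)]
creditData.dt[, Male          := as.factor(Male)]
creditData.dt[, Education     := as.factor(Education)]
creditData.dt[, MaritalStatus := as.factor(MaritalStatus)]

# the target is recoded from 0/1 to a labelled factor
creditData.dt[, Default := factor(Default, levels = c(0, 1), labels = c("No", "Yes"))]

# check the resulting structure
str(creditData.dt)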

Checking the Order of Levels of the Target Variable

## [1] "No"  "Yes"

Resetting the Order of Levels of the Target Variable (“Event” = Yes, “No Event” = No)

## [1] "Yes" "No"

SPLITTING DATA
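
The splitting code is not echoed. Judging from the degrees of freedom in the model summary below (23,681 training rows) and the confusion-matrix totals (5,920 test rows), the data appear to be split roughly 80/20; a minimal sketch, assuming a stratified split with caret::createDataPartition (both the seed and the function choice are assumptions):

library(caret)

set.seed(123)  # the original seed is not shown; this value is an assumption

# stratified 80/20 split on the target variable
trainIndex   <- createDataPartition(creditData.dt$Default, p = 0.8, list = FALSE)[, 1]
trainData.dt <- creditData.dt[trainIndex]
testData.dt  <- creditData.dt[-trainIndex]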

Training the Logistic Regression Model
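
The training call is not echoed. The cut-off function further below calls predict(LRModel, testData.dt, type = "prob"), which is the caret interface, so the following is a minimal sketch assuming caret::train with method = "glm" and a formula that, judging from the coefficient table, excludes Id and Age:

# binomial GLM fitted through caret so that predict(..., type = "prob") is available
LRModel <- train(Default ~ CreditLimit + Male + Education + MaritalStatus +
                   BillOutstanding + LastPayment,
                 data   = trainData.dt,
                 method = "glm",
                 family = "binomial")

# coefficient summary of the underlying glm fit
summary(LRModel)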

Summary of the Trained Model

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.5635   0.3625   0.6510   0.7766   0.9881  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      6.795e-01  4.847e-02  14.019  < 2e-16 ***
## CreditLimit      3.161e-06  1.603e-07  19.719  < 2e-16 ***
## Male1           -1.606e-01  3.228e-02  -4.977 6.46e-07 ***
## Education2      -1.539e-02  3.767e-02  -0.409  0.68289    
## Education3       4.191e-03  4.948e-02   0.085  0.93250    
## Education4       1.473e+00  4.618e-01   3.190  0.00142 ** 
## MaritalStatus2   2.335e-01  3.302e-02   7.071 1.54e-12 ***
## MaritalStatus3   1.342e-01  1.479e-01   0.907  0.36449    
## BillOutstanding -1.930e-06  2.596e-07  -7.434 1.05e-13 ***
## LastPayment      2.993e-05  3.112e-06   9.615  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25142  on 23680  degrees of freedom
## Residual deviance: 24262  on 23671  degrees of freedom
## AIC: 24282
## 
## Number of Fisher Scoring iterations: 6

Predicting Test Set Results with the Regression Model

Range of the Predicted Probabilities

## [1] "0.0000276088" "0.3716211045"

Confusion Matrix at a Cut-off Probability of 0.20

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Yes   No
##        Yes 1053 2806
##        No   268 1793
##                                           
##                Accuracy : 0.4807          
##                  95% CI : (0.4679, 0.4936)
##     No Information Rate : 0.7769          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.111           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7971          
##             Specificity : 0.3899          
##          Pos Pred Value : 0.2729          
##          Neg Pred Value : 0.8700          
##              Prevalence : 0.2231          
##          Detection Rate : 0.1779          
##    Detection Prevalence : 0.6519          
##       Balanced Accuracy : 0.5935          
##                                           
##        'Positive' Class : Yes             
## 
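
The matrix above corresponds to a single threshold; a minimal sketch, assuming Pred.LR from the prediction step (the CmFn() function in the next section generalises this to a whole sequence of cut-offs):

# classify as "Yes" when the predicted probability of default exceeds 0.20
predY <- factor(ifelse(Pred.LR$Yes > 0.20, "Yes", "No"), levels = c("Yes", "No"))

# confusion matrix with "Yes" as the positive class
confusionMatrix(data = predY, reference = testData.dt$Default, positive = "Yes")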

Calculating Accuracy, Sensitivity, and Specificity at Different Cut-off Levels

# function to compute confusion-matrix statistics for different cut-off levels of probability
CmFn <- function(cutoff) {

  # predict class probabilities for the test set
  Pred.LR <- predict(LRModel, testData.dt, type = "prob")

  # classify as "Yes" when the predicted probability of default exceeds the cut-off
  predY   <- ordered(ifelse(Pred.LR$Yes > cutoff, "Yes", "No"), levels = c("Yes", "No"))
  actualY <- testData.dt$Default

  # confusionMatrix() from the caret package, with "Yes" as the positive class
  cm1 <- confusionMatrix(data = predY, reference = actualY, positive = "Yes")

  # extract accuracy, sensitivity, specificity and kappa
  Accuracy    <- cm1$overall[1]
  Sensitivity <- cm1$byClass[1]
  Specificity <- cm1$byClass[2]
  Kappa       <- cm1$overall[2]

  # combine into a one-row table
  cbind(Accuracy, Sensitivity, Specificity, Kappa)
}

# sequence of cut-off probabilities
cutoff1 <- seq(0.01, 0.40, by = 0.03)

# apply CmFn() to every cut-off and bind the results into one table
tab2 <- lapply(cutoff1, CmFn)
pm   <- data.frame(cutoff = cutoff1, do.call(rbind, tab2), row.names = NULL)
pm
##    cutoff  Accuracy Sensitivity Specificity       Kappa
## 1    0.01 0.2271959 0.997728993 0.005870841 0.001612372
## 2    0.04 0.2344595 0.993943982 0.016307893 0.004621428
## 3    0.07 0.2486486 0.984859955 0.037181996 0.010068503
## 4    0.10 0.2800676 0.970476911 0.081756904 0.024537119
## 5    0.13 0.3170608 0.947009841 0.136116547 0.040491021
## 6    0.16 0.3751689 0.894776684 0.225918678 0.062766589
## 7    0.19 0.4496622 0.820590462 0.343118069 0.093490427
## 8    0.22 0.5334459 0.718395155 0.480321809 0.128667716
## 9    0.25 0.6125000 0.576835731 0.622744075 0.151510926
## 10   0.28 0.6964527 0.394398183 0.783213742 0.168648746
## 11   0.31 0.7532095 0.199848600 0.912154816 0.135954120
## 12   0.34 0.7706081 0.062074186 0.974124810 0.051872595
## 13   0.37 0.7768581 0.001514005 0.999565123 0.001673851
## 14   0.40 0.7768581 0.000000000 1.000000000 0.000000000
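
The table shows the usual trade-off: sensitivity falls and specificity rises as the cut-off increases, accuracy climbs toward the no-information rate of 0.7769, and Kappa peaks around a cut-off of 0.28. A small sketch of how the Kappa-maximising row could be pulled from the pm table above:

# cut-off (and full row) with the highest Kappa
pm$cutoff[which.max(pm$Kappa)]
pm[which.max(pm$Kappa), ]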