library(randomForest)
library(caret)
library(tidyr)

Reading the data

data1 <- read.table(file = "C://Users/cs_mo/Downloads/ISYE7406/ProjectCreditCard/creditcards.csv", header = TRUE, sep = ",", skip = 1)
names(data1)[25] <- 'default'
head(data1)
##   ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1  1     20000   2         2        1  24     2     2    -1    -1    -2    -2
## 2  2    120000   2         2        2  26    -1     2     0     0     0     2
## 3  3     90000   2         2        2  34     0     0     0     0     0     0
## 4  4     50000   2         2        1  37     0     0     0     0     0     0
## 5  5     50000   1         2        1  57    -1     0    -1     0     0     0
## 6  6     50000   1         1        2  37     0     0     0     0     0     0
##   BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1      3913      3102       689         0         0         0        0      689
## 2      2682      1725      2682      3272      3455      3261        0     1000
## 3     29239     14027     13559     14331     14948     15549     1518     1500
## 4     46990     48233     49291     28314     28959     29547     2000     2019
## 5      8617      5670     35835     20940     19146     19131     2000    36681
## 6     64400     57069     57608     19394     19619     20024     2500     1815
##   PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default
## 1        0        0        0        0       1
## 2     1000     1000        0     2000       1
## 3     1000     1000     1000     5000       0
## 4     1200     1100     1069     1000       0
## 5    10000     9000      689      679       0
## 6      657     1000     1000      800       0

Removing the 167 LIMIT_BAL outliers identified in the data-exploration part.

out <- boxplot.stats(data1$LIMIT_BAL)$out
out_ind <- which(data1$LIMIT_BAL %in% out)
mydata1 <- data1[-out_ind, ]
dim(mydata1)
## [1] 29833    25

Cleaning up the MARRIAGE and EDUCATION features and recoding the response

# Fold the undocumented MARRIAGE level 0 into level 3 ("others")
mydata1$MARRIAGE[mydata1$MARRIAGE == "0"] <- "3"
# Fold the undocumented EDUCATION levels 0, 5 and 6 into level 4 ("others")
mydata1$EDUCATION[mydata1$EDUCATION == "6"] <- "4"
mydata1$EDUCATION[mydata1$EDUCATION == "5"] <- "4"
mydata1$EDUCATION[mydata1$EDUCATION == "0"] <- "4"
# Recode the response: 0 = no default ("ND"), 1 = default ("DEF")
mydata1$default[mydata1$default == "0"] <- "ND"
mydata1$default[mydata1$default == "1"] <- "DEF"

Removing the ID column…

mydata <- mydata1[,2:25]
head(mydata)
##   LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1     20000   2         2        1  24     2     2    -1    -1    -2    -2
## 2    120000   2         2        2  26    -1     2     0     0     0     2
## 3     90000   2         2        2  34     0     0     0     0     0     0
## 4     50000   2         2        1  37     0     0     0     0     0     0
## 5     50000   1         2        1  57    -1     0    -1     0     0     0
## 6     50000   1         1        2  37     0     0     0     0     0     0
##   BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1      3913      3102       689         0         0         0        0      689
## 2      2682      1725      2682      3272      3455      3261        0     1000
## 3     29239     14027     13559     14331     14948     15549     1518     1500
## 4     46990     48233     49291     28314     28959     29547     2000     2019
## 5      8617      5670     35835     20940     19146     19131     2000    36681
## 6     64400     57069     57608     19394     19619     20024     2500     1815
##   PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default
## 1        0        0        0        0     DEF
## 2     1000     1000        0     2000     DEF
## 3     1000     1000     1000     5000      ND
## 4     1200     1100     1069     1000      ND
## 5    10000     9000      689      679      ND
## 6      657     1000     1000      800      ND
dim(mydata)
## [1] 29833    24

Splitting the data: 85% training and 15% testing

set.seed(7406)
flag <- sort(sample(1:29833, 4475))
data_train  <- mydata[-flag,]
data_test <- mydata[flag,]
dim(data_train)
## [1] 25358    24
dim(data_test)
## [1] 4475   24
head(data_train)
##   LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1     20000   2         2        1  24     2     2    -1    -1    -2    -2
## 2    120000   2         2        2  26    -1     2     0     0     0     2
## 3     90000   2         2        2  34     0     0     0     0     0     0
## 4     50000   2         2        1  37     0     0     0     0     0     0
## 5     50000   1         2        1  57    -1     0    -1     0     0     0
## 7    500000   1         1        2  29     0     0     0     0     0     0
##   BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1      3913      3102       689         0         0         0        0      689
## 2      2682      1725      2682      3272      3455      3261        0     1000
## 3     29239     14027     13559     14331     14948     15549     1518     1500
## 4     46990     48233     49291     28314     28959     29547     2000     2019
## 5      8617      5670     35835     20940     19146     19131     2000    36681
## 7    367965    412023    445007    542653    483003    473944    55000    40000
##   PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default
## 1        0        0        0        0     DEF
## 2     1000     1000        0     2000     DEF
## 3     1000     1000     1000     5000      ND
## 4     1200     1100     1069     1000      ND
## 5    10000     9000      689      679      ND
## 7    38000    20239    13750    13770      ND
table(mydata$default)
## 
##   DEF    ND 
##  6617 23216
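
As a quick check on the split, we can verify that both partitions keep roughly the same ~22% default rate seen above (a short sketch using prop.table):

# Proportion of defaulters in each split -- both should be close to ~22%
prop.table(table(data_train$default))
prop.table(table(data_test$default))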

Finding the best mtry value with tuneRF: 4

set.seed(7406)
#bestmtry <- tuneRF(data_train[,1:23],as.factor(data_train[,24]), stepFactor=1.5, improve=1e-5, ntree=500)
#print(bestmtry)

The random forest showed no improvement in accuracy after removing the outliers.

set.seed(7406)
rf1 <- randomForest(as.factor(default) ~ ., data = data_train, ntree = 500, replace = TRUE, mtry = 4)
print(rf1)
## 
## Call:
##  randomForest(formula = as.factor(default) ~ ., data = data_train,      ntree = 500, replace = TRUE, mtry = 4) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 18.31%
## Confusion matrix:
##      DEF    ND class.error
## DEF 2084  3530   0.6287852
## ND  1114 18630   0.0564222

Variable importance plot. SEX, MARRIAGE and EDUCATION appear to be the least important variables.

varImpPlot(rf1,type =2)
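
The same ranking can be read off numerically from the fitted forest; a short sketch using importance(), which returns the mean decrease in Gini impurity for each variable:

# Mean decrease in Gini impurity, sorted from most to least important
imp <- importance(rf1, type = 2)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]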

After roughly 100 trees there is no further improvement in the OOB error rate.

plot(rf1)
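
We can confirm this from the per-tree OOB error stored in the fitted object (a short sketch; rf1$err.rate has one row per tree, with columns OOB, DEF and ND):

# OOB error after 100, 250 and 500 trees -- it barely changes past ~100 trees
rf1$err.rate[c(100, 250, 500), "OOB"]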

Predicting with type = "response", the model's accuracy is about 82%, but its sensitivity is only 68% and its balanced accuracy is 76%. We need to improve the sensitivity, since misclassifying a defaulter is likely costlier than misclassifying a non-defaulter.

ptr <- predict(rf1, data_test[,1:23])
cfr <- confusionMatrix(as.factor(data_test[,24]),as.factor(ptr))
cfr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  DEF   ND
##        DEF  389  614
##        ND   183 3289
##                                          
##                Accuracy : 0.8219         
##                  95% CI : (0.8104, 0.833)
##     No Information Rate : 0.8722         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3956         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.68007        
##             Specificity : 0.84269        
##          Pos Pred Value : 0.38784        
##          Neg Pred Value : 0.94729        
##              Prevalence : 0.12782        
##          Detection Rate : 0.08693        
##    Detection Prevalence : 0.22413        
##       Balanced Accuracy : 0.76138        
##                                          
##        'Positive' Class : DEF            
## 

Using type = "prob" so that we can change the classification threshold and improve the model's sensitivity

# Column 1 ("DEF") holds the predicted probability of default
pt <- predict(rf1, data_test[,1:23], type = "prob")
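
Before fixing particular cut-offs, a small sweep over a few thresholds (a sketch, assuming column "DEF" of pt holds the predicted default probability as above) shows how sensitivity trades off against overall accuracy:

# Sweep a few thresholds and compare accuracy / sensitivity / balanced accuracy
for (th in c(0.50, 0.65, 0.75, 0.90)) {
  pred <- factor(ifelse(pt[, "DEF"] > th, "DEF", "ND"), levels = c("DEF", "ND"))
  cm   <- confusionMatrix(as.factor(data_test[, 24]), pred)
  cat(sprintf("threshold %.2f: accuracy %.3f, sensitivity %.3f, balanced accuracy %.3f\n",
              th, cm$overall["Accuracy"], cm$byClass["Sensitivity"],
              cm$byClass["Balanced Accuracy"]))
}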

Confusion matrix with a threshold of 0.65. This improves the model's sensitivity and balanced accuracy, but it reduces the overall accuracy.

tr_0.65 <- ifelse(pt[,1]>0.65,"DEF","ND")
table(tr_0.65)
## tr_0.65
##  DEF   ND 
##  283 4192
cf1 <- confusionMatrix(as.factor(data_test[,24]),as.factor(tr_0.65))
cf1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  DEF   ND
##        DEF  219  784
##        ND    64 3408
##                                           
##                Accuracy : 0.8105          
##                  95% CI : (0.7987, 0.8219)
##     No Information Rate : 0.9368          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2684          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.77385         
##             Specificity : 0.81298         
##          Pos Pred Value : 0.21834         
##          Neg Pred Value : 0.98157         
##              Prevalence : 0.06324         
##          Detection Rate : 0.04894         
##    Detection Prevalence : 0.22413         
##       Balanced Accuracy : 0.79341         
##                                           
##        'Positive' Class : DEF             
## 

Confusion matrix with a threshold of 0.75

tr_0.75 <- ifelse(pt[,1]>0.75, "DEF","ND")
cf2 <- confusionMatrix(as.factor(data_test[,24]),as.factor(tr_0.75))
cf2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  DEF   ND
##        DEF  109  894
##        ND    27 3445
##                                         
##                Accuracy : 0.7942        
##                  95% CI : (0.782, 0.806)
##     No Information Rate : 0.9696        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.1457        
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.80147       
##             Specificity : 0.79396       
##          Pos Pred Value : 0.10867       
##          Neg Pred Value : 0.99222       
##              Prevalence : 0.03039       
##          Detection Rate : 0.02436       
##    Detection Prevalence : 0.22413       
##       Balanced Accuracy : 0.79772       
##                                         
##        'Positive' Class : DEF           
## 

Confusion matrix with a threshold of 0.90. It gives the highest balanced accuracy of the thresholds tried.

tr_0.90 <- ifelse(pt[,1]>0.90,"DEF","ND")
cf3 <- confusionMatrix(as.factor(data_test[,24]),as.factor(tr_0.90))
cf3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  DEF   ND
##        DEF   25  978
##        ND     2 3470
##                                          
##                Accuracy : 0.781          
##                  95% CI : (0.7686, 0.793)
##     No Information Rate : 0.994          
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0372         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.925926       
##             Specificity : 0.780126       
##          Pos Pred Value : 0.024925       
##          Neg Pred Value : 0.999424       
##              Prevalence : 0.006034       
##          Detection Rate : 0.005587       
##    Detection Prevalence : 0.224134       
##       Balanced Accuracy : 0.853026       
##                                          
##        'Positive' Class : DEF            
## 
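
The four runs can be summarised side by side from the stored confusionMatrix objects (a short sketch; the 0.50 row corresponds to the default "response" prediction):

# Collect accuracy, sensitivity and balanced accuracy for each cut-off tried above
cfs <- list("0.50 (response)" = cfr, "0.65" = cf1, "0.75" = cf2, "0.90" = cf3)
data.frame(threshold   = names(cfs),
           accuracy    = sapply(cfs, function(x) x$overall["Accuracy"]),
           sensitivity = sapply(cfs, function(x) x$byClass["Sensitivity"]),
           bal_acc     = sapply(cfs, function(x) x$byClass["Balanced Accuracy"]),
           row.names   = NULL)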

We can also build the ROC curve and compare sensitivity and specificity across threshold values.

library(pROC)
par(pty ="s")
roc(data_test[,24], pt[,1], plot = TRUE, legacy.axes = T, percent = TRUE,
    print.auc =TRUE,
    
    #auc.polygon = TRUE,
    xlab= "1-Specificity",
    ylab= "Sencitivity"
    #xlab ="False Positive Percentage",
    #ylab =" True positive Percentage"
    )

## 
## Call:
## roc.default(response = data_test[, 24], predictor = pt[, 1],     percent = TRUE, plot = TRUE, legacy.axes = T, print.auc = TRUE,     xlab = "1-Specificity", ylab = "Sensitivity")
## 
## Data: pt[, 1] in 1003 controls (data_test[, 24] DEF) > 3472 cases (data_test[, 24] ND).
## Area under the curve: 77.62%
roc.info <- roc(data_test[,24], pt[,1], plot = FALSE, legacy.axes = TRUE)
auc(roc.info)
## Area under the curve: 0.7762
roc.df <- data.frame(sensitivity = roc.info$sensitivities*100,
                     specificity =(roc.info$specificities)*100,
                     thresholds = roc.info$thresholds)
roc.df$Balance <- ((roc.df$sensitivity + roc.df$specificity)/2)
head(roc.df)
##   sensitivity specificity thresholds  Balance
## 1    100.0000   0.0000000        Inf 50.00000
## 2    100.0000   0.0997009      0.997 50.04985
## 3    100.0000   0.4985045      0.994 50.24925
## 4     99.9712   0.4985045      0.989 50.23485
## 5     99.9712   0.5982054      0.984 50.28470
## 6     99.9712   0.6979063      0.976 50.33455

Sorting by balanced accuracy and printing the top records.

df <- roc.df[with(roc.df,order(-Balance)),]
head(df)
##     sensitivity specificity thresholds  Balance
## 300    83.92857    57.92622      0.289 70.92740
## 301    83.72696    58.12562      0.287 70.92629
## 304    83.20853    58.62413      0.281 70.91633
## 303    83.35253    58.42473      0.283 70.88863
## 311    80.90438    60.81755      0.267 70.86096
## 302    83.49654    58.22532      0.285 70.86093
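
pROC can find this operating point directly: coords() with best.method = "youden" maximises sensitivity + specificity, which is the same as maximising the Balance column above (a minimal sketch using the roc.info object already computed):

# Threshold that maximises Youden's J = sensitivity + specificity - 1
coords(roc.info, "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))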