Classify Credit Card Defaults

Sameer Mathur

Comparing Classifiers (Logit, Decision Tree and Random Forest)

Using Default Data from ISLR Package

---

IMPORTING DATA

Reading Data

library(ISLR)
# read the built-in Default data as a data frame
default.df <- as.data.frame(Default)
# attach the data frame so columns can be referenced by name
attach(default.df)
# dimensions of the data frame
dim(default.df)
[1] 10000     4

Data Structure

# structure of the data frame
str(default.df)
'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...

Descriptive Statistics

# descriptive statistics of the data frame (n, mean, sd, median)
library(psych)
describe(default.df)[, c(1:5)]
         vars     n     mean       sd   median
default*    1 10000     1.03     0.18     1.00
student*    2 10000     1.29     0.46     1.00
balance     3 10000   835.37   483.71   823.64
income      4 10000 33516.98 13336.64 34552.64
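The outcome is heavily imbalanced: only 333 of the 10,000 customers default (about 3.3%). A quick check in base R:

# proportion of defaulters vs. non-defaulters
prop.table(table(default.df$default))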

SPLITTING DATA

(TRAINING AND TESTING SETS)

Training (80%) and Testing (20%) Data

library(caret)
# stratified data partition on the default outcome
set.seed(2341)
trainIndex <- createDataPartition(default.df$default, p = 0.80, list = FALSE)
# 80% training data
trainData.df <- default.df[trainIndex, ]
table(trainData.df$default)

  No  Yes 
7734  267 
# 20% testing data
testData.df <- default.df[-trainIndex, ]
table(testData.df$default)

  No  Yes 
1933   66 
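Because createDataPartition samples within each level of the outcome, the default rate should be almost identical in the two subsets; a quick sanity check:

# class proportions should be preserved in both subsets
prop.table(table(trainData.df$default))
prop.table(table(testData.df$default))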

BUILDING MODELS

(LOGIT, DECISION TREE AND RANDOM FOREST)

Control Parameters

# control parameters
objControl <- trainControl(method = "boot", 
                           number = 2, 
                           returnResamp = 'none', 
                           summaryFunction = twoClassSummary, 
                           classProbs = TRUE,
                           savePredictions = TRUE)

Building Logit Model using caret Package

# stepwise logistic regression (glmStepAIC) via the caret package
set.seed(766)
caretLogitModel <- train(trainData.df[, 2:4],
                         trainData.df[, 1],
                         method = 'glmStepAIC',
                         trControl = objControl,
                         metric = "ROC",
                         verbose = FALSE)
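The stepwise procedure may drop predictors; the retained coefficients can be inspected through the fitted glm stored in the train object's finalModel slot (a minimal sketch):

# coefficients of the final stepwise-selected logit model
summary(caretLogitModel$finalModel)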

Building Tree Model using caret Package (Information Gain)

# decision tree (rpart) with the information-gain splitting criterion
dTreeInfoGain <- train(default ~ ., 
                       data = trainData.df, 
                       method = "rpart", 
                       parms = list(split = "information"), 
                       trControl = trainControl(method = "cv"))
dTreeInfoGain
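The final pruned tree can be visualized; one option is the rpart.plot package (an illustrative sketch, assuming rpart.plot is installed):

# plot the final rpart tree
library(rpart.plot)
rpart.plot(dTreeInfoGain$finalModel)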

Building Random Forest Tree Model using caret Package

# setting parameters
library(caret)
ctrl <- trainControl(method = "repeatedcv", 
                     number = 2, 
                     repeats = 2, 
                     selectionFunction = "oneSE")

set.seed(766)
# fitting the random forest model on the training data
modelRF <- train(default ~ ., 
                 data = trainData.df, 
                 method = "rf", 
                 metric = "Kappa",
                 trControl = ctrl)
modelRF
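caret's varImp() shows which predictors drive the forest's classifications (a minimal sketch):

# variable importance from the fitted random forest
rfImp <- varImp(modelRF)
plot(rfImp, main = "Variable Importance (Random Forest)")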

PREDICTED PROBABILITIES PLOTS

(LOGIT, DECISION TREE AND RANDOM FOREST)

Predicted Probabilities of Default (Yes / No), based on Test Data (Logit)

# predicted probabilities
predTestProb <- predict(caretLogitModel, testData.df, type = "prob")
# plot of probabilities
plot(predTestProb[,2], 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

[Figure: scatterplot of predicted probabilities of default, test data (logit)]
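These probabilities are converted to class labels later with type = 'raw', which applies a 0.5 cutoff. Given the class imbalance, a lower cutoff would trade specificity for sensitivity; an illustrative sketch (the 0.3 cutoff is an arbitrary choice, not part of the original analysis):

# classify at an illustrative 0.3 cutoff instead of the default 0.5
predClassCustom <- factor(ifelse(predTestProb[, 2] > 0.3, "Yes", "No"),
                          levels = c("No", "Yes"))
table(predClassCustom)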

Predicted Probabilities of Default (Yes / No), based on Test Data (Tree)

# predicted probabilities
predTestProbInfoGain <- predict(dTreeInfoGain, testData.df, type = "prob")
# plot of probabilities
plot(predTestProbInfoGain[,2], 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

[Figure: scatterplot of predicted probabilities of default, test data (decision tree)]

Predicted Probabilities of Default (Yes / No), based on Test Data (Random Forest)

# predicted probabilities
predTestProbRF <- predict(modelRF, testData.df, type = "prob")
# plot of probabilities
plot(predTestProbRF[,2], 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

[Figure: scatterplot of predicted probabilities of default, test data (random forest)]

PREDICTION AND CONFUSION MATRIX

(LOGIT, DECISION TREE AND RANDOM FOREST)

Prediction based on Test Data (Logit)

# prediction of default = {no, yes} on test data (logit model)
predClass <- predict(caretLogitModel, testData.df[, 2:4], type = 'raw')
table(predClass)
predClass
  No  Yes 
1976   23 

Prediction based on Test Data (Tree)

# prediction of default = {no, yes} on test data (decision tree)
predClassInfoGain <- predict(dTreeInfoGain, testData.df[, 2:4], type = 'raw')
table(predClassInfoGain)
predClassInfoGain
  No  Yes 
1961   38 

Prediction based on Test Data (Random Forest)

# prediction of default = {no, yes} on test data (random forest)
predClassRF <- predict(modelRF, testData.df[, 2:4], type = 'raw')
table(predClassRF)
predClassRF
  No  Yes 
1961   38 

Confusion Matrix based on Test Data (Logit)

# confusion matrix (logit)
confusionMatrix(predClass, testData.df$default, positive = "Yes")

Confusion Matrix based on Test Data (Tree)

# confusion matrix (decision tree)
confusionMatrix(predClassInfoGain, testData.df$default, positive = "Yes")

Confusion Matrix based on Test Data (Random Forest)

# confusion matrix (random forest)
confusionMatrix(predClassRF, testData.df$default, positive = "Yes")

Logit Model

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1925   51
       Yes    8   15

               Accuracy : 0.9705          
                 95% CI : (0.9621, 0.9775)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 0.2098          

                  Kappa : 0.3256          
 Mcnemar's Test P-Value : 4.553e-08       

            Sensitivity : 0.227273        
            Specificity : 0.995861        
         Pos Pred Value : 0.652174        
         Neg Pred Value : 0.974190        
             Prevalence : 0.033017        
         Detection Rate : 0.007504        
   Detection Prevalence : 0.011506        
      Balanced Accuracy : 0.611567        

       'Positive' Class : Yes             

Decision Tree

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1914   47
       Yes   19   19

               Accuracy : 0.967           
                 95% CI : (0.9582, 0.9744)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 0.532679        

                  Kappa : 0.3497          
 Mcnemar's Test P-Value : 0.000889        

            Sensitivity : 0.287879        
            Specificity : 0.990171        
         Pos Pred Value : 0.500000        
         Neg Pred Value : 0.976033        
             Prevalence : 0.033017        
         Detection Rate : 0.009505        
   Detection Prevalence : 0.019010        
      Balanced Accuracy : 0.639025        

       'Positive' Class : Yes             

Random Forest

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1911   50
       Yes   22   16

               Accuracy : 0.964           
                 95% CI : (0.9549, 0.9717)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 0.794016        

                  Kappa : 0.2906          
 Mcnemar's Test P-Value : 0.001463        

            Sensitivity : 0.242424        
            Specificity : 0.988619        
         Pos Pred Value : 0.421053        
         Neg Pred Value : 0.974503        
             Prevalence : 0.033017        
         Detection Rate : 0.008004        
   Detection Prevalence : 0.019010        
      Balanced Accuracy : 0.615521        

       'Positive' Class : Yes             
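All three models achieve high accuracy but modest sensitivity, which reflects the class imbalance: a classifier that always predicts No would already be about 96.7% accurate (the No Information Rate). For imbalance-aware metrics such as precision, recall, and F1, confusionMatrix accepts a mode argument; a sketch for the logit predictions:

# precision / recall / F1 for the logit predictions
confusionMatrix(predClass, testData.df$default,
                positive = "Yes", mode = "prec_recall")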

ROC CURVES AND AUC

(LOGIT, DECISION TREE AND RANDOM FOREST)

ROC Plot on the Test data (Logit)

library(ROCR)
# prediction
PredictObjectLogit <- prediction(predTestProb[2], testData.df$default)

# performance
PerformObjectLogit <- performance(PredictObjectLogit, "tpr","fpr")

# Plot the ROC Curve for Credit Card Default
plot(PerformObjectLogit, 
     main = "ROC Curve for CC Default (Logit)",
     col = "red",
     lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

[Figure: ROC curve for credit card default, test data (logit)]

ROC Plot on the Test data (Tree)

library(ROCR)
# prediction
PredictObjectTree <- prediction(predTestProbInfoGain[2], testData.df$default)

# performance
PerformObjectTree <- performance(PredictObjectTree, "tpr","fpr")

# Plot the ROC Curve for Credit Card Default
plot(PerformObjectTree, 
     main = "ROC Curve for CC Default (Decision Tree)",
     col = "blue",
     lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

[Figure: ROC curve for credit card default, test data (decision tree)]

ROC Plot on the Test data (Random Forest)

library(ROCR)
# every ROCR evaluation starts by creating a prediction object,
# which transforms the input predictions and labels into a standardized format
PredictObjectRF <- prediction(predTestProbRF[2], testData.df$default)

# All kinds of predictor evaluations are performed using the performance function
PerformObjectRF <- performance(PredictObjectRF, "tpr","fpr")

# Plot the ROC Curve for Credit Card Default
plot(PerformObjectRF, 
     main = "ROC Curve for CC Default (Random Forest)",
     col = "green",
     lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

[Figure: ROC curve for credit card default, test data (random forest)]

Comparing ROC Curves of Classifiers (Logit, Tree and Random Forest)

# ROC curves (logit, tree and random forest)
plot(PerformObjectLogit, col = "red", lwd = 2,
     main = "ROC Curve for CC Default \n (Logit, Decision Tree and Random Forest)")
plot(PerformObjectTree, add = TRUE, col = "blue", lwd = 3)
plot(PerformObjectRF, add = TRUE, col = "green", lwd = 3)

[Figure: overlaid ROC curves for logit, decision tree and random forest]

Area Under the Curve (Logit)

# auc for logit model
aucLogit <- performance(PredictObjectLogit, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit
[1] 0.9451316

Area Under the Curve (Tree)

# auc for decision tree
aucTree <- performance(PredictObjectTree, measure = "auc")
aucTree <- aucTree@y.values[[1]]
aucTree
[1] 0.8503817

Area Under the Curve (Random Forest)

# auc for random forest
aucRF <- performance(PredictObjectRF, measure = "auc")
aucRF <- aucRF@y.values[[1]]
aucRF
[1] 0.8849371
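Collecting the three AUC values side by side makes the ranking explicit (a minimal sketch using the values computed above):

# side-by-side AUC comparison
data.frame(Model = c("Logit", "Decision Tree", "Random Forest"),
           AUC   = round(c(aucLogit, aucTree, aucRF), 4))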