Classify Credit Card Defaults

Sameer Mathur

Decision Tree

(Gini Index, Information Gain)

Using the Default Data from the ISLR Package

---

READING AND EXPLORING DATA

Importing Data

library(ISLR)
# load the built-in Default data as a data frame
default.df <- as.data.frame(Default)
# attach data frame
attach(default.df)
# dimension of the data frame
dim(default.df)
[1] 10000     4

First Few Rows of the Data Frame

# first few rows of the dataframe
head(default.df)
  default student   balance    income
1      No      No  729.5265 44361.625
2      No     Yes  817.1804 12106.135
3      No      No 1073.5492 31767.139
4      No      No  529.2506 35704.494
5      No      No  785.6559 38463.496
6      No     Yes  919.5885  7491.559

Data Types of the Data Columns

# data types of the data columns
str(default.df)
'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...

Descriptive Statistics

# descriptive statistics
library(psych)
describe(default.df)[, c(1:5, 8:9)]
         vars     n     mean       sd   median    min      max
default*    1 10000     1.03     0.18     1.00   1.00     2.00
student*    2 10000     1.29     0.46     1.00   1.00     2.00
balance     3 10000   835.37   483.71   823.64   0.00  2654.32
income      4 10000 33516.98 13336.64 34552.64 771.97 73554.23
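
Note the severe class imbalance: only 333 of the 10,000 customers default. Keeping this base rate in mind helps when judging the accuracy figures later on. A quick check (a minimal base-R sketch):

# proportion of defaulters vs. non-defaulters (~3.3% default)
prop.table(table(default.df$default))

    No    Yes 
0.9667 0.0333 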

SPLITTING DATA

(TRAINING AND TESTING SETS)

Training (80%) and Testing (20%) Data

library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(default.df$default, p = 0.80, list = FALSE)
# 80% training data
trainData.df <- default.df[trainIndex, ]
table(trainData.df$default)

  No  Yes 
7734  267 
# 20% testing data
testData.df <- default.df[-trainIndex, ]
table(testData.df$default)

  No  Yes 
1933   66 
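
createDataPartition() samples within each level of the outcome, so the roughly 3.3% default rate seen above is preserved in both splits (267/8001 in training, 66/1999 in testing). A quick sanity check (sketch):

# default rate within each split; both should sit near the overall 3.3%
prop.table(table(trainData.df$default))
prop.table(table(testData.df$default))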

DECISION TREE

(GINI INDEX AND INFORMATION GAIN)
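
Both criteria score candidate splits by the impurity of the resulting nodes. For class proportions p_i, the Gini index is 1 - sum(p_i^2), and the entropy used for information gain is -sum(p_i * log2(p_i)); information gain is the reduction in entropy from parent to child nodes. A minimal sketch of both measures (the gini() and entropy() helpers below are illustrative only, not part of rpart):

# illustrative impurity measures for a vector of class labels
gini <- function(y) {
  p <- prop.table(table(y))
  1 - sum(p^2)
}
entropy <- function(y) {
  p <- prop.table(table(y))
  -sum(p * log2(p))
}
# impurity of the training outcome before any split
gini(trainData.df$default)     # ~0.065 given a 3.3% default rate
entropy(trainData.df$default)  # ~0.21 bits

rpart computes these impurities internally when it evaluates candidate splits; the split argument passed to train() below merely chooses which measure it uses.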

Fitting a Decision Tree Classifier on the Training Data with the Gini Index Criterion

# gini index
set.seed(4321)
dTreeGiniIndex <- train(default ~ ., 
                        data = trainData.df, 
                        method = "rpart", 
                        parms = list(split = "gini"))
dTreeGiniIndex
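
Because no trControl is supplied here, train() falls back on its default of 25 bootstrap resamples to tune the rpart complexity parameter cp. The tuning table and the selected value can be inspected with the standard caret accessors (a short sketch):

# resampled accuracy for each candidate cp, and the selected value
dTreeGiniIndex$results
dTreeGiniIndex$bestTune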

Fitting a Decision Tree Classifier on the Training Data with the Information Gain Criterion

# information gain
set.seed(4534)
dTreeInfoGain <- train(default ~ ., 
                       data = trainData.df, 
                       method = "rpart", 
                       parms = list(split = "information"), 
                       trControl = trainControl(method = "cv"))
dTreeInfoGain
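
This model, unlike the Gini one above, is tuned with 10-fold cross-validation via trainControl(method = "cv"). For either fit, caret's varImp() extracts the relative importance of the predictors from the underlying rpart tree (a quick sketch; balance is typically the dominant variable on this data):

# relative importance of student, balance, and income
varImp(dTreeInfoGain)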

Visualizing the Tree Using the rpart.plot Package (Gini Index)

# visualization (gini index)
library(rpart.plot)
prp(dTreeGiniIndex$finalModel, box.palette = "Reds", tweak = 1.2)

[Figure: decision tree fitted with the Gini index]

Visualizing the Tree Using the rpart.plot Package (Information Gain)

# visualization (information gain)
library(rpart.plot)
prp(dTreeInfoGain$finalModel, box.palette = "Reds", tweak = 1.2)

[Figure: decision tree fitted with information gain]

Predicting Test Data Results with the Gini Index Criterion

# prediction of default = {no, yes} on test data (gini Index)
predClassGiniIndex <- predict(dTreeGiniIndex, testData.df[, 2:4], type = 'raw')
table(predClassGiniIndex)
predClassGiniIndex
  No  Yes 
1983   16 

Predicting Test Data Results with the Information Gain Criterion

# prediction of default = {no, yes} on test data (information gain)
predClassInfoGain <- predict(dTreeInfoGain, testData.df[, 2:4], type = 'raw')
table(predClassInfoGain)
predClassInfoGain
  No  Yes 
1961   38 

Confusion Matrix based on Test Data (Gini Index)

# confusion matrix (gini index)
confusionMatrix(predClassGiniIndex, testData.df$default, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1926   57
       Yes    7    9

               Accuracy : 0.968           
                 95% CI : (0.9593, 0.9753)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 0.433           

                  Kappa : 0.2093          
 Mcnemar's Test P-Value : 9.068e-10       

            Sensitivity : 0.136364        
            Specificity : 0.996379        
         Pos Pred Value : 0.562500        
         Neg Pred Value : 0.971256        
             Prevalence : 0.033017        
         Detection Rate : 0.004502        
   Detection Prevalence : 0.008004        
      Balanced Accuracy : 0.566371        

       'Positive' Class : Yes             
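
Accuracy (0.968) barely clears the No Information Rate (0.967) because a model that always predicts "No" would already be right 96.7% of the time; the class-specific rates tell the real story. They follow directly from the matrix above (a worked check):

# recompute the key rates from the confusion matrix (gini index)
TP <- 9; FN <- 57; TN <- 1926; FP <- 7
TP / (TP + FN)  # sensitivity: 9/66 = 0.1364
TN / (TN + FP)  # specificity: 1926/1933 = 0.9964
TP / (TP + FP)  # pos pred value: 9/16 = 0.5625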

Confusion Matrix based on Test Data (Information Gain)

# confusion matrix (information gain)
confusionMatrix(predClassInfoGain, testData.df$default, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1914   47
       Yes   19   19

               Accuracy : 0.967           
                 95% CI : (0.9582, 0.9744)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 0.532679        

                  Kappa : 0.3497          
 Mcnemar's Test P-Value : 0.000889        

            Sensitivity : 0.287879        
            Specificity : 0.990171        
         Pos Pred Value : 0.500000        
         Neg Pred Value : 0.976033        
             Prevalence : 0.033017        
         Detection Rate : 0.009505        
   Detection Prevalence : 0.019010        
      Balanced Accuracy : 0.639025        

       'Positive' Class : Yes             
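
Both trees classify at the default 0.5 probability cutoff, which is a poor fit for an outcome this rare; lowering the cutoff trades specificity for sensitivity. A hedged sketch of this adjustment (the 0.2 cutoff is an arbitrary illustrative choice, not part of the original analysis):

# predicted probabilities of default on the test data
probYes <- predict(dTreeInfoGain, testData.df, type = "prob")[, 2]
# reclassify at a lower cutoff and re-evaluate
predLowCutoff <- factor(ifelse(probYes > 0.2, "Yes", "No"), levels = c("No", "Yes"))
confusionMatrix(predLowCutoff, testData.df$default, positive = "Yes")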

Predicted Probabilities of Default (Yes/No), Based on Test Data (Gini Index)

# predicted probabilities (gini index)
predTestProbGiniIndex <- predict(dTreeGiniIndex, testData.df, type = "prob")
# plot of probabilities
plot(predTestProbGiniIndex[,2], 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

[Figure: scatterplot of predicted default probabilities on the test data (Gini index)]

Predicted Probabilities of Default (Yes/No), Based on Test Data (Information Gain)

# predicted probabilities (information gain)
predTestProbInfoGain <- predict(dTreeInfoGain, testData.df, type = "prob")
# plot of probabilities
plot(predTestProbInfoGain[,2], 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

[Figure: scatterplot of predicted default probabilities on the test data (information gain)]
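
The horizontal bands in these scatterplots are expected: a tree assigns every observation falling in the same leaf the same class frequency, so only a handful of distinct probabilities can occur. A quick check (sketch):

# one distinct predicted probability per leaf of the tree
table(round(predTestProbInfoGain[, 2], 4))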

ROC Plot on the Test data (Gini Index)

library(ROCR)
# prediction
PredictObjectGiniIndex <- prediction(predTestProbGiniIndex[, 2], testData.df$default)

# performance
PerformObjectGiniIndex <- performance(PredictObjectGiniIndex, "tpr","fpr")

# plot of the ROC curve for credit default
plot(PerformObjectGiniIndex, 
     main = "ROC Curve for CC Default (Gini Index)",
     col = "red",
     lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

[Figure: ROC curve for credit card default (Gini index)]

ROC Plot on the Test data (Information Gain)

library(ROCR)
# every ROCR evaluation starts by creating a prediction object, which standardizes the input data
PredictObjectInfoGain <- prediction(predTestProbInfoGain[, 2], testData.df$default)

# all predictor evaluations are performed with the performance() function
PerformObjectInfoGain <- performance(PredictObjectInfoGain, "tpr","fpr")

# Plot the ROC Curve for Credit Card Default
plot(PerformObjectInfoGain, 
     main = "ROC Curve for CC Default (Information Gain)",
     col = "blue",
     lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

[Figure: ROC curve for credit card default (information gain)]

Area Under the Curve (Gini Index)

# auc for the decision tree (gini index)
aucGiniIndex <- performance(PredictObjectGiniIndex, measure = "auc")
aucGiniIndex <- aucGiniIndex@y.values[[1]]
aucGiniIndex
[1] 0.6837934

Area Under the Curve (Information Gain)

# auc for the decision tree (information gain)
aucInfoGain <- performance(PredictObjectInfoGain, measure = "auc")
aucInfoGain <- aucInfoGain@y.values[[1]]
aucInfoGain
[1] 0.8503817
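
To compare the two criteria directly, both ROC curves can be overlaid on one set of axes by reusing the performance objects created above (a sketch; ROCR's plot method accepts add = TRUE):

# overlay both ROC curves for a side-by-side comparison
plot(PerformObjectGiniIndex, col = "red", lwd = 2,
     main = "ROC Curves for CC Default: Gini Index vs. Information Gain")
plot(PerformObjectInfoGain, col = "blue", lwd = 2, add = TRUE)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
legend("bottomright",
       legend = c("Gini Index (AUC = 0.68)", "Information Gain (AUC = 0.85)"),
       col = c("red", "blue"), lwd = 2)

The information-gain tree dominates here (AUC 0.85 vs. 0.68), consistent with its higher sensitivity and kappa on the test data.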