Classify Credit Card Defaults

Sameer Mathur

Using Default Data from ISLR Package

---

Importing Data

library(ISLR)
# Read the built-in Default data set (from ISLR) into a plain data frame.
default.df <- as.data.frame(Default)
# attach() is intentionally not used: every later step refers to columns
# through the data frame itself (default.df$..., data = ...), and attaching
# would only add masking-prone names to the search path.
# dimension of the data frame (rows, columns)
dim(default.df)
[1] 10000     4

Data Types of the Data Columns

# Compact overview of the data frame: one line per column with its type
# and the first few values.
str(object = default.df)
'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...

DATA PREPARATION

Creating Training and Testing Data

library(caTools)
# Fix the random number seed so the split below is reproducible.
set.seed(123)
# Logical flag per row; TRUE marks rows kept for training (ratio 0.75).
split <- sample.split(Y = default.df$default, SplitRatio = 0.75)
# Training data: the rows flagged TRUE.
trainData <- default.df[which(split), , drop = FALSE]
# Rows and columns of the training data.
dim(trainData)
[1] 7500    4
# Testing data: the remaining rows, i.e. those flagged FALSE by the split.
testData <- default.df[which(!split), , drop = FALSE]
# Rows and columns of the testing data.
dim(testData)
[1] 2500    4

Scaling Variables

# feature scaling
# Standardise the numeric predictors using the mean and standard deviation of
# the TRAINING rows only, and apply that same transformation to the test rows.
# Scaling the test set with its own statistics would put it on a different
# scale from the one the classifiers are fitted on.
# NOTE: results printed further down this document were recorded with the
# test set scaled by its own statistics; re-run the analysis to refresh them.
numericCols <- c("balance", "income")
trainScaled <- scale(trainData[numericCols])
testData[numericCols] <- scale(testData[numericCols],
                               center = attr(trainScaled, "scaled:center"),
                               scale = attr(trainScaled, "scaled:scale"))
trainData[numericCols] <- trainScaled

CLASSIFICATION USING BINOMIAL LOGISTIC MODEL

(Classification using Logistic Regression with glm())

Classifier 1

# Classifier 1: binomial logistic regression of default on balance alone,
# fitted on the training data.
logitClassifier1 <- glm(formula = default ~ balance,
                        family = binomial(),
                        data = trainData)
# Coefficient table, deviances and AIC for classifier 1.
summary(logitClassifier1)

Call:
glm(formula = default ~ balance, family = binomial(), data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2763  -0.1456  -0.0578  -0.0213   3.7020  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -6.0903     0.2134  -28.54   <2e-16 ***
balance       2.6841     0.1239   21.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1193.7  on 7498  degrees of freedom
AIC: 1197.7

Number of Fisher Scoring iterations: 8

Classifier 2

# Classifier 2: binomial logistic regression of default on student status
# and balance, fitted on the training data.
logitClassifier2 <- glm(formula = default ~ student + balance,
                        family = binomial(),
                        data = trainData)
# Coefficient table, deviances and AIC for classifier 2.
summary(logitClassifier2)

Call:
glm(formula = default ~ student + balance, family = binomial(), 
    data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4456  -0.1414  -0.0553  -0.0198   3.6847  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -5.9968     0.2164  -27.71  < 2e-16 ***
studentYes   -0.6307     0.1686   -3.74 0.000184 ***
balance       2.7878     0.1301   21.43  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1179.1  on 7497  degrees of freedom
AIC: 1185.1

Number of Fisher Scoring iterations: 8

Classifier 3

# Classifier 3: binomial logistic regression of default on balance, income
# and student status, fitted on the training data.
logitClassifier3 <- glm(formula = default ~ balance + income + student,
                        family = binomial(),
                        data = trainData)
# Coefficient table, deviances and AIC for classifier 3.
summary(logitClassifier3)

Call:
glm(formula = default ~ balance + income + student, family = binomial(), 
    data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4537  -0.1416  -0.0553  -0.0198   3.6893  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.01034    0.22584 -26.613   <2e-16 ***
balance      2.78772    0.13014  21.421   <2e-16 ***
income       0.02735    0.12831   0.213   0.8312    
studentYes  -0.58511    0.27241  -2.148   0.0317 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1179.1  on 7496  degrees of freedom
AIC: 1187.1

Number of Fisher Scoring iterations: 8

PREDICTION

Predicting Test Data results using Classifier 1

# Predicted probabilities from classifier 1 for the test rows
# (first column, the outcome, is dropped from the new data).
predProbClass1 <- predict(logitClassifier1, newdata = testData[, -1], type = "response")
# Label as "Yes" when the probability exceeds 0.5, otherwise "No".
yPred1 <- ifelse(predProbClass1 <= 0.5, "No", "Yes")
# Count of each predicted label.
table(yPred1)
yPred1
  No  Yes 
2464   36 

Predicting Test Data results using Classifier 2

# Predicted probabilities from classifier 2 for the test rows
# (first column, the outcome, is dropped from the new data).
predProbClass2 <- predict(logitClassifier2, newdata = testData[, -1], type = "response")
# Label as "Yes" when the probability exceeds 0.5, otherwise "No".
yPred2 <- ifelse(predProbClass2 <= 0.5, "No", "Yes")
# Count of each predicted label.
table(yPred2)
yPred2
  No  Yes 
2466   34 

Predicting Test Data results using Classifier 3

# Predicted probabilities from classifier 3 for the test rows
# (first column, the outcome, is dropped from the new data).
predProbClass3 <- predict(logitClassifier3, newdata = testData[, -1], type = "response")
# Label as "Yes" when the probability exceeds 0.5, otherwise "No".
yPred3 <- ifelse(predProbClass3 <= 0.5, "No", "Yes")
# Count of each predicted label.
table(yPred3)
yPred3
  No  Yes 
2466   34 

Akaike Information Criterion (AIC)

  • Measures how good the model is
  • The smaller the AIC, the better the model
# AIC of the three fitted classifiers, side by side: one row per model,
# showing its degrees of freedom (df) and its AIC.
AIC(logitClassifier1, logitClassifier2, logitClassifier3)
                 df      AIC
logitClassifier1  2 1197.732
logitClassifier2  3 1185.120
logitClassifier3  4 1187.074

CONFUSION MATRIX

Confusion Matrix using Classifier 1

# Cross-tabulate the actual test outcomes (rows) against the labels
# predicted by classifier 1 (columns).
confMatrix1 <- table(yActual = testData$default, yPred1)
confMatrix1
       yPred1
yActual   No  Yes
    No  2409    8
    Yes   55   28

Confusion Matrix using Classifier 2

# Cross-tabulate the actual test outcomes (rows) against the labels
# predicted by classifier 2 (columns).
confMatrix2 <- table(yActual = testData$default, yPred2)
confMatrix2
       yPred2
yActual   No  Yes
    No  2412    5
    Yes   54   29

Confusion Matrix using Classifier 3

# Cross-tabulate the actual test outcomes (rows) against the labels
# predicted by classifier 3 (columns).
confMatrix3 <- table(yActual = testData$default, yPred3)
confMatrix3
       yPred3
yActual   No  Yes
    No  2412    5
    Yes   54   29

METRICS USING MLmetrics

(ACCURACY, SENSITIVITY, SPECIFICITY)

Confusion Matrix using Classifier 1

# MLmetrics confusion matrix for classifier 1: true test labels against
# predicted labels.
library(MLmetrics)
ConfusionMatrix(y_true = testData$default, y_pred = yPred1)
      y_pred
y_true   No  Yes
   No  2409    8
   Yes   55   28

Confusion Matrix using Classifier 2

# MLmetrics confusion matrix for classifier 2: true test labels against
# predicted labels.
library(MLmetrics)
ConfusionMatrix(y_true = testData$default, y_pred = yPred2)
      y_pred
y_true   No  Yes
   No  2412    5
   Yes   54   29

Confusion Matrix using Classifier 3

# MLmetrics confusion matrix for classifier 3: true test labels against
# predicted labels.
library(MLmetrics)
ConfusionMatrix(y_true = testData$default, y_pred = yPred3)
      y_pred
y_true   No  Yes
   No  2412    5
   Yes   54   29

Accuracy using Classifier 1

# Share of test rows whose label classifier 1 predicted correctly.
library(MLmetrics)
Accuracy(y_true = testData[["default"]], y_pred = yPred1)
[1] 0.9748

Accuracy using Classifier 2

# Share of test rows whose label classifier 2 predicted correctly.
library(MLmetrics)
Accuracy(y_true = testData[["default"]], y_pred = yPred2)
[1] 0.9764

Accuracy using Classifier 3

# Share of test rows whose label classifier 3 predicted correctly.
library(MLmetrics)
Accuracy(y_true = testData[["default"]], y_pred = yPred3)
[1] 0.9764

Sensitivity using Classifier 1

# Sensitivity of classifier 1 on the test data, with "Yes" (default)
# treated as the positive class.
library(MLmetrics)
Sensitivity(positive = "Yes", y_pred = yPred1, y_true = testData[["default"]])
[1] 0.3373494

Sensitivity using Classifier 2

# Sensitivity of classifier 2 on the test data, with "Yes" (default)
# treated as the positive class.
library(MLmetrics)
Sensitivity(positive = "Yes", y_pred = yPred2, y_true = testData[["default"]])
[1] 0.3493976

Sensitivity using Classifier 3

# Sensitivity of classifier 3 on the test data, with "Yes" (default)
# treated as the positive class.
library(MLmetrics)
Sensitivity(positive = "Yes", y_pred = yPred3, y_true = testData[["default"]])
[1] 0.3493976

Specificity using Classifier 1

# Specificity of classifier 1 on the test data, with "Yes" (default)
# treated as the positive class.
library(MLmetrics)
Specificity(positive = "Yes", y_pred = yPred1, y_true = testData[["default"]])
[1] 0.9966901

Specificity using Classifier 2

# Specificity of classifier 2 on the test data, with "Yes" (default)
# treated as the positive class.
library(MLmetrics)
Specificity(positive = "Yes", y_pred = yPred2, y_true = testData[["default"]])
[1] 0.9979313

Specificity using Classifier 3

# Specificity of classifier 3 on the test data, with "Yes" (default)
# treated as the positive class.
library(MLmetrics)
Specificity(positive = "Yes", y_pred = yPred3, y_true = testData[["default"]])
[1] 0.9979313

ROC PLOT

ROC Plot using Classifier 1

library(ROCR)
# ROCR evaluations start from a prediction object, which pairs the predicted
# probabilities with the true labels in a standardised format.
PredictObject1 <- prediction(predictions = predProbClass1, labels = testData$default)

# performance() derives the evaluation measures: true positive rate as the
# measure, false positive rate as the x-axis measure.
PerformObject1 <- performance(PredictObject1, measure = "tpr", x.measure = "fpr")

# ROC curve of classifier 1 for credit card default, in black.
plot(PerformObject1, col = "black", lwd = 2, main = "ROC Curve for CC Default")
# Dotted diagonal (intercept 0, slope 1) as a reference line.
abline(a = 0, b = 1, col = "black", lty = 3, lwd = 2)

plot of chunk unnamed-chunk-31

ROC Plot using Classifier 2

library(ROCR)
# ROCR evaluations start from a prediction object, which pairs the predicted
# probabilities with the true labels in a standardised format.
PredictObject2 <- prediction(predictions = predProbClass2, labels = testData$default)

# performance() derives the evaluation measures: true positive rate as the
# measure, false positive rate as the x-axis measure.
PerformObject2 <- performance(PredictObject2, measure = "tpr", x.measure = "fpr")

# ROC curve of classifier 2 for credit card default, in red.
plot(PerformObject2, col = "red", lwd = 2, main = "ROC Curve for CC Default")
# Dotted diagonal (intercept 0, slope 1) as a reference line.
abline(a = 0, b = 1, col = "black", lty = 3, lwd = 2)

plot of chunk unnamed-chunk-33

ROC Plot using Classifier 3

library(ROCR)
# ROCR evaluations start from a prediction object, which pairs the predicted
# probabilities with the true labels in a standardised format.
PredictObject3 <- prediction(predictions = predProbClass3, labels = testData$default)

# performance() derives the evaluation measures: true positive rate as the
# measure, false positive rate as the x-axis measure.
PerformObject3 <- performance(PredictObject3, measure = "tpr", x.measure = "fpr")

# ROC curve of classifier 3 for credit card default, in blue.
plot(PerformObject3, col = "blue", lwd = 2, main = "ROC Curve for CC Default")
# Dotted diagonal (intercept 0, slope 1) as a reference line.
abline(a = 0, b = 1, col = "black", lty = 3, lwd = 2)

plot of chunk unnamed-chunk-35

ROC Curves of Three Classifiers

# ROC curves of the three classifiers overlaid on one set of axes.
# Classifier 1 opens the plot; classifiers 2 and 3 are added on top of it.
plot(PerformObject1, col = "black", lwd = 2,
     main = "ROC Curve for CC Default")
plot(PerformObject2, add = TRUE, col = "red", lwd = 3,
     main = "ROC Curve for CC Default")
plot(PerformObject3, add = TRUE, col = "blue", lwd = 2,
     main = "ROC Curve for CC Default")
# Same dotted diagonal reference line as on the single-classifier plots.
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# Legend so the three overlaid curves can be told apart by colour.
legend("bottomright",
       legend = c("Classifier 1", "Classifier 2", "Classifier 3"),
       col = c("black", "red", "blue"),
       lwd = c(2, 3, 2),
       bty = "n")

plot of chunk unnamed-chunk-37