Classify Credit Card Defaults

Sameer Mathur

Using Default Data from ISLR Package

---

Importing Data

# Load the ISLR package, which provides the built-in `Default` data set
library(ISLR)
# Copy the built-in data set into a data frame
default.df <- as.data.frame(Default)
# NOTE: the data frame is intentionally not attach()-ed. The code in this
# document reaches columns through the data frame itself (`$`, `[`) or a
# `data =` argument, so attaching would only add name-masking risk.
# Dimensions (rows, columns) of the data frame
dim(default.df)

[1] 10000     4

Data Types of the Data Columns

# Compact overview of the data frame: one line per column with its type
utils::str(object = default.df)

'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...

DATA PREPARATION

Creating Training and Testing Data

library(caTools)
# Fix the random number seed so the random partition is reproducible
set.seed(123)
# Flag 75% of the rows for training; the flags are derived from the outcome
# column and are reused below to pick the remaining rows for testing
split <- sample.split(default.df$default, SplitRatio = 0.75)
# Keep the flagged rows as the training set
trainData <- default.df[split, ]
# Dimensions (rows, columns) of the training data
dim(trainData)

[1] 7500    4

# The rows not flagged for training (the remaining 25%) form the test set
testData <- default.df[!split, ]
# Dimensions (rows, columns) of the testing data
dim(testData)

[1] 2500    4

Scaling Variables

# Feature scaling: standardise the numeric predictors
numericCols <- c("balance", "income")
# Standardise the training columns and keep the result, whose attributes
# carry the training means and standard deviations
trainScaled <- scale(trainData[numericCols])
# Apply the training mean/sd to the test columns so both sets share one scale
# (scaling the test set by its own statistics would put it on a different
# scale from the one the classifiers are fitted on)
testData[numericCols] <- scale(
  testData[numericCols],
  center = attr(trainScaled, "scaled:center"),
  scale = attr(trainScaled, "scaled:scale")
)
trainData[numericCols] <- trainScaled

CLASSIFICATION USING BINOMIAL LOGISTIC MODEL

(Classification using Logistic Regression with `glm()`)

Classifier 1

# Classifier 1: binomial GLM of default status on balance alone
logitClassifier1 <- glm(
  formula = default ~ balance,
  family = binomial(),
  data = trainData
)
# Coefficient table, deviances and AIC of classifier 1
print(summary(logitClassifier1))


Call:
glm(formula = default ~ balance, family = binomial(), data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2763  -0.1456  -0.0578  -0.0213   3.7020  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -6.0903     0.2134  -28.54   <2e-16 ***
balance       2.6841     0.1239   21.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1193.7  on 7498  degrees of freedom
AIC: 1197.7

Number of Fisher Scoring iterations: 8

Classifier 2

# Classifier 2: binomial GLM of default status on student status and balance
logitClassifier2 <- glm(
  formula = default ~ student + balance,
  family = binomial(),
  data = trainData
)
# Coefficient table, deviances and AIC of classifier 2
print(summary(logitClassifier2))


Call:
glm(formula = default ~ student + balance, family = binomial(), 
    data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4456  -0.1414  -0.0553  -0.0198   3.6847  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -5.9968     0.2164  -27.71  < 2e-16 ***
studentYes   -0.6307     0.1686   -3.74 0.000184 ***
balance       2.7878     0.1301   21.43  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1179.1  on 7497  degrees of freedom
AIC: 1185.1

Number of Fisher Scoring iterations: 8

Classifier 3

# Classifier 3: binomial GLM of default status on balance, income and
# student status
logitClassifier3 <- glm(
  formula = default ~ balance + income + student,
  family = binomial(),
  data = trainData
)
# Coefficient table, deviances and AIC of classifier 3
print(summary(logitClassifier3))


Call:
glm(formula = default ~ balance + income + student, family = binomial(), 
    data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4537  -0.1416  -0.0553  -0.0198   3.6893  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.01034    0.22584 -26.613   <2e-16 ***
balance      2.78772    0.13014  21.421   <2e-16 ***
income       0.02735    0.12831   0.213   0.8312    
studentYes  -0.58511    0.27241  -2.148   0.0317 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1179.1  on 7496  degrees of freedom
AIC: 1187.1

Number of Fisher Scoring iterations: 8

PREDICTION

Predicting Test Data results using Classifier 1

# Predicted default probabilities from classifier 1 for the test rows
# (the first column, the observed outcome, is dropped from the new data)
predProbClass1 <- predict(
  logitClassifier1,
  newdata = testData[-1],
  type = "response"
)
# Label a case "Yes" when its probability exceeds 0.5, otherwise "No"
yPred1 <- ifelse(predProbClass1 > 0.5, "Yes", "No")
# Number of test cases assigned to each class
table(yPred1)

yPred1
  No  Yes 
2464   36

Predicting Test Data results using Classifier 2

# Predicted default probabilities from classifier 2 for the test rows
# (the first column, the observed outcome, is dropped from the new data)
predProbClass2 <- predict(
  logitClassifier2,
  newdata = testData[-1],
  type = "response"
)
# Label a case "Yes" when its probability exceeds 0.5, otherwise "No"
yPred2 <- ifelse(predProbClass2 > 0.5, "Yes", "No")
# Number of test cases assigned to each class
table(yPred2)

yPred2
  No  Yes 
2466   34

Predicting Test Data results using Classifier 3

# Predicted default probabilities from classifier 3 for the test rows
# (the first column, the observed outcome, is dropped from the new data)
predProbClass3 <- predict(
  logitClassifier3,
  newdata = testData[-1],
  type = "response"
)
# Label a case "Yes" when its probability exceeds 0.5, otherwise "No"
yPred3 <- ifelse(predProbClass3 > 0.5, "Yes", "No")
# Number of test cases assigned to each class
table(yPred3)

yPred3
  No  Yes 
2466   34

Akaike Information Criterion (AIC)

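Measures the relative quality of a model, balancing goodness of fit against the number of parameters.
Among models fitted to the same data, the smaller the AIC, the better the model.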

# Side-by-side degrees of freedom and AIC for the three fitted classifiers
stats::AIC(
  logitClassifier1,
  logitClassifier2,
  logitClassifier3
)

                 df      AIC
logitClassifier1  2 1197.732
logitClassifier2  3 1185.120
logitClassifier3  4 1187.074

CONFUSION MATRIX

Confusion Matrix using Classifier 1

# Cross-tabulate observed test outcomes (rows) against the class labels
# predicted by classifier 1 (columns)
confMatrix1 <- table(yActual = testData[["default"]], yPred1 = yPred1)
print(confMatrix1)

       yPred1
yActual   No  Yes
    No  2409    8
    Yes   55   28

Confusion Matrix using Classifier 2

# Cross-tabulate observed test outcomes (rows) against the class labels
# predicted by classifier 2 (columns)
confMatrix2 <- table(yActual = testData[["default"]], yPred2 = yPred2)
print(confMatrix2)

       yPred2
yActual   No  Yes
    No  2412    5
    Yes   54   29

Confusion Matrix using Classifier 3

# Cross-tabulate observed test outcomes (rows) against the class labels
# predicted by classifier 3 (columns)
confMatrix3 <- table(yActual = testData[["default"]], yPred3 = yPred3)
print(confMatrix3)

       yPred3
yActual   No  Yes
    No  2412    5
    Yes   54   29

METRICS USING `MLmetrics`

(ACCURACY, SENSITIVITY, SPECIFICITY)

Confusion Matrix using Classifier 1

# Confusion matrix for classifier 1 on the test set, built with MLmetrics
library(MLmetrics)
ConfusionMatrix(
  y_true = testData[["default"]],
  y_pred = yPred1
)

      y_pred
y_true   No  Yes
   No  2409    8
   Yes   55   28

Confusion Matrix using Classifier 2

# Confusion matrix for classifier 2 on the test set, built with MLmetrics
library(MLmetrics)
ConfusionMatrix(
  y_true = testData[["default"]],
  y_pred = yPred2
)

      y_pred
y_true   No  Yes
   No  2412    5
   Yes   54   29

Confusion Matrix using Classifier 3

# Confusion matrix for classifier 3 on the test set, built with MLmetrics
library(MLmetrics)
ConfusionMatrix(
  y_true = testData[["default"]],
  y_pred = yPred3
)

      y_pred
y_true   No  Yes
   No  2412    5
   Yes   54   29

Accuracy using Classifier 1

# Accuracy of classifier 1: share of test cases whose predicted label
# matches the observed outcome
library(MLmetrics)
Accuracy(
  y_true = testData[["default"]],
  y_pred = yPred1
)

[1] 0.9748

Accuracy using Classifier 2

# Accuracy of classifier 2: share of test cases whose predicted label
# matches the observed outcome
library(MLmetrics)
Accuracy(
  y_true = testData[["default"]],
  y_pred = yPred2
)

[1] 0.9764

Accuracy using Classifier 3

# Accuracy of classifier 3: share of test cases whose predicted label
# matches the observed outcome
library(MLmetrics)
Accuracy(
  y_true = testData[["default"]],
  y_pred = yPred3
)

[1] 0.9764

Sensitivity using Classifier 1

# Sensitivity of classifier 1: share of actual "Yes" (default) cases in the
# test set that were predicted "Yes"
library(MLmetrics)
Sensitivity(
  y_true = testData[["default"]],
  y_pred = yPred1,
  positive = "Yes"
)

[1] 0.3373494

Sensitivity using Classifier 2

# Sensitivity of classifier 2: share of actual "Yes" (default) cases in the
# test set that were predicted "Yes"
library(MLmetrics)
Sensitivity(
  y_true = testData[["default"]],
  y_pred = yPred2,
  positive = "Yes"
)

[1] 0.3493976

Sensitivity using Classifier 3

# Sensitivity of classifier 3: share of actual "Yes" (default) cases in the
# test set that were predicted "Yes"
library(MLmetrics)
Sensitivity(
  y_true = testData[["default"]],
  y_pred = yPred3,
  positive = "Yes"
)

[1] 0.3493976

Specificity using Classifier 1

# Specificity of classifier 1: share of actual "No" (non-default) cases in
# the test set that were predicted "No"
library(MLmetrics)
Specificity(
  y_true = testData[["default"]],
  y_pred = yPred1,
  positive = "Yes"
)

[1] 0.9966901

Specificity using Classifier 2

# Specificity of classifier 2: share of actual "No" (non-default) cases in
# the test set that were predicted "No"
library(MLmetrics)
Specificity(
  y_true = testData[["default"]],
  y_pred = yPred2,
  positive = "Yes"
)

[1] 0.9979313

Specificity using Classifier 3

# Specificity of classifier 3: share of actual "No" (non-default) cases in
# the test set that were predicted "No"
library(MLmetrics)
Specificity(
  y_true = testData[["default"]],
  y_pred = yPred3,
  positive = "Yes"
)

[1] 0.9979313

ROC PLOT

ROC Plot using Classifier 1

library(ROCR)
# ROCR evaluations start from a prediction object, which pairs each
# predicted probability with the observed class label
PredictObject1 <- prediction(
  predictions = predProbClass1,
  labels = testData[["default"]]
)

# True positive rate (y axis) against false positive rate (x axis)
PerformObject1 <- performance(
  PredictObject1,
  measure = "tpr",
  x.measure = "fpr"
)

# Draw the ROC curve of classifier 1, then a dotted diagonal reference line
plot(PerformObject1, col = "black", lwd = 2, main = "ROC Curve for CC Default")
abline(a = 0, b = 1, col = "black", lty = 3, lwd = 2)

plot of chunk unnamed-chunk-31

ROC Plot using Classifier 2

library(ROCR)
# ROCR evaluations start from a prediction object, which pairs each
# predicted probability with the observed class label
PredictObject2 <- prediction(
  predictions = predProbClass2,
  labels = testData[["default"]]
)

# True positive rate (y axis) against false positive rate (x axis)
PerformObject2 <- performance(
  PredictObject2,
  measure = "tpr",
  x.measure = "fpr"
)

# Draw the ROC curve of classifier 2, then a dotted diagonal reference line
plot(PerformObject2, col = "red", lwd = 2, main = "ROC Curve for CC Default")
abline(a = 0, b = 1, col = "black", lty = 3, lwd = 2)

plot of chunk unnamed-chunk-33

ROC Plot using Classifier 3

library(ROCR)
# ROCR evaluations start from a prediction object, which pairs each
# predicted probability with the observed class label
PredictObject3 <- prediction(
  predictions = predProbClass3,
  labels = testData[["default"]]
)

# True positive rate (y axis) against false positive rate (x axis)
PerformObject3 <- performance(
  PredictObject3,
  measure = "tpr",
  x.measure = "fpr"
)

# Draw the ROC curve of classifier 3, then a dotted diagonal reference line
plot(PerformObject3, col = "blue", lwd = 2, main = "ROC Curve for CC Default")
abline(a = 0, b = 1, col = "black", lty = 3, lwd = 2)

plot of chunk unnamed-chunk-35

ROC Curves of Three Classifiers

# Overlay the ROC curves of the three classifiers on one set of axes.
# The first call creates the plot (and its title); the others add to it.
plot(PerformObject1, col = "black", lwd = 2,
     main = "ROC Curve for CC Default")
# Red is drawn thicker than blue, presumably so it remains visible where the
# curves of classifiers 2 and 3 overlap
plot(PerformObject2, add = TRUE, col = "red", lwd = 3)
plot(PerformObject3, add = TRUE, col = "blue", lwd = 2)
# Dotted diagonal reference line, consistent with the single-classifier plots
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# Legend so the three curves can be told apart
legend("bottomright",
       legend = c("Classifier 1", "Classifier 2", "Classifier 3"),
       col = c("black", "red", "blue"),
       lwd = c(2, 3, 2),
       bty = "n")

plot of chunk unnamed-chunk-37

Classify Credit Card Defaults

Importing Data

Data Types of the Data Columns

DATA PREPARATION

Creating Training and Testing Data

Scaling Variables

CLASSIFICATION USING BINOMIAL LOGISTIC MODEL

(Classification using Logistic Regression with glm())

Classifier 1

Classifier 2

Classifier 3

PREDICTION

Predicting Test Data results using Classifier 1

Predicting Test Data results using Classifier 2

Predicting Test Data results using Classifier 3

Akaike Information Criterion (AIC)

CONFUSION MATRIX

Confusion Matrix using Classifier 1

Confusion Matrix using Classifier 2

Confusion Matrix using Classifier 3

METRICS USING MLmetrics

(ACCURACY, SENSITIVITY, SPECIFICITY)

Confusion Matrix using Classifier 1

Confusion Matrix using Classifier 2

Confusion Matrix using Classifier 3

Accuracy using Classifier 1

Accuracy using Classifier 2

Accuracy using Classifier 3

Sensitivity using Classifier 1

Sensitivity using Classifier 2

Sensitivity using Classifier 3

Specificity using Classifier 1

Specificity using Classifier 2

Specificity using Classifier 3

ROC PLOT

ROC Plot using Classifier 1

ROC Plot using Classifier 2

ROC Plot using Classifier 3

ROC Curves of Three Classifiers

(Classification using Logistic Regression with `glm()`)

METRICS USING `MLmetrics`