Classifying Loan Defaulters

Sameer Mathur

Using Default Data from ISLR Package

---

Importing Data

library(ISLR)
# Copy the built-in `Default` data set (provided by ISLR) into a data frame.
default.df <- as.data.frame(Default)
# NOTE: the data frame is deliberately NOT attach()ed. Columns are always
# reached through `default.df$...` or a `data =` argument, which avoids
# masking other objects on the search path and stale attached copies.
# Report the dimensions (rows, columns) of the data frame.
dim(default.df)
[1] 10000     4

Data Types of the Data Columns

# Show the structure of the data frame: one line per column giving its
# name, its data type, and the first few values.
str(default.df)
'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...

DATA PREPARATION

Creating Training and Testing Data

library(caTools)
# Fix the random seed so the train/test partition below is reproducible.
set.seed(123)
# Build a logical row mask from the outcome column: TRUE marks the 75% of
# rows assigned to the training set, FALSE the remainder.
split <- sample.split(default.df[["default"]], SplitRatio = 0.75)
# Training data = rows flagged TRUE in the mask.
trainData <- default.df[split, , drop = FALSE]
# Report the dimensions (rows, columns) of the training data.
dim(trainData)
[1] 7500    4
# Testing data = the remaining 25% of rows, i.e. those flagged FALSE in the
# split mask.
testData <- default.df[!split, , drop = FALSE]
# Report the dimensions (rows, columns) of the testing data.
dim(testData)
[1] 2500    4

Scaling Variables

# feature scaling
# Standardize the numeric predictors. The centering and scaling parameters
# are estimated on the TRAINING data only and then re-used for the test
# data, so both sets live on the same scale the classifiers are fitted on
# and no test-set information leaks into the preprocessing.
# NOTE(review): previously the test set was standardized with its own mean
# and standard deviation; test-set results printed further down were
# presumably produced that way and may shift slightly -- re-run to refresh.
numCols <- c("balance", "income")
trainScaled <- scale(trainData[numCols])
# scale() records the parameters it used as attributes of its result.
testData[numCols] <- scale(testData[numCols],
                           center = attr(trainScaled, "scaled:center"),
                           scale = attr(trainScaled, "scaled:scale"))
trainData[numCols] <- trainScaled

CLASSIFICATION USING BINOMIAL LOGISTIC MODEL

(Classification using Logistic Regression with glm())

Classifier 1

# Classifier 1: binomial logistic regression of `default` on `balance`
# alone, fitted on the training data.
logitClassifier1 <- glm(formula = default ~ balance,
                        family = binomial(),
                        data = trainData)
# Print coefficient estimates, deviances and AIC for the fitted model.
summary(logitClassifier1)

Call:
glm(formula = default ~ balance, family = binomial(), data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2763  -0.1456  -0.0578  -0.0213   3.7020  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -6.0903     0.2134  -28.54   <2e-16 ***
balance       2.6841     0.1239   21.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1193.7  on 7498  degrees of freedom
AIC: 1197.7

Number of Fisher Scoring iterations: 8

Classifier 2

# Classifier 2: binomial logistic regression of `default` on `student`
# alone, fitted on the training data.
logitClassifier2 <- glm(formula = default ~ student,
                        family = binomial(),
                        data = trainData)
# Print coefficient estimates, deviances and AIC for the fitted model.
summary(logitClassifier2)

Call:
glm(formula = default ~ student, family = binomial(), data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3009  -0.3009  -0.2413  -0.2413   2.6650  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -3.52207    0.08257 -42.655  < 2e-16 ***
studentYes   0.44956    0.13186   3.409 0.000651 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 2181.0  on 7498  degrees of freedom
AIC: 2185

Number of Fisher Scoring iterations: 6

Classifier 3

# Classifier 3: binomial logistic regression of `default` on every other
# column of the training data (the `.` on the right-hand side).
logitClassifier3 <- glm(formula = default ~ .,
                        family = binomial(),
                        data = trainData)
# Print coefficient estimates, deviances and AIC for the fitted model.
summary(logitClassifier3)

Call:
glm(formula = default ~ ., family = binomial(), data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4537  -0.1416  -0.0553  -0.0198   3.6893  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.01034    0.22584 -26.613   <2e-16 ***
studentYes  -0.58511    0.27241  -2.148   0.0317 *  
balance      2.78772    0.13014  21.421   <2e-16 ***
income       0.02735    0.12831   0.213   0.8312    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1179.1  on 7496  degrees of freedom
AIC: 1187.1

Number of Fisher Scoring iterations: 8

PREDICTION

Predicting Test Data results using Classifier 1

# Prediction using classifier 1.
# Predicted probabilities (response scale) for the test rows; the first
# column (the outcome, `default`) is dropped from the new data.
predProbClass1 <- predict(logitClassifier1,
                          newdata = testData[-1],
                          type = "response")
# Turn probabilities into 0/1 labels with a 0.5 cut-off.
yPred1 <- ifelse(predProbClass1 > 0.5, 1, 0)
# Count how many test rows fall into each predicted class.
table(yPred1)
yPred1
   0    1 
2464   36 

Predicting Test Data results using Classifier 3

# Prediction using classifier 3.
# Predicted probabilities (response scale) for the test rows; the first
# column (the outcome, `default`) is dropped from the new data.
predProbClass3 <- predict(logitClassifier3,
                          newdata = testData[-1],
                          type = "response")
# Turn probabilities into 0/1 labels with a 0.5 cut-off.
yPred3 <- ifelse(predProbClass3 > 0.5, 1, 0)
# Count how many test rows fall into each predicted class.
table(yPred3)
yPred3
   0    1 
2466   34 

AIC – Akaike Information Criterion

  • Measures the relative quality of a model: goodness of fit, penalized by the number of parameters
  • The smaller the AIC, the better the model
# Compare classifiers 1 and 3 side by side: one row per model, giving its
# number of fitted parameters (df) and its AIC.
AIC(logitClassifier1, logitClassifier3)
                 df      AIC
logitClassifier1  2 1197.732
logitClassifier3  4 1187.074

CONFUSION MATRIX

Confusion Matrix using Classifier 1

# Confusion matrix for classifier 1: rows are the actual `default` labels of
# the test data, columns indicate whether a default was predicted
# (yPred1 is 0/1, so `> 0.5` is TRUE exactly for predicted defaults).
confMatrix1 <- table(testData[["default"]], yPred1 > 0.5)
confMatrix1

      FALSE TRUE
  No   2409    8
  Yes    55   28

Confusion Matrix using Classifier 3

# Confusion matrix for classifier 3: rows are the actual `default` labels of
# the test data, columns indicate whether a default was predicted
# (yPred3 is 0/1, so `> 0.5` is TRUE exactly for predicted defaults).
confMatrix3 <- table(testData[["default"]], yPred3 > 0.5)
confMatrix3

      FALSE TRUE
  No   2412    5
  Yes    54   29