R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

#EDA for Taiwan Credit Card data

library(data.table)
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")

# attaching the data
attach(CCdefault.dt)
# dimension of the data table
dim(CCdefault.dt)
## [1] 29601     9
# column names
colnames(CCdefault.dt)
## [1] "Id"              "CreditLimit"     "Male"            "Education"      
## [5] "MaritalStatus"   "Age"             "BillOutstanding" "LastPayment"    
## [9] "Default"
# structure of the dataframe
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
#Data tyoe conversions
# convert 'Male' as a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault.dt[, Default := as.factor(Default)]
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
#Checking the levels of 'Default variable'
levels(CCdefault.dt$Default) <- c("No","Yes")

# levels of the target variable
levels(CCdefault.dt$Default)
## [1] "No"  "Yes"
#Reordering the levels of 'Default variable'
CCdefault.dt$Default <- ordered(CCdefault.dt$Default, levels = c("Yes", "No"))

# verifying the new order of levels
levels(CCdefault.dt$Default)
## [1] "Yes" "No"
#Data partition
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
set.seed(2341)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)

# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]

# 20% testing data
testData.dt <- CCdefault.dt[-trainIndex, ]

#Verifying the attributes of partitioned data sets

# dimension of training dataset
dim(trainData.dt)
## [1] 23681     9
# dimension of testing dataset
dim(testData.dt)
## [1] 5920    9
# proportion of defaulters in training dataset
round(prop.table(table(trainData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69

#KNN algorithm implementation

#KNN model development
set.seed(3333)
library(caret)
# Set control parameters
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
set.seed(3333)

# Run kNN Classifier in package caret
knn_fit  <- train(Default ~ ., 
                         data = trainData.dt,
                         method = "knn",
                         trControl = trctrl,
                         preProcess = c("center", "scale"),
                         tuneLength = 10)
# kNN model summary
knn_fit 
## k-Nearest Neighbors 
## 
## 23681 samples
##     8 predictor
##     2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (11), scaled (11) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.7327261  0.04264164
##    7  0.7467312  0.04021462
##    9  0.7529805  0.02928387
##   11  0.7581885  0.02459777
##   13  0.7627492  0.02378213
##   15  0.7662680  0.02270087
##   17  0.7686751  0.01919706
##   19  0.7708850  0.01794406
##   21  0.7722223  0.01641531
##   23  0.7731231  0.01487522
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
# predicting the test set observations
kNNPred <- predict(knn_fit, testData.dt, type = "prob")
#Setting cut off probability to 20%
classify20KNN <- ifelse(kNNPred$Yes > 0.20,"Yes","No")

# ordering the levels
classify20KNN <- ordered(classify20KNN, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
#Confusion matrix
cmKNN <- table(Predicted = classify20KNN, Actual = testData.dt$Default)
confusionMatrix(cmKNN)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes  942 2547
##       No   379 2052
##                                           
##                Accuracy : 0.5057          
##                  95% CI : (0.4929, 0.5186)
##     No Information Rate : 0.7769          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1005          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7131          
##             Specificity : 0.4462          
##          Pos Pred Value : 0.2700          
##          Neg Pred Value : 0.8441          
##              Prevalence : 0.2231          
##          Detection Rate : 0.1591          
##    Detection Prevalence : 0.5894          
##       Balanced Accuracy : 0.5796          
##                                           
##        'Positive' Class : Yes             
## 
#ROC curve and AUC
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
PredLRKNN <- predict(knn_fit, testData.dt,type = "prob")
levels(testData.dt$Default)
## [1] "Yes" "No"
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
lgPredObjKNN <- prediction(PredLRKNN[1],testData.dt$Default)
lgPerfObjKNN <- performance(lgPredObjKNN, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObjKNN,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# area under curve
aucLRKNN <- performance(lgPredObjKNN, measure = "auc")
aucLRKNN <- aucLRKNN@y.values[[1]]
aucLRKNN
## [1] 0.3914298

#Logistic regression model implementation

#Data partition
set.seed(766)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)

# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]

# 20% testing data
testData.dt <- CCdefault.dt[-trainIndex, ]
#Logit model development
 logitModel <- glm(trainData.dt$Default ~ ., 
                        data = trainData.dt, 
                        family = binomial())
# summary of the logistic regression model 
summary(logitModel)
## 
## Call:
## glm(formula = trainData.dt$Default ~ ., family = binomial(), 
##     data = trainData.dt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6831   0.3601   0.6479   0.7764   1.0039  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      7.461e-01  9.101e-02   8.198 2.44e-16 ***
## Id               2.501e-06  1.844e-06   1.357    0.175    
## CreditLimit      3.266e-06  1.628e-07  20.059  < 2e-16 ***
## Male1           -1.637e-01  3.253e-02  -5.032 4.84e-07 ***
## Education2      -2.807e-02  3.769e-02  -0.745    0.456    
## Education3       1.417e-02  5.042e-02   0.281    0.779    
## Education4       1.309e+00  4.236e-01   3.090    0.002 ** 
## MaritalStatus2   2.149e-01  3.681e-02   5.838 5.29e-09 ***
## MaritalStatus3   6.957e-02  1.475e-01   0.472    0.637    
## Age             -2.990e-03  1.978e-03  -1.511    0.131    
## BillOutstanding -1.889e-06  2.634e-07  -7.170 7.52e-13 ***
## LastPayment      3.168e-05  3.250e-06   9.748  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25142  on 23680  degrees of freedom
## Residual deviance: 24225  on 23669  degrees of freedom
## AIC: 24249
## 
## Number of Fisher Scoring iterations: 6
# predicting the test set observations
logitModelPred <- predict(logitModel, testData.dt, type = "response")
# plot of probabilities
plot(logitModelPred, 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

#Setting the cutoff probability to 20%
classify20LGT <- ifelse(logitModelPred > 0.2,"Yes","No")

# ordering the levels
classify20LGT <- ordered(classify20LGT, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
# confusion matrix
cmLGT <- table(Predicted = classify20LGT, Actual = testData.dt$Default)
confusionMatrix(cmLGT)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0
##                                          
##                Accuracy : 0.2231         
##                  95% CI : (0.2126, 0.234)
##     No Information Rate : 0.7769         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.2231         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.2231         
##          Detection Rate : 0.2231         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : Yes            
## 
#ROC curve and AUG

# loading the package
library(ROCR)
PredLR <- predict(logitModel, testData.dt,type = "response")
lgPredObj <- prediction(PredLR,testData.dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# area under curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.6318744