Part 1: READING AND EXPLORING DATA

Part 1a: Importing Data

library(data.table)
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attach the data table
attach(CCdefault.dt)
# dimension of the data table
dim(CCdefault.dt)

## [1] 29601     9

Part 1b: First Few Rows of the Data Table

# first few rows of the data table
head(CCdefault.dt)

##    Id CreditLimit Male Education MaritalStatus Age BillOutstanding
## 1:  1       20000    0         2             1  24            3913
## 2:  2      120000    0         2             2  26            2682
## 3:  3       90000    0         2             2  34           29239
## 4:  4       50000    0         2             1  37           46990
## 5:  5       50000    1         2             1  57            8617
## 6:  6       50000    1         1             2  37           64400
##    LastPayment Default
## 1:           0       1
## 2:           0       1
## 3:        1518       0
## 4:        2000       0
## 5:        2000       0
## 6:        2500       0

Part 1c: Data Types of the Data Columns

# structure of the dataframe
str(CCdefault.dt)

## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Part 1d: Converting Data Type Structure

# convert 'Id' as a factor
#CCdefault.dt[, Id := as.factor(Id)]
# convert 'Male' as a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault.dt[, Default := as.factor(Default)]


# Changing the lavels of 'Default' variable
levels(CCdefault.dt$Default) <- c("No","Yes")

# verifying conversion
str(CCdefault.dt)

## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Part 1e: Descriptive Statistics

# descriptive statistics
library(psych)
describe(CCdefault.dt)[, c(1:5, 8:9)]

##                 vars     n      mean        sd median     min     max
## Id                 1 29601  14971.76   8660.18  14953       1   30000
## CreditLimit        2 29601 167550.54 129944.02 140000   10000 1000000
## Male*              3 29601      1.40      0.49      1       1       2
## Education*         4 29601      1.82      0.71      2       1       4
## MaritalStatus*     5 29601      1.56      0.52      2       1       3
## Age                6 29601     35.46      9.21     34      21      79
## BillOutstanding    7 29601  50957.43  73370.24  22259 -165580  964511
## LastPayment        8 29601   5649.56  16568.26   2100       0  873552
## Default*           9 29601      1.22      0.42      1       1       2

Part 2: SUMMARY STATISTICS OF DEFAULTERS

Part 2a: Percentage of Defaulters

# printing counts of defaulters
tab1 <- table(Default)
# proportion of defaulters
tab2 <- prop.table(tab1)
# Percentage of defaulters
round(tab2*100,2)

## Default
##     0     1 
## 77.69 22.31

Part 2b: Percentage of Defaulters by Gender

# table for counts
t1 <- table(Default,Male)
# table for proportions
t2 <- prop.table(t1)
# adding margins
t3 <- addmargins(t2)
# making table
round(t3*100,2)

##        Male
## Default      0      1    Sum
##     0    47.67  30.02  77.69
##     1    12.65   9.67  22.31
##     Sum  60.32  39.68 100.00

Part 2c: Percentage of Defaulters by Education

# table for counts
t1 <- table(Default,Education)
# table for proportions
t2 <- prop.table(t1)
# adding margins
t3 <- addmargins(t2)
# making table
round(t3*100,2)

##        Education
## Default      1      2      3      4    Sum
##     0    28.87  36.13  12.30   0.39  77.69
##     1     6.88  11.25   4.17   0.02  22.31
##     Sum  35.75  47.38  16.46   0.42 100.00

Part 2d: Percentage of Defaulters by MaritalStatus

# table for counts
t1 <- table(Default,MaritalStatus)
# table for proportions
t2 <- prop.table(t1)
# adding margins
t3 <- addmargins(t2)
# making table
round(t3*100,2)

##        MaritalStatus
## Default      1      2      3    Sum
##     0    34.75  42.15   0.79  77.69
##     1    10.78  11.25   0.28  22.31
##     Sum  45.53  53.40   1.07 100.00

Part 3: DESCRIPTIVE STATISTICS OF VARIABLES (CreditLimit, BillOutstanding, LastPayment, Age)

Part 3a: Descriptive Statistics by Defaulters

# loading the package
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# make a grouping variable
group <- group_by(CCdefault.dt, Default)
# summrising by grouping variables
tabA <- summarise(group, count = n(),
meanCreditLimit     = round(mean(CreditLimit),0),
sdCreditLimit       = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding   = round(sd(BillOutstanding),0),
meanLastPayment     = round(mean(LastPayment),0),
sdLastPayment       = round(sd(LastPayment),0),
meanAge             = round(mean(Age),1),
sdAge               = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))

# saving table as data frame
as.data.frame(tabA)

##   Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1      No 22996          178300        131877               51716
## 2     Yes  6605          130125        115424               48315
##   sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge 1
## 1             73278            6306         18063    35.4     9 1
## 2             73636            3366          9360    35.7    10 1
##   RelativeProportions
## 1              0.7769
## 2              0.2231

Part 3b: Descriptive Statistics by Gender & Defaulters

# loading the package
library(dplyr)
# make a grouping variable
group <- group_by(CCdefault.dt,Male, Default)
# summrising by grouping variables
tabD <- summarise(group, count = n(),
meanCreditLimit     = round(mean(CreditLimit),0),
sdCreditLimit       = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding   = round(sd(BillOutstanding),0),
meanLastPayment     = round(mean(LastPayment),0),
sdLastPayment       = round(sd(LastPayment),0),
meanAge             = round(mean(Age),1),
sdAge               = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))


as.data.frame(tabD)

##   Male Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1    0      No 14111          179925        126673               49529
## 2    0     Yes  3744          133384        113497               46638
## 3    1      No  8885          175719        139712               55191
## 4    1     Yes  2861          125861        117782               50510
##   sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge 1
## 1             70327            6272         18617    34.8     9 1
## 2             71925            3330          8761    34.9    10 1
## 3             77611            6360         17147    36.4     9 1
## 4             75773            3412         10091    36.8    10 1
##   RelativeProportions
## 1              0.7903
## 2              0.2097
## 3              0.7564
## 4              0.2436

Part 3c: Descriptive Statistics by Education & Defaulters

# loading the package
library(dplyr)
# make a grouping variable
group <- group_by(CCdefault.dt,Education, Default)
# summrising by grouping variables
tabB <- summarise(group, count = n(),
meanCreditLimit     = round(mean(CreditLimit),0),
sdCreditLimit       = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding   = round(sd(BillOutstanding),0),
meanLastPayment     = round(mean(LastPayment),0),
sdLastPayment       = round(sd(LastPayment),0),
meanAge             = round(mean(Age),1),
sdAge               = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))


as.data.frame(tabB)

##   Education Default count meanCreditLimit sdCreditLimit
## 1         1      No  8545          221237        136019
## 2         1     Yes  2036          178237        127546
## 3         2      No 10695          157768        123637
## 4         2     Yes  3329          112577        103365
## 5         3      No  3640          136327        117213
## 6         3     Yes  1233           97979         99833
## 7         4      No   116          225517        112128
## 8         4     Yes     7          144286         68522
##   meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1               49221             78579            7535         21612
## 2               47257             78867            3627          7881
## 3               54373             71164            5644         15929
## 4               51190             73927            3268         10279
## 5               49687             65650            5391         14503
## 6               42236             62742            3188          9017
## 7               54345             79787            5484         10289
## 8               59382             78472            4904          4448
##   meanAge sdAge 1 RelativeProportions
## 1    34.1     8 1              0.8076
## 2    34.6     9 1              0.1924
## 3    34.7     9 1              0.7626
## 4    34.7     9 1              0.2374
## 5    40.3    10 1              0.7470
## 6    40.2    11 1              0.2530
## 7    33.8     8 1              0.9431
## 8    34.9    10 1              0.0569

Part 3d: Descriptive Statistics by MaritalStatus & Defaulters

# loading the package
library(dplyr)
# make a grouping variable
group <- group_by(CCdefault.dt,MaritalStatus,Default)
# summrising by grouping variables
tabC <- summarise(group, count = n(),
meanCreditLimit     = round(mean(CreditLimit),0),
sdCreditLimit       = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding   = round(sd(BillOutstanding),0),
meanLastPayment     = round(mean(LastPayment),0),
sdLastPayment       = round(sd(LastPayment),0),
meanAge             = round(mean(Age),1),
sdAge               = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))


as.data.frame(tabC)

##   MaritalStatus Default count meanCreditLimit sdCreditLimit
## 1             1      No 10285          194209        134855
## 2             1     Yes  3192          143192        121499
## 3             2      No 12477          166539        128184
## 4             2     Yes  3329          119032        108419
## 5             3      No   234          106154        100487
## 6             3     Yes    84           73214         80033
##   meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1               53469             76800            6482         17655
## 2               50756             76177            3503          9854
## 3               50438             70464            6124         16792
## 4               46209             71476            3233          8889
## 5               42829             58650            8216         57764
## 6               39005             54746            3389          8217
##   meanAge sdAge 1 RelativeProportions
## 1    39.9     9 1              0.7632
## 2    40.3     9 1              0.2368
## 3    31.5     7 1              0.7894
## 4    31.2     8 1              0.2106
## 5    42.6     9 1              0.7358
## 6    43.5    10 1              0.2642

Part 4: DATA VISUALIZATION

Part 4a: VISULAIZATION (DISCRETE VARIABLES)

Part 4a1: Bar Chart for Defaulters

# percentage of defaulters
tab1 <- round(prop.table(table(Default))*100,2)
# bar-plot
bp <- barplot(tab1,
        xlab = "Default (Yes/No)", ylab = "Percent (%)",
        main = "Percentage of Defaulters",
        col = c("lightblue","red"), 
        legend = rownames(tab1), 
        beside = TRUE,
        ylim = c(0, 90))
text(bp, 0, round(tab1, 1),cex=1,pos=3)

Part 4a2: Bar Chart For Defaulters By Gender

# Percentage of defaulters by Gender
tab2 <- round(prop.table(table(Default,Male))*100,2)
# bar-plot
bp <- barplot(tab2, beside = TRUE, main = "Bar Chart For Defaulters By Gender", 
col = c("lightblue", "mistyrose"),
xlab = "Male", 
ylab = "Percent (%)", legend = c("No", "Yes"), 
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab2, 1),cex=1,pos=3)

Part 4a3: Bar Chart For Defaulters By Education

# percentage of defaulters by Education
tab3 <- round(prop.table(table(Default,Education))*100,2)

bp <- barplot(tab3, beside = TRUE, main = "Bar Chart For Defaulters By Education", 
col = c("lightblue", "mistyrose"),
xlab = "Education",
ylab = "Percent (%)", legend = c("No", "Yes"), 
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab3, 1),cex=1,pos=3)

Part 4a4: Bar Chart For Defaulters By MaritalStatus

# Percentage of defaulters by MaritalStatus

tab4 <- round(prop.table(table(Default,MaritalStatus))*100,2)

bp <- barplot(tab4, beside = TRUE, main = "Bar Chart For Defaulters By Marital Status", 
col = c("lightblue", "mistyrose"),
xlab = "Marital Status",
ylab = "Percent (%)", legend = c("No", "Yes"), 
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab4, 1),cex=1,pos=3)

Part 4b: VISULAIZATION (CONTINUOUS VARIABLES)

Part 4b1: Box plots For Variable CreditLimit Grouped By Default (Yes/No)

# loading the package
library(ggplot2)

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

# plotting box plots 
p <- ggplot(CCdefault.dt, aes(x = Default, y = CreditLimit)) + ylab("CreditLimit (NT$)") + geom_boxplot()
p + labs(title="CreditLimit of Defaulters and Others")

Part 4b2: Box plots For The Variable BillOutstanding Grouped By Default (Yes/No)

# loading the package
library(ggplot2)
# plotting box plots
p <- ggplot(CCdefault.dt, aes(x = Default, y = BillOutstanding)) + ylab("BillOutstanding (NT$)") + geom_boxplot()
p + labs(title=" BillOutstanding of defaulters and Others")

Part 4b3: Box plots For The Variable LastPayment Grouped By Default (Yes/No)

# loading the package
library(ggplot2)
# plotting box plots
p <- ggplot(CCdefault.dt, aes(x = Default, y = LastPayment)) +  
  ylab("LastPayment (NT$)") + geom_boxplot()
p + labs(title="LastPayment of Defaulters and Others")

Part 4c: SCATTER PLOTS

Part 4c1: Display a Scatter Plot Between CreditLimit And BillOutstanding

# loading the package
library(ggplot2)
# plotting scatter plot using ggplot2
p <- ggplot(CCdefault.dt, aes(x = BillOutstanding, y = CreditLimit)) + 
  ylab("Credit Limit (NT$)") + geom_point() 
p + labs(x = "BillOutstanding (NT$)") + 
  labs(title="Scatter Plot Between CreditLimit And BillOutstanding")

Part 4c2: Display a Scatter Plot Between CreditLimit And BillOutstanding Categorised by Defaulters

# loading the package
library(ggplot2)
# plotting scatter plot
p <- ggplot(CCdefault.dt, aes(x = BillOutstanding, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
  geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "BillOutstanding (NT$)") + 
labs(title="Scatter Plot Between CreditLimit And BillOutstanding By Defaulters")

Part 4c3: Display a Scatter plot between CreditLimit And LastPayment

# loading the package
library(ggplot2)
# plotting scatter plot 
p <- ggplot(CCdefault.dt, aes(x = LastPayment, y = CreditLimit)) +  
  ylab("Credit Limit (NT$)") + geom_point() 
p + labs(x = "Last Payment (NT$)") + 
  labs(title="Scatter plot between CreditLimit And LastPayment")

Part 4c4: Display a Scatter Plot Between CreditLimit And LastPayment Categorised by Defaulters

# loading the package
library(ggplot2)
# plotting scatter
p <- ggplot(CCdefault.dt, aes(x = LastPayment, y = CreditLimit,linetype = Default))+
   ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
  geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "LastPayment (NT$)") + 
  labs(title="Scatter Plot Between CreditLimit And LastPayment By Defaulters")

Part 4c5: Display a Scatter plot between CreditLimit And Age

# loading the package
library(ggplot2)
# plotting scatter plot 
p <- ggplot(CCdefault.dt, aes(x = Age, y = CreditLimit)) +  
  ylab("Credit Limit (NT$)") + geom_point() 
p + labs(x = "Age (Years)") +
labs(title="Scatter plot between CreditLimit And Age")

Part 4c6: Display a Scatter Plot Between CreditLimit And Age Categorised by Defaulters

# loading the package
library(ggplot2)
# plotting scatter
p <- ggplot(CCdefault.dt, aes(x = Age, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() +
     scale_linetype_manual(values=c("dashed", "solid")) +
     geom_smooth(method=lm, se=FALSE, color= "red")
  p + labs(x = "Age (Years)") + 
  labs(title="Scatter Plot Between CreditLimit And Age By Defaulters")

Part 4d: SCATTERPLOT MATRIX

# basic scatterplot matrix
pairs(~ CreditLimit + BillOutstanding + LastPayment + Age ,data = CCdefault.dt, 
   main="Simple Scatterplot Matrix")

Part 5: Data Preparation

Part 5a: Check the order of levels of the target variable

# levels of the target variable
levels(CCdefault.dt$Default)

## [1] "No"  "Yes"

Part 5b: Reset the order of levels of the target variable, as (“Event” = Yes, “No Event” = No)

# ordering the levels
CCdefault.dt$Default <- ordered(CCdefault.dt$Default, levels = c("Yes", "No"))

# verifying the new order of levels
levels(CCdefault.dt$Default)

## [1] "Yes" "No"

Part 5c: Splitting the Dataset into the Training set and Test set

library(caTools)

# get the same split when you re-run the code
set.seed(123)

# splitting the data set into ratio 0.80:0.20
split <- sample.split(CCdefault.dt$Default, SplitRatio = 0.80)

# create the training dataset
trainingSet <- subset(CCdefault.dt, split == TRUE)

# create the testing dataset
testSet <- subset(CCdefault.dt, split == FALSE)

Part 5d: Verify Training & Testing Datasets

# dimension of training dataset
dim(trainingSet)

## [1] 23681     9

# dimension of testing dataset
dim(testSet)

## [1] 5920    9

Part 5e: Verify Proportion of “Event (default = Yes)”

# proportion of defaulters in training dataset
round(prop.table(table(trainingSet$Default))*100,2)

## 
##   Yes    No 
## 22.31 77.69

# proportion of defaulters in test dataset
round(prop.table(table(testSet$Default))*100,2)

## 
##   Yes    No 
## 22.31 77.69

Part 6: Run the Machine Learning algorithm – kNN

Part 6a: Model Building Using kNN

library(caret)

## Loading required package: lattice

# Set control parameters
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
set.seed(123)

# Run kNN Classifier in package caret
knn_fit  <- train(Default ~ ., 
                         data = trainingSet,
                         method = "knn",
                         trControl = trctrl,
                         preProcess = c("center", "scale"),
                         tuneLength = 10)
# kNN model summary
knn_fit

## k-Nearest Neighbors 
## 
## 23681 samples
##     8 predictor
##     2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (11), scaled (11) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21313, 21313, 21312, 21313, 21313, 21314, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.7332323  0.04144949
##    7  0.7466466  0.03837324
##    9  0.7524177  0.02828311
##   11  0.7588644  0.02824895
##   13  0.7629885  0.02539935
##   15  0.7660992  0.02232876
##   17  0.7686470  0.01989299
##   19  0.7703924  0.01715124
##   21  0.7719830  0.01592183
##   23  0.7721380  0.01225606
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.

Part 6b: Testing the kNN Model

# predicting the test set observations
kNNPred <- predict(knn_fit, testSet, type = "raw")

Part 6c: Confusion Matrix

# confusion matrix
table(Predicted = kNNPred, Actual = testSet$Default)

##          Actual
## Predicted  Yes   No
##       Yes   26   50
##       No  1295 4549

Part 7: Run the Machine Learning algorithm – Logistic Regression

Part 7a: Model Building Using Logistic Regression

# fit logistic regression model 
 logitModel <- glm(Default ~ 
                          CreditLimit 
                        + Male
                        + Education
                        + MaritalStatus
                        + Age
                        + BillOutstanding
                        + LastPayment, 
                        data = trainingSet, 
                        family = binomial())
# summary of the logistic regression model 
summary(logitModel)

## 
## Call:
## glm(formula = Default ~ CreditLimit + Male + Education + MaritalStatus + 
##     Age + BillOutstanding + LastPayment, family = binomial(), 
##     data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0795   0.3568   0.6467   0.7761   0.9938  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      7.746e-01  8.675e-02   8.929  < 2e-16 ***
## CreditLimit      3.363e-06  1.631e-07  20.616  < 2e-16 ***
## Male1           -1.526e-01  3.254e-02  -4.690 2.73e-06 ***
## Education2      -3.524e-03  3.778e-02  -0.093  0.92569    
## Education3       1.038e-02  5.018e-02   0.207  0.83616    
## Education4       1.500e+00  4.615e-01   3.251  0.00115 ** 
## MaritalStatus2   2.282e-01  3.674e-02   6.212 5.23e-10 ***
## MaritalStatus3   1.397e-01  1.454e-01   0.961  0.33662    
## Age             -3.679e-03  1.963e-03  -1.874  0.06093 .  
## BillOutstanding -1.941e-06  2.632e-07  -7.374 1.65e-13 ***
## LastPayment      3.090e-05  3.168e-06   9.754  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25142  on 23680  degrees of freedom
## Residual deviance: 24199  on 23670  degrees of freedom
## AIC: 24221
## 
## Number of Fisher Scoring iterations: 6

Part 7b: Testing the Model

# predicting the test set observations
logitModelPred <- predict(logitModel, testSet, type = "response")

Part 7c: Plotting the Predicted Probabilities

# plot of probabilities
plot(logitModelPred, 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

Part 7d: Confusion Matrix

# setting the cut-off probablity
classify50 <- ifelse(logitModelPred > 0.5,"Yes","No")

# ordering the levels
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testSet$Default <- ordered(testSet$Default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify50, Actual = testSet$Default)
cm

##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0

Part 7e: Confusion Matrix using carat package

library(caret)
confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0
##                                          
##                Accuracy : 0.2231         
##                  95% CI : (0.2126, 0.234)
##     No Information Rate : 0.7769         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.2231         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.2231         
##          Detection Rate : 0.2231         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : Yes            
##

Part 7f: Measuring Machine Learning Metrics at different Cut-off Probabilities

library(caret)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {

       # predicting the test set results
         logitModelPred <- predict(logitModel, testSet, type = "response")
         C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
         C2 <- testSet$Default
         predY   <- as.factor(C1)
         actualY <- as.factor(C2)

      predY <- ordered(predY, levels = c("Yes", "No"))
      actualY <- ordered(actualY, levels = c("Yes", "No"))

        # use the confusionMatrix from the caret package
        cm1 <-confusionMatrix(table(predY,actualY))
        # extracting accuracy
        Accuracy <- cm1$overall[1]
        # extracting sensitivity
          Sensitivity <- cm1$byClass[1]
        # extracting specificity
          Specificity <- cm1$byClass[2]
      # extracting value of kappa
          Kappa <- cm1$overall[2]

        # combined table
          tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
        return(tab)}
     # making sequence of cut-off probabilities       
        cutoff1 <- seq( .1, .9, by = .05 )
     # loop using "lapply"
        tab2    <- lapply(cutoff1, CmFn)
     # extra coding for saving table as desired format
        tab3 <- rbind(tab2[[1]],tab2[[2]],tab2[[3]],tab2[[4]],tab2[[5]],tab2[[6]],tab2[[7]],
                 tab2[[8]],tab2[[9]],tab2[[10]],tab2[[11]],tab2[[12]],tab2[[13]],tab2[[14]],
                 tab2[[15]],tab2[[16]],tab2[[17]])
        
        tab3

##           Accuracy Sensitivity Specificity       Kappa
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2293919  0.95457986  0.02109154 -0.01106665
## Accuracy 0.2790541  0.73959122  0.14677104 -0.05780682
## Accuracy 0.3846284  0.43149129  0.37116765 -0.12508918
## Accuracy 0.5130068  0.24072672  0.59121548 -0.13590326
## Accuracy 0.6339527  0.11127933  0.78408350 -0.11000896
## Accuracy 0.7113176  0.03028009  0.90693629 -0.08155029

Part 7g: ROC Curve

# loading the package
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(gplots)
PredLR <- predict(logitModel, testSet, type = "response")
lgPredObj <- prediction(PredLR, testSet$Default)
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

Part 7h: AUC (Area Under The Curve)

# area under curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR

## [1] 0.6265644

Taiwan Credit Card Data

Avik Debnath

today

Part 1: READING AND EXPLORING DATA

Part 1a: Importing Data

Part 1b: First Few Rows of the Data Table

Part 1c: Data Types of the Data Columns

Part 1d: Converting Data Type Structure

Part 1e: Descriptive Statistics

Part 2: SUMMARY STATISTICS OF DEFAULTERS

Part 2a: Percentage of Defaulters

Part 2b: Percentage of Defaulters by Gender

Part 2c: Percentage of Defaulters by Education

Part 2d: Percentage of Defaulters by MaritalStatus

Part 3: DESCRIPTIVE STATISTICS OF VARIABLES (CreditLimit, BillOutstanding, LastPayment, Age)

Part 3a: Descriptive Statistics by Defaulters

Part 3b: Descriptive Statistics by Gender & Defaulters

Part 3c: Descriptive Statistics by Education & Defaulters

Part 3d: Descriptive Statistics by MaritalStatus & Defaulters

Part 4: DATA VISUALIZATION

Part 4a: VISULAIZATION (DISCRETE VARIABLES)

Part 4a1: Bar Chart for Defaulters

Part 4a2: Bar Chart For Defaulters By Gender

Part 4a3: Bar Chart For Defaulters By Education

Part 4a4: Bar Chart For Defaulters By MaritalStatus

Part 4b: VISULAIZATION (CONTINUOUS VARIABLES)

Part 4b1: Box plots For Variable CreditLimit Grouped By Default (Yes/No)

Part 4b2: Box plots For The Variable BillOutstanding Grouped By Default (Yes/No)

Part 4b3: Box plots For The Variable LastPayment Grouped By Default (Yes/No)

Part 4c: SCATTER PLOTS

Part 4c1: Display a Scatter Plot Between CreditLimit And BillOutstanding

Part 4c2: Display a Scatter Plot Between CreditLimit And BillOutstanding Categorised by Defaulters

Part 4c3: Display a Scatter plot between CreditLimit And LastPayment

Part 4c4: Display a Scatter Plot Between CreditLimit And LastPayment Categorised by Defaulters

Part 4c5: Display a Scatter plot between CreditLimit And Age

Part 4c6: Display a Scatter Plot Between CreditLimit And Age Categorised by Defaulters

Part 4d: SCATTERPLOT MATRIX

Part 5: Data Preparation

Part 5a: Check the order of levels of the target variable

Part 5b: Reset the order of levels of the target variable, as (“Event” = Yes, “No Event” = No)

Part 5c: Splitting the Dataset into the Training set and Test set

Part 5d: Verify Training & Testing Datasets

Part 5e: Verify Proportion of “Event (default = Yes)”

Part 6: Run the Machine Learning algorithm – kNN

Part 6a: Model Building Using kNN

Part 6b: Testing the kNN Model

Part 6c: Confusion Matrix

Part 7: Run the Machine Learning algorithm – Logistic Regression

Part 7a: Model Building Using Logistic Regression

Part 7b: Testing the Model

Part 7c: Plotting the Predicted Probabilities

Part 7d: Confusion Matrix

Part 7e: Confusion Matrix using carat package

Part 7f: Measuring Machine Learning Metrics at different Cut-off Probabilities

Part 7g: ROC Curve

Part 7h: AUC (Area Under The Curve)