Part 1: READING AND EXPLORING DATA
Part 1a: Importing Data
library(data.table)
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attach the data table
attach(CCdefault.dt)
# dimension of the data table
dim(CCdefault.dt)
## [1] 29601 9
Part 1b: First Few Rows of the Data Table
# first few rows of the data table
head(CCdefault.dt)
## Id CreditLimit Male Education MaritalStatus Age BillOutstanding
## 1: 1 20000 0 2 1 24 3913
## 2: 2 120000 0 2 2 26 2682
## 3: 3 90000 0 2 2 34 29239
## 4: 4 50000 0 2 1 37 46990
## 5: 5 50000 1 2 1 57 8617
## 6: 6 50000 1 1 2 37 64400
## LastPayment Default
## 1: 0 1
## 2: 0 1
## 3: 1518 0
## 4: 2000 0
## 5: 2000 0
## 6: 2500 0
Part 1c: Data Types of the Data Columns
# structure of the dataframe
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
Part 1d: Converting Data Type Structure
# convert 'Id' as a factor
#CCdefault.dt[, Id := as.factor(Id)]
# convert 'Male' as a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault.dt[, Default := as.factor(Default)]
# Changing the lavels of 'Default' variable
levels(CCdefault.dt$Default) <- c("No","Yes")
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
Part 1e: Descriptive Statistics
# descriptive statistics
library(psych)
describe(CCdefault.dt)[, c(1:5, 8:9)]
## vars n mean sd median min max
## Id 1 29601 14971.76 8660.18 14953 1 30000
## CreditLimit 2 29601 167550.54 129944.02 140000 10000 1000000
## Male* 3 29601 1.40 0.49 1 1 2
## Education* 4 29601 1.82 0.71 2 1 4
## MaritalStatus* 5 29601 1.56 0.52 2 1 3
## Age 6 29601 35.46 9.21 34 21 79
## BillOutstanding 7 29601 50957.43 73370.24 22259 -165580 964511
## LastPayment 8 29601 5649.56 16568.26 2100 0 873552
## Default* 9 29601 1.22 0.42 1 1 2
Part 2: SUMMARY STATISTICS OF DEFAULTERS
Part 2a: Percentage of Defaulters
# printing counts of defaulters
tab1 <- table(Default)
# proportion of defaulters
tab2 <- prop.table(tab1)
# Percentage of defaulters
round(tab2*100,2)
## Default
## 0 1
## 77.69 22.31
Part 2b: Percentage of Defaulters by Gender
# table for counts
t1 <- table(Default,Male)
# table for proportions
t2 <- prop.table(t1)
# adding margins
t3 <- addmargins(t2)
# making table
round(t3*100,2)
## Male
## Default 0 1 Sum
## 0 47.67 30.02 77.69
## 1 12.65 9.67 22.31
## Sum 60.32 39.68 100.00
Part 2c: Percentage of Defaulters by Education
# table for counts
t1 <- table(Default,Education)
# table for proportions
t2 <- prop.table(t1)
# adding margins
t3 <- addmargins(t2)
# making table
round(t3*100,2)
## Education
## Default 1 2 3 4 Sum
## 0 28.87 36.13 12.30 0.39 77.69
## 1 6.88 11.25 4.17 0.02 22.31
## Sum 35.75 47.38 16.46 0.42 100.00
Part 2d: Percentage of Defaulters by MaritalStatus
# table for counts
t1 <- table(Default,MaritalStatus)
# table for proportions
t2 <- prop.table(t1)
# adding margins
t3 <- addmargins(t2)
# making table
round(t3*100,2)
## MaritalStatus
## Default 1 2 3 Sum
## 0 34.75 42.15 0.79 77.69
## 1 10.78 11.25 0.28 22.31
## Sum 45.53 53.40 1.07 100.00
Part 3: DESCRIPTIVE STATISTICS OF VARIABLES (CreditLimit, BillOutstanding, LastPayment, Age)
Part 3a: Descriptive Statistics by Defaulters
# loading the package
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# make a grouping variable
group <- group_by(CCdefault.dt, Default)
# summrising by grouping variables
tabA <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))
# saving table as data frame
as.data.frame(tabA)
## Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1 No 22996 178300 131877 51716
## 2 Yes 6605 130125 115424 48315
## sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge 1
## 1 73278 6306 18063 35.4 9 1
## 2 73636 3366 9360 35.7 10 1
## RelativeProportions
## 1 0.7769
## 2 0.2231
Part 3b: Descriptive Statistics by Gender & Defaulters
# loading the package
library(dplyr)
# make a grouping variable
group <- group_by(CCdefault.dt,Male, Default)
# summrising by grouping variables
tabD <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))
as.data.frame(tabD)
## Male Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1 0 No 14111 179925 126673 49529
## 2 0 Yes 3744 133384 113497 46638
## 3 1 No 8885 175719 139712 55191
## 4 1 Yes 2861 125861 117782 50510
## sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge 1
## 1 70327 6272 18617 34.8 9 1
## 2 71925 3330 8761 34.9 10 1
## 3 77611 6360 17147 36.4 9 1
## 4 75773 3412 10091 36.8 10 1
## RelativeProportions
## 1 0.7903
## 2 0.2097
## 3 0.7564
## 4 0.2436
Part 3c: Descriptive Statistics by Education & Defaulters
# loading the package
library(dplyr)
# make a grouping variable
group <- group_by(CCdefault.dt,Education, Default)
# summrising by grouping variables
tabB <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))
as.data.frame(tabB)
## Education Default count meanCreditLimit sdCreditLimit
## 1 1 No 8545 221237 136019
## 2 1 Yes 2036 178237 127546
## 3 2 No 10695 157768 123637
## 4 2 Yes 3329 112577 103365
## 5 3 No 3640 136327 117213
## 6 3 Yes 1233 97979 99833
## 7 4 No 116 225517 112128
## 8 4 Yes 7 144286 68522
## meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1 49221 78579 7535 21612
## 2 47257 78867 3627 7881
## 3 54373 71164 5644 15929
## 4 51190 73927 3268 10279
## 5 49687 65650 5391 14503
## 6 42236 62742 3188 9017
## 7 54345 79787 5484 10289
## 8 59382 78472 4904 4448
## meanAge sdAge 1 RelativeProportions
## 1 34.1 8 1 0.8076
## 2 34.6 9 1 0.1924
## 3 34.7 9 1 0.7626
## 4 34.7 9 1 0.2374
## 5 40.3 10 1 0.7470
## 6 40.2 11 1 0.2530
## 7 33.8 8 1 0.9431
## 8 34.9 10 1 0.0569
Part 3d: Descriptive Statistics by MaritalStatus & Defaulters
# loading the package
library(dplyr)
# make a grouping variable
group <- group_by(CCdefault.dt,MaritalStatus,Default)
# summrising by grouping variables
tabC <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age)),1) %>%
mutate(RelativeProportions = round(count / sum(count),4))
as.data.frame(tabC)
## MaritalStatus Default count meanCreditLimit sdCreditLimit
## 1 1 No 10285 194209 134855
## 2 1 Yes 3192 143192 121499
## 3 2 No 12477 166539 128184
## 4 2 Yes 3329 119032 108419
## 5 3 No 234 106154 100487
## 6 3 Yes 84 73214 80033
## meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1 53469 76800 6482 17655
## 2 50756 76177 3503 9854
## 3 50438 70464 6124 16792
## 4 46209 71476 3233 8889
## 5 42829 58650 8216 57764
## 6 39005 54746 3389 8217
## meanAge sdAge 1 RelativeProportions
## 1 39.9 9 1 0.7632
## 2 40.3 9 1 0.2368
## 3 31.5 7 1 0.7894
## 4 31.2 8 1 0.2106
## 5 42.6 9 1 0.7358
## 6 43.5 10 1 0.2642
Part 4: DATA VISUALIZATION
Part 4a: VISULAIZATION (DISCRETE VARIABLES)
Part 4a1: Bar Chart for Defaulters
# percentage of defaulters
tab1 <- round(prop.table(table(Default))*100,2)
# bar-plot
bp <- barplot(tab1,
xlab = "Default (Yes/No)", ylab = "Percent (%)",
main = "Percentage of Defaulters",
col = c("lightblue","red"),
legend = rownames(tab1),
beside = TRUE,
ylim = c(0, 90))
text(bp, 0, round(tab1, 1),cex=1,pos=3)

Part 4a2: Bar Chart For Defaulters By Gender
# Percentage of defaulters by Gender
tab2 <- round(prop.table(table(Default,Male))*100,2)
# bar-plot
bp <- barplot(tab2, beside = TRUE, main = "Bar Chart For Defaulters By Gender",
col = c("lightblue", "mistyrose"),
xlab = "Male",
ylab = "Percent (%)", legend = c("No", "Yes"),
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab2, 1),cex=1,pos=3)

Part 4a3: Bar Chart For Defaulters By Education
# percentage of defaulters by Education
tab3 <- round(prop.table(table(Default,Education))*100,2)
bp <- barplot(tab3, beside = TRUE, main = "Bar Chart For Defaulters By Education",
col = c("lightblue", "mistyrose"),
xlab = "Education",
ylab = "Percent (%)", legend = c("No", "Yes"),
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab3, 1),cex=1,pos=3)

Part 4a4: Bar Chart For Defaulters By MaritalStatus
# Percentage of defaulters by MaritalStatus
tab4 <- round(prop.table(table(Default,MaritalStatus))*100,2)
bp <- barplot(tab4, beside = TRUE, main = "Bar Chart For Defaulters By Marital Status",
col = c("lightblue", "mistyrose"),
xlab = "Marital Status",
ylab = "Percent (%)", legend = c("No", "Yes"),
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab4, 1),cex=1,pos=3)

Part 4b: VISULAIZATION (CONTINUOUS VARIABLES)
Part 4b1: Box plots For Variable CreditLimit Grouped By Default (Yes/No)
# loading the package
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# plotting box plots
p <- ggplot(CCdefault.dt, aes(x = Default, y = CreditLimit)) + ylab("CreditLimit (NT$)") + geom_boxplot()
p + labs(title="CreditLimit of Defaulters and Others")

Part 4b2: Box plots For The Variable BillOutstanding Grouped By Default (Yes/No)
# loading the package
library(ggplot2)
# plotting box plots
p <- ggplot(CCdefault.dt, aes(x = Default, y = BillOutstanding)) + ylab("BillOutstanding (NT$)") + geom_boxplot()
p + labs(title=" BillOutstanding of defaulters and Others")

Part 4b3: Box plots For The Variable LastPayment Grouped By Default (Yes/No)
# loading the package
library(ggplot2)
# plotting box plots
p <- ggplot(CCdefault.dt, aes(x = Default, y = LastPayment)) +
ylab("LastPayment (NT$)") + geom_boxplot()
p + labs(title="LastPayment of Defaulters and Others")

Part 4c: SCATTER PLOTS
Part 4c1: Display a Scatter Plot Between CreditLimit And BillOutstanding
# loading the package
library(ggplot2)
# plotting scatter plot using ggplot2
p <- ggplot(CCdefault.dt, aes(x = BillOutstanding, y = CreditLimit)) +
ylab("Credit Limit (NT$)") + geom_point()
p + labs(x = "BillOutstanding (NT$)") +
labs(title="Scatter Plot Between CreditLimit And BillOutstanding")

Part 4c2: Display a Scatter Plot Between CreditLimit And BillOutstanding Categorised by Defaulters
# loading the package
library(ggplot2)
# plotting scatter plot
p <- ggplot(CCdefault.dt, aes(x = BillOutstanding, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "BillOutstanding (NT$)") +
labs(title="Scatter Plot Between CreditLimit And BillOutstanding By Defaulters")

Part 4c3: Display a Scatter plot between CreditLimit And LastPayment
# loading the package
library(ggplot2)
# plotting scatter plot
p <- ggplot(CCdefault.dt, aes(x = LastPayment, y = CreditLimit)) +
ylab("Credit Limit (NT$)") + geom_point()
p + labs(x = "Last Payment (NT$)") +
labs(title="Scatter plot between CreditLimit And LastPayment")

Part 4c4: Display a Scatter Plot Between CreditLimit And LastPayment Categorised by Defaulters
# loading the package
library(ggplot2)
# plotting scatter
p <- ggplot(CCdefault.dt, aes(x = LastPayment, y = CreditLimit,linetype = Default))+
ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "LastPayment (NT$)") +
labs(title="Scatter Plot Between CreditLimit And LastPayment By Defaulters")

Part 4c5: Display a Scatter plot between CreditLimit And Age
# loading the package
library(ggplot2)
# plotting scatter plot
p <- ggplot(CCdefault.dt, aes(x = Age, y = CreditLimit)) +
ylab("Credit Limit (NT$)") + geom_point()
p + labs(x = "Age (Years)") +
labs(title="Scatter plot between CreditLimit And Age")

Part 4c6: Display a Scatter Plot Between CreditLimit And Age Categorised by Defaulters
# loading the package
library(ggplot2)
# plotting scatter
p <- ggplot(CCdefault.dt, aes(x = Age, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() +
scale_linetype_manual(values=c("dashed", "solid")) +
geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "Age (Years)") +
labs(title="Scatter Plot Between CreditLimit And Age By Defaulters")

Part 4d: SCATTERPLOT MATRIX
# basic scatterplot matrix
pairs(~ CreditLimit + BillOutstanding + LastPayment + Age ,data = CCdefault.dt,
main="Simple Scatterplot Matrix")

Part 5: Data Preparation
Part 5a: Check the order of levels of the target variable
# levels of the target variable
levels(CCdefault.dt$Default)
## [1] "No" "Yes"
Part 5b: Reset the order of levels of the target variable, as (“Event” = Yes, “No Event” = No)
# ordering the levels
CCdefault.dt$Default <- ordered(CCdefault.dt$Default, levels = c("Yes", "No"))
# verifying the new order of levels
levels(CCdefault.dt$Default)
## [1] "Yes" "No"
Part 5c: Splitting the Dataset into the Training set and Test set
library(caTools)
# get the same split when you re-run the code
set.seed(123)
# splitting the data set into ratio 0.80:0.20
split <- sample.split(CCdefault.dt$Default, SplitRatio = 0.80)
# create the training dataset
trainingSet <- subset(CCdefault.dt, split == TRUE)
# create the testing dataset
testSet <- subset(CCdefault.dt, split == FALSE)
Part 5d: Verify Training & Testing Datasets
# dimension of training dataset
dim(trainingSet)
## [1] 23681 9
# dimension of testing dataset
dim(testSet)
## [1] 5920 9
Part 5e: Verify Proportion of “Event (default = Yes)”
# proportion of defaulters in training dataset
round(prop.table(table(trainingSet$Default))*100,2)
##
## Yes No
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testSet$Default))*100,2)
##
## Yes No
## 22.31 77.69
Part 6: Run the Machine Learning algorithm – kNN
Part 6a: Model Building Using kNN
library(caret)
## Loading required package: lattice
# Set control parameters
trctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(123)
# Run kNN Classifier in package caret
knn_fit <- train(Default ~ .,
data = trainingSet,
method = "knn",
trControl = trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
# kNN model summary
knn_fit
## k-Nearest Neighbors
##
## 23681 samples
## 8 predictor
## 2 classes: 'Yes', 'No'
##
## Pre-processing: centered (11), scaled (11)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 21313, 21313, 21312, 21313, 21313, 21314, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7332323 0.04144949
## 7 0.7466466 0.03837324
## 9 0.7524177 0.02828311
## 11 0.7588644 0.02824895
## 13 0.7629885 0.02539935
## 15 0.7660992 0.02232876
## 17 0.7686470 0.01989299
## 19 0.7703924 0.01715124
## 21 0.7719830 0.01592183
## 23 0.7721380 0.01225606
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
Part 6b: Testing the kNN Model
# predicting the test set observations
kNNPred <- predict(knn_fit, testSet, type = "raw")
Part 6c: Confusion Matrix
# confusion matrix
table(Predicted = kNNPred, Actual = testSet$Default)
## Actual
## Predicted Yes No
## Yes 26 50
## No 1295 4549
Part 7: Run the Machine Learning algorithm – Logistic Regression
Part 7a: Model Building Using Logistic Regression
# fit logistic regression model
logitModel <- glm(Default ~
CreditLimit
+ Male
+ Education
+ MaritalStatus
+ Age
+ BillOutstanding
+ LastPayment,
data = trainingSet,
family = binomial())
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = Default ~ CreditLimit + Male + Education + MaritalStatus +
## Age + BillOutstanding + LastPayment, family = binomial(),
## data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0795 0.3568 0.6467 0.7761 0.9938
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.746e-01 8.675e-02 8.929 < 2e-16 ***
## CreditLimit 3.363e-06 1.631e-07 20.616 < 2e-16 ***
## Male1 -1.526e-01 3.254e-02 -4.690 2.73e-06 ***
## Education2 -3.524e-03 3.778e-02 -0.093 0.92569
## Education3 1.038e-02 5.018e-02 0.207 0.83616
## Education4 1.500e+00 4.615e-01 3.251 0.00115 **
## MaritalStatus2 2.282e-01 3.674e-02 6.212 5.23e-10 ***
## MaritalStatus3 1.397e-01 1.454e-01 0.961 0.33662
## Age -3.679e-03 1.963e-03 -1.874 0.06093 .
## BillOutstanding -1.941e-06 2.632e-07 -7.374 1.65e-13 ***
## LastPayment 3.090e-05 3.168e-06 9.754 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24199 on 23670 degrees of freedom
## AIC: 24221
##
## Number of Fisher Scoring iterations: 6
Part 7b: Testing the Model
# predicting the test set observations
logitModelPred <- predict(logitModel, testSet, type = "response")
Part 7c: Plotting the Predicted Probabilities
# plot of probabilities
plot(logitModelPred,
main = "Scatterplot of Probabilities of Default (test data)",
xlab = "Customer ID", ylab = "Predicted Probability of Default")

Part 7d: Confusion Matrix
# setting the cut-off probablity
classify50 <- ifelse(logitModelPred > 0.5,"Yes","No")
# ordering the levels
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testSet$Default <- ordered(testSet$Default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify50, Actual = testSet$Default)
cm
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
Part 7e: Confusion Matrix using carat package
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
##
## Accuracy : 0.2231
## 95% CI : (0.2126, 0.234)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.2231
## Neg Pred Value : NaN
## Prevalence : 0.2231
## Detection Rate : 0.2231
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
Part 7f: Measuring Machine Learning Metrics at different Cut-off Probabilities
library(caret)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
# predicting the test set results
logitModelPred <- predict(logitModel, testSet, type = "response")
C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
C2 <- testSet$Default
predY <- as.factor(C1)
actualY <- as.factor(C2)
predY <- ordered(predY, levels = c("Yes", "No"))
actualY <- ordered(actualY, levels = c("Yes", "No"))
# use the confusionMatrix from the caret package
cm1 <-confusionMatrix(table(predY,actualY))
# extracting accuracy
Accuracy <- cm1$overall[1]
# extracting sensitivity
Sensitivity <- cm1$byClass[1]
# extracting specificity
Specificity <- cm1$byClass[2]
# extracting value of kappa
Kappa <- cm1$overall[2]
# combined table
tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
return(tab)}
# making sequence of cut-off probabilities
cutoff1 <- seq( .1, .9, by = .05 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# extra coding for saving table as desired format
tab3 <- rbind(tab2[[1]],tab2[[2]],tab2[[3]],tab2[[4]],tab2[[5]],tab2[[6]],tab2[[7]],
tab2[[8]],tab2[[9]],tab2[[10]],tab2[[11]],tab2[[12]],tab2[[13]],tab2[[14]],
tab2[[15]],tab2[[16]],tab2[[17]])
tab3
## Accuracy Sensitivity Specificity Kappa
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2293919 0.95457986 0.02109154 -0.01106665
## Accuracy 0.2790541 0.73959122 0.14677104 -0.05780682
## Accuracy 0.3846284 0.43149129 0.37116765 -0.12508918
## Accuracy 0.5130068 0.24072672 0.59121548 -0.13590326
## Accuracy 0.6339527 0.11127933 0.78408350 -0.11000896
## Accuracy 0.7113176 0.03028009 0.90693629 -0.08155029
Part 7g: ROC Curve
# loading the package
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(gplots)
PredLR <- predict(logitModel, testSet, type = "response")
lgPredObj <- prediction(PredLR, testSet$Default)
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

Part 7h: AUC (Area Under The Curve)
# area under curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.6265644