Part 1
# Load the credit-card default data into a data.table called `CCdefault.dt`.
# NOTE(review): the absolute working directory below is machine-specific. It is
# kept because other steps may resolve files relative to it; consider a
# project-relative path instead.
setwd("d:/IIML/Term 4/MLM/Session 9")
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
# fread() returns a data.table by default
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attach(CCdefault.dt) is intentionally not called: the statements visible in
# this script always reach columns through `CCdefault.dt`, and an attached copy
# would keep exposing the raw integer columns even after they are converted to
# factors further down.
# First look at the freshly loaded table.
# number of rows and columns
dim(x = CCdefault.dt)
## [1] 29601 9
# names of the columns
names(CCdefault.dt)
## [1] "Id" "CreditLimit" "Male" "Education"
## [5] "MaritalStatus" "Age" "BillOutstanding" "LastPayment"
## [9] "Default"
# compact per-column overview (type and leading values)
str(object = CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Recode the identifier and the categorical predictors as factors with one
# by-reference update instead of one `:=` call per column.
factorCols <- c("Id", "Male", "Education", "MaritalStatus")
CCdefault.dt[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]
# Recode the 0/1 target as a factor labelled "No"/"Yes".
# The mapping 0 -> "No", 1 -> "Yes" is spelled out explicitly rather than
# relabelling whatever levels as.factor() happened to produce, so the labels
# cannot be attached to the wrong value if a class is missing from the data.
# Doing it inside `[` with `:=` also keeps the update by reference.
CCdefault.dt[
  ,
  Default := factor(Default, levels = c(0, 1), labels = c("No", "Yes"))
]
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Show the level order of the target before changing it.
levels(CCdefault.dt[["Default"]])
## [1] "No" "Yes"
# Re-declare the target as an ordered factor whose first level is "Yes".
# NOTE(review): level order matters downstream - e.g. a binomial glm() treats
# the first level of a factor response as the "failure" class - so confirm that
# later steps really expect "Yes" to come first.
CCdefault.dt$Default <- factor(
  CCdefault.dt$Default,
  levels = c("Yes", "No"),
  ordered = TRUE
)
# Confirm the new level order.
levels(CCdefault.dt[["Default"]])
## [1] "Yes" "No"
Part 2
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
# Split the data 80/20 into training and test sets, using the target column as
# the outcome passed to caret's createDataPartition(); the seed makes the split
# reproducible.
set.seed(2341)
trainIndex <- createDataPartition(
  y = CCdefault.dt$Default,
  p = 0.80,
  list = FALSE
)
# rows selected by the partition -> training set
trainData.dt <- CCdefault.dt[trainIndex, ]
# all remaining rows -> test set
testData.dt <- CCdefault.dt[-trainIndex, ]
# size of the training set
dim(x = trainData.dt)
## [1] 23681 9
# size of the test set
dim(x = testData.dt)
## [1] 5920 9
# class shares (in percent, two decimals) in the training set
round(100 * prop.table(table(trainData.dt$Default)), digits = 2)
##
## Yes No
## 22.31 77.69
# class shares (in percent, two decimals) in the test set
round(100 * prop.table(table(testData.dt$Default)), digits = 2)
##
## Yes No
## 22.31 77.69
Part 3
# fit logistic regression model
# Default is regressed on all remaining columns of the training data;
# `[,-c(1)]` drops the first column (Id) so the per-customer identifier is not
# used as a predictor.
# NOTE(review): Default was given the level order c("Yes", "No") earlier. For a
# factor response, binomial glm() treats the first level as failure and the
# other as success, so this model estimates the probability of "No" (no
# default), not of "Yes". Keep that in mind when reading coefficient signs and
# when using predict(type = "response").
logitModel <- glm(Default ~ .,data = trainData.dt[,-c(1)], family = binomial())
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = Default ~ ., family = binomial(), data = trainData.dt[,
## -c(1)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2345 0.3797 0.6489 0.7746 0.9939
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.914e-01 8.671e-02 9.127 < 2e-16 ***
## CreditLimit 3.290e-06 1.621e-07 20.290 < 2e-16 ***
## Male1 -1.755e-01 3.250e-02 -5.400 6.65e-08 ***
## Education2 -5.101e-03 3.766e-02 -0.135 0.892
## Education3 -9.346e-03 5.003e-02 -0.187 0.852
## Education4 9.785e-01 3.982e-01 2.457 0.014 *
## MaritalStatus2 2.140e-01 3.670e-02 5.833 5.46e-09 ***
## MaritalStatus3 1.877e-01 1.497e-01 1.254 0.210
## Age -2.805e-03 1.980e-03 -1.417 0.157
## BillOutstanding -1.839e-06 2.571e-07 -7.154 8.43e-13 ***
## LastPayment 2.471e-05 2.851e-06 8.666 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24276 on 23670 degrees of freedom
## AIC: 24298
##
## Number of Fisher Scoring iterations: 6
Part 3 (continued)
# Predicted probability of default for every test-set observation.
# Default has the level order c("Yes", "No"), and a binomial glm() models the
# probability of the non-first level, so predict(type = "response") returns
# P(Default == "No"). Take the complement so that `logitModelPred` really is
# the probability of default ("Yes") - which is what the plot labels here and
# the "above the cut-off means Yes" rule applied afterwards both assume.
# Recorded output further down that shows every test customer classified as
# "Yes" was produced before this correction and needs to be regenerated.
logitModelPred <- 1 - predict(logitModel, testData.dt, type = "response")
# The x axis is the position of the observation within the test set.
plot(logitModelPred,
     main = "Scatterplot of Probabilities of Default (test data)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

Part 4
# Turn predicted probabilities into class labels with a 0.5 cut-off:
# above the cut-off -> "Yes", otherwise "No"; stored as an ordered factor with
# "Yes" first so the table rows line up with the target's level order.
# NOTE(review): the rule is only meaningful if `logitModelPred` holds the
# probability of the "Yes" class - confirm against how the response levels
# were coded when the model was fitted.
classify50 <- factor(
  ifelse(logitModelPred > 0.5, "Yes", "No"),
  levels = c("Yes", "No"),
  ordered = TRUE
)
# make sure the observed classes use the same level order
testData.dt$Default <- factor(
  testData.dt$Default,
  levels = c("Yes", "No"),
  ordered = TRUE
)
# cross-tabulate predicted against observed classes
cm <- table(Predicted = classify50, Actual = testData.dt$Default)
cm
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
# caret provides confusionMatrix(); it reports accuracy, kappa, sensitivity,
# specificity and related statistics for the cross-tabulation `cm`.
library(caret)
confusionMatrix(data = cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
##
## Accuracy : 0.2231
## 95% CI : (0.2126, 0.234)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.2231
## Neg Pred Value : NaN
## Prevalence : 0.2231
## Detection Rate : 0.2231
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##