Part 1

# reading external data and storing it in a data.table called "CCdefault.dt"
setwd("d:/IIML/Term 4/MLM/Session 9")
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attaching the data (optional here; later code references columns via CCdefault.dt)
attach(CCdefault.dt)

# dimension of the data table
dim(CCdefault.dt)
## [1] 29601     9
# column names
colnames(CCdefault.dt)
## [1] "Id"              "CreditLimit"     "Male"            "Education"      
## [5] "MaritalStatus"   "Age"             "BillOutstanding" "LastPayment"    
## [9] "Default"
# structure of the data table
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
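Before converting types it can help to confirm the table has no missing values; a quick sketch:

# number of NA values in each column
sapply(CCdefault.dt, function(x) sum(is.na(x)))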
# convert 'Id' as a factor
CCdefault.dt[, Id := as.factor(Id)]
# convert 'Male' as a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault.dt[, Default := as.factor(Default)]
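The five conversions above can also be collapsed into a single step with the data.table .SD idiom; a minimal equivalent sketch:

# convert all categorical columns to factor in one step
factorCols <- c("Id", "Male", "Education", "MaritalStatus", "Default")
CCdefault.dt[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]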


# Changing the levels of the 'Default' variable from "0"/"1" to "No"/"Yes"
levels(CCdefault.dt$Default) <- c("No","Yes")

# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# levels of the target variable
levels(CCdefault.dt$Default)
## [1] "No"  "Yes"
# reordering the levels so "Yes" comes first
CCdefault.dt$Default <- ordered(CCdefault.dt$Default, levels = c("Yes", "No"))

# verifying the new order of levels
levels(CCdefault.dt$Default)
## [1] "Yes" "No"

Part 2

library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
# data partition
set.seed(2341)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)

# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]

# 20% testing data
testData.dt <- CCdefault.dt[-trainIndex, ]

# dimension of training dataset
dim(trainData.dt)
## [1] 23681     9
# dimension of testing dataset
dim(testData.dt)
## [1] 5920    9
# proportion of defaulters in training dataset
round(prop.table(table(trainData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69
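The train and test mixes match because createDataPartition samples within each level of the supplied outcome (stratified splitting). The proportion check can be wrapped in a small helper (classProp is a hypothetical name):

# hypothetical helper: class percentages of a factor, rounded to 2 decimals
classProp <- function(x) round(prop.table(table(x)) * 100, 2)
classProp(trainData.dt$Default)
classProp(testData.dt$Default)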

Part 3

# fit logistic regression model on all predictors, dropping the Id column
logitModel <- glm(Default ~ ., data = trainData.dt[, -c(1)], family = binomial())
# summary of the logistic regression model 
summary(logitModel)
## 
## Call:
## glm(formula = Default ~ ., family = binomial(), data = trainData.dt[, 
##     -c(1)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2345   0.3797   0.6489   0.7746   0.9939  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      7.914e-01  8.671e-02   9.127  < 2e-16 ***
## CreditLimit      3.290e-06  1.621e-07  20.290  < 2e-16 ***
## Male1           -1.755e-01  3.250e-02  -5.400 6.65e-08 ***
## Education2      -5.101e-03  3.766e-02  -0.135    0.892    
## Education3      -9.346e-03  5.003e-02  -0.187    0.852    
## Education4       9.785e-01  3.982e-01   2.457    0.014 *  
## MaritalStatus2   2.140e-01  3.670e-02   5.833 5.46e-09 ***
## MaritalStatus3   1.877e-01  1.497e-01   1.254    0.210    
## Age             -2.805e-03  1.980e-03  -1.417    0.157    
## BillOutstanding -1.839e-06  2.571e-07  -7.154 8.43e-13 ***
## LastPayment      2.471e-05  2.851e-06   8.666  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25142  on 23680  degrees of freedom
## Residual deviance: 24276  on 23670  degrees of freedom
## AIC: 24298
## 
## Number of Fisher Scoring iterations: 6
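One caution when reading this table: because the outcome levels were reordered to c("Yes", "No"), glm takes the second level, "No", as the modeled event, so the coefficients are log-odds of not defaulting. Exponentiating puts them on the odds scale; a sketch with Wald intervals:

# odds ratios (for the event Default = "No") with Wald 95% confidence intervals
exp(cbind(OddsRatio = coef(logitModel), confint.default(logitModel)))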

Part 4

# predicting the test set observations
logitModelPred <- predict(logitModel, testData.dt, type = "response")

plot(logitModelPred, 
     main = "Scatterplot of Predicted Probabilities (test data)", 
     xlab = "Test Set Observation Index", ylab = "Predicted Probability")

Part 5

# setting the cut-off probability at 0.5
classify50 <- ifelse(logitModelPred > 0.5,"Yes","No")

# aligning both factors to the level order c("Yes", "No")
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify50, Actual = testData.dt$Default)
cm
##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0
##                                          
##                Accuracy : 0.2231         
##                  95% CI : (0.2126, 0.234)
##     No Information Rate : 0.7769         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.2231         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.2231         
##          Detection Rate : 0.2231         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : Yes            
##
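The matrix above classifies the entire test set as "Yes": logitModelPred is P(Default = "No"), every test observation has P(No) > 0.5 (all 5920 of them), and the ifelse rule labels those cases "Yes". Accuracy therefore collapses to 0.2231, the prevalence of actual defaulters. A corrected sketch that sends high P(No) to the "No" label:

# high P(No) should be classified as "No", not "Yes"
classify50Fixed <- ifelse(logitModelPred > 0.5, "No", "Yes")
classify50Fixed <- ordered(classify50Fixed, levels = c("Yes", "No"))
table(Predicted = classify50Fixed, Actual = testData.dt$Default)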