#PART 1: Read the data

library(data.table)
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attaching the data
attach(CCdefault.dt)

# dimension of the data table
dim(CCdefault.dt)
## [1] 29601     9
# column names
colnames(CCdefault.dt)
## [1] "Id"              "CreditLimit"     "Male"            "Education"      
## [5] "MaritalStatus"   "Age"             "BillOutstanding" "LastPayment"    
## [9] "Default"

#PART 2: Verifying data type structure

# structure of the dataframe
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# convert 'Id' as a factor
#CCdefault.dt[, Id := as.factor(Id)]
# convert 'Male' as a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault.dt[, Default := as.factor(Default)]


# Changing the lavels of 'Default' variable
levels(CCdefault.dt$Default) <- c("No","Yes")

# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

#PART 3: Check / Reset the Levels of the target variable

# levels of the target variable
levels(CCdefault.dt$Default)
## [1] "No"  "Yes"
# ordering the levels
CCdefault.dt$Default <- ordered(CCdefault.dt$Default, levels = c("Yes", "No"))

# verifying the new order of levels
levels(CCdefault.dt$Default)
## [1] "Yes" "No"

#PART 4: Split the data into a training set (80%) and a testing set (20%)

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
# data partition
set.seed(2341)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)

# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]

# 20% testing data
testData.dt <- CCdefault.dt[-trainIndex, ]

#PART 5: Verify the Split

# dimension of training dataset
dim(trainData.dt)
## [1] 23681     9
# dimension of testing dataset
dim(testData.dt)
## [1] 5920    9
# proportion of defaulters in training dataset
round(prop.table(table(trainData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69

#Run the Machine Learning algorithm – Logistic Regression

# fit logistic regression model 
 logitModel <- glm(Default ~ CreditLimit + Male + Education + MaritalStatus  + Age+ BillOutstanding + LastPayment, 
                        data = trainData.dt, 
                        family = binomial())
# summary of the logistic regression model 
summary(logitModel)
## 
## Call:
## glm(formula = Default ~ CreditLimit + Male + Education + MaritalStatus + 
##     Age + BillOutstanding + LastPayment, family = binomial(), 
##     data = trainData.dt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.5626   0.3635   0.6509   0.7763   0.9929  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      8.005e-01  8.678e-02   9.225  < 2e-16 ***
## CreditLimit      3.195e-06  1.616e-07  19.768  < 2e-16 ***
## Male1           -1.538e-01  3.254e-02  -4.726 2.29e-06 ***
## Education2      -1.569e-02  3.767e-02  -0.417  0.67696    
## Education3       2.036e-02  5.043e-02   0.404  0.68636    
## Education4       1.468e+00  4.619e-01   3.179  0.00148 ** 
## MaritalStatus2   2.066e-01  3.666e-02   5.637 1.73e-08 ***
## MaritalStatus3   1.454e-01  1.481e-01   0.982  0.32618    
## Age             -3.318e-03  1.971e-03  -1.683  0.09235 .  
## BillOutstanding -1.925e-06  2.595e-07  -7.417 1.20e-13 ***
## LastPayment      2.989e-05  3.112e-06   9.606  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25142  on 23680  degrees of freedom
## Residual deviance: 24260  on 23670  degrees of freedom
## AIC: 24282
## 
## Number of Fisher Scoring iterations: 6
# predicting the test set observations
logitModelPred <- predict(logitModel, testData.dt, type = "response")

# plot of probabilities
plot(logitModelPred, 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

# setting the cut-off probablity
classify50 <- ifelse(logitModelPred > 0.8,"Yes","No")

# ordering the levels
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify50, Actual = testData.dt$Default)
cm
##          Actual
## Predicted  Yes   No
##       Yes  264 1791
##       No  1057 2808
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes  264 1791
##       No  1057 2808
##                                           
##                Accuracy : 0.5189          
##                  95% CI : (0.5061, 0.5317)
##     No Information Rate : 0.7769          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.1582         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.19985         
##             Specificity : 0.61057         
##          Pos Pred Value : 0.12847         
##          Neg Pred Value : 0.72652         
##              Prevalence : 0.22314         
##          Detection Rate : 0.04459         
##    Detection Prevalence : 0.34713         
##       Balanced Accuracy : 0.40521         
##                                           
##        'Positive' Class : Yes             
## 
library(caret)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {

       # predicting the test set results
         logitModelPred <- predict(logitModel, testData.dt, type = "response")
         C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
         C2 <- testData.dt$Default
         predY   <- as.factor(C1)
         actualY <- as.factor(C2)

      predY <- ordered(predY, levels = c("Yes", "No"))
      actualY <- ordered(actualY, levels = c("Yes", "No"))

        # use the confusionMatrix from the caret package
        cm1 <-confusionMatrix(table(predY,actualY))
        # extracting accuracy
        Accuracy <- cm1$overall[1]
        # extracting sensitivity
          Sensitivity <- cm1$byClass[1]
        # extracting specificity
          Specificity <- cm1$byClass[2]
      # extracting value of kappa
          Kappa <- cm1$overall[2]

        # combined table
          tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
        return(tab)}
     # making sequence of cut-off probabilities       
        cutoff1 <- seq( .1, .9, by = .05 )
     # loop using "lapply"
        tab2    <- lapply(cutoff1, CmFn)
     # extra coding for saving table as desired format
        tab3 <- rbind(tab2[[1]],tab2[[2]],tab2[[3]],tab2[[4]],tab2[[5]],tab2[[6]],tab2[[7]],
                 tab2[[8]],tab2[[9]],tab2[[10]],tab2[[11]],tab2[[12]],tab2[[13]],tab2[[14]],
                 tab2[[15]],tab2[[16]],tab2[[17]])
        tab3
##           Accuracy Sensitivity Specificity       Kappa
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2263514  0.95836488  0.01609045 -0.01158003
## Accuracy 0.2673986  0.73277820  0.13372472 -0.06743714
## Accuracy 0.3836149  0.42770628  0.37095021 -0.12771359
## Accuracy 0.5189189  0.19984860  0.61056751 -0.15824770
## Accuracy 0.6457770  0.08629826  0.80647967 -0.11724933
## Accuracy 0.7197635  0.02952309  0.91802566 -0.06939397
# trainData.dt
# testData.dt

        # loading the package
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
PredLR1 <- predict(logitModel, testData.dt,type = "response")
lgPredObj1 <- prediction(PredLR1,testData.dt$Default)
lgPerfObj1 <- performance(lgPredObj1, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj1,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# area under curve
aucLR1 <- performance(lgPredObj1, measure = "auc")
aucLR1 <- aucLR1@y.values[[1]]
aucLR1
## [1] 0.6410443