Part 1

# reading external data and storing into a dataframe called "cc.df"
setwd("d:/IIML/Term 4/MLM/Session 9")
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
# reading data as data.table
CCdefault_dt <- read.csv("MCICreditCardDefault.csv")
CCdefault_dt<-data.table(CCdefault_dt)
# CCdefault_dt<-CCdefault_dt[,-c(1)]
# attaching the data
attach(CCdefault_dt)

# dimension of the data table
dim(CCdefault_dt)
## [1] 29601     9
# column names
colnames(CCdefault_dt)
## [1] "Id"              "CreditLimit"     "Male"            "Education"      
## [5] "MaritalStatus"   "Age"             "BillOutstanding" "LastPayment"    
## [9] "Default"
# structure of the dataframe
str(CCdefault_dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# convert 'Id' as a factor
CCdefault_dt[, Id := as.factor(Id)]
# convert 'Male' as a factor
CCdefault_dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault_dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault_dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault_dt[, Default := as.factor(Default)]


# Changing the lavels of 'Default' variable
levels(CCdefault_dt$Default) <- c("No","Yes")

# verifying conversion
str(CCdefault_dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# levels of the target variable
levels(CCdefault_dt$Default)
## [1] "No"  "Yes"
# ordering the levels
CCdefault_dt$Default <- ordered(CCdefault_dt$Default, levels = c("Yes", "No"))

# verifying the new order of levels
levels(CCdefault_dt$Default)
## [1] "Yes" "No"
CCdefault_dt<-CCdefault_dt[,-c(1)]

Part 2

library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
# data partition
set.seed(2341)
trainIndex <- createDataPartition(CCdefault_dt$Default, p = 0.80, list = FALSE)

# 80% training data
trainData_dt <- CCdefault_dt[trainIndex, ]

# 20% testing data
testData_dt <- CCdefault_dt[-trainIndex, ]

# dimension of training dataset
dim(trainData_dt)
## [1] 23681     8
# dimension of testing dataset
dim(testData_dt)
## [1] 5920    8
# proportion of defaulters in training dataset
round(prop.table(table(trainData_dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData_dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69

KNN

library(caret)
# Set control parameters
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
set.seed(3333)

# Run kNN Classifier in package caret
knn_fit  <- train(Default ~ .,data = trainData_dt,method = "knn",trControl = trctrl,preProcess = c("center", "scale"),
                         tuneLength = 10)
# kNN model summary
knn_fit
## k-Nearest Neighbors 
## 
## 23681 samples
##     7 predictor
##     2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21313, 21312, 21312, 21312, 21313, 21314, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.7390734  0.06149083
##    7  0.7510663  0.05990479
##    9  0.7567252  0.04618397
##   11  0.7610041  0.03788105
##   13  0.7642978  0.03263982
##   15  0.7669441  0.03063838
##   17  0.7689851  0.02874293
##   19  0.7699705  0.02067016
##   21  0.7708854  0.01642895
##   23  0.7715469  0.01430021
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.

Testing the KNN model

# predicting the test set observations
kNNPred <- predict(knn_fit, testData_dt, type = "prob")

# setting the cut-off probablity
classify20 <- ifelse(kNNPred[,1] > 0.2,"Yes","No")

# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
testData_dt$Default <- ordered(testData_dt$Default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify20, Actual = testData_dt$Default)
cm
##          Actual
## Predicted  Yes   No
##       Yes  956 2424
##       No   365 2175
library(caret)
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.5.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes  956 2424
##       No   365 2175
##                                           
##                Accuracy : 0.5289          
##                  95% CI : (0.5161, 0.5417)
##     No Information Rate : 0.7769          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1264          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7237          
##             Specificity : 0.4729          
##          Pos Pred Value : 0.2828          
##          Neg Pred Value : 0.8563          
##              Prevalence : 0.2231          
##          Detection Rate : 0.1615          
##    Detection Prevalence : 0.5709          
##       Balanced Accuracy : 0.5983          
##                                           
##        'Positive' Class : Yes             
## 
knnPredObj <- prediction(kNNPred[,1],testData_dt$Default)
knnPerfObj <- performance(knnPredObj, "tpr","fpr")
# plotting ROC curve
plot(knnPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# auc for knn
knn_pred<-prediction(kNNPred[,1],testData_dt$Default)
aucknn <- performance(knn_pred, measure = "auc")
aucknn <- aucknn@y.values[[1]]
aucknn
## [1] 0.3755618

Logistic Regression

# model building using caret package
set.seed(766)
# fit logistic regression model 
 logitModel <- glm(Default ~., data = trainData_dt,family = binomial())
# summary of the logistic regression model 
summary(logitModel)
## 
## Call:
## glm(formula = Default ~ ., family = binomial(), data = trainData_dt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2345   0.3797   0.6489   0.7746   0.9939  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      7.914e-01  8.671e-02   9.127  < 2e-16 ***
## CreditLimit      3.290e-06  1.621e-07  20.290  < 2e-16 ***
## Male1           -1.755e-01  3.250e-02  -5.400 6.65e-08 ***
## Education2      -5.101e-03  3.766e-02  -0.135    0.892    
## Education3      -9.346e-03  5.003e-02  -0.187    0.852    
## Education4       9.785e-01  3.982e-01   2.457    0.014 *  
## MaritalStatus2   2.140e-01  3.670e-02   5.833 5.46e-09 ***
## MaritalStatus3   1.877e-01  1.497e-01   1.254    0.210    
## Age             -2.805e-03  1.980e-03  -1.417    0.157    
## BillOutstanding -1.839e-06  2.571e-07  -7.154 8.43e-13 ***
## LastPayment      2.471e-05  2.851e-06   8.666  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25142  on 23680  degrees of freedom
## Residual deviance: 24276  on 23670  degrees of freedom
## AIC: 24298
## 
## Number of Fisher Scoring iterations: 6

Testing Logistic Regression

# predicting the test set observations
logitModelPred <- predict(logitModel, testData_dt, type = "response")


# setting the cut-off probablity
classify50 <- ifelse(logitModelPred > 0.2,"Yes","No")

# ordering the levels
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testData_dt$Default <- ordered(testData_dt$Default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify50, Actual = testData_dt$Default)
cm
##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0
##                                          
##                Accuracy : 0.2231         
##                  95% CI : (0.2126, 0.234)
##     No Information Rate : 0.7769         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.2231         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.2231         
##          Detection Rate : 0.2231         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : Yes            
## 
lgPredObj <- prediction(logitModelPred,testData_dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# auc for logistic regression
library(ROCR)
logit_pred<-prediction(logitModelPred,testData_dt$Default)
auclogit <- performance(logit_pred, measure = "auc")
auclogit <- auclogit@y.values[[1]]
auclogit
## [1] 0.6396441