Part 1
# reading external data and storing into a dataframe called "cc.df"
# reading data as data.table
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.2
CCdefault_dt <- read.csv("MCICreditCardDefault.csv")
CCdefault_dt<-data.table(CCdefault_dt)
# CCdefault_dt<-CCdefault_dt[,-c(1)]
# attaching the data
attach(CCdefault_dt)
# dimension of the data table
dim(CCdefault_dt)
## [1] 29601 8
# column names
colnames(CCdefault_dt)
## [1] "CreditLimit" "Male" "Education" "MaritalStatus"
## [5] "Age" "BillOutstanding" "LastPayment" "Default"
# structure of the dataframe
str(CCdefault_dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 8 variables:
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# convert 'Id' as a factor
#CCdefault_dt[, Id := as.factor(Id)]
# convert 'Male' as a factor
CCdefault_dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault_dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault_dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault_dt[, Default := as.factor(Default)]
# Changing the lavels of 'Default' variable
levels(CCdefault_dt$Default) <- c("No","Yes")
# verifying conversion
str(CCdefault_dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 8 variables:
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# levels of the target variable
levels(CCdefault_dt$Default)
## [1] "No" "Yes"
# ordering the levels
CCdefault_dt$Default <- ordered(CCdefault_dt$Default, levels = c("Yes", "No"))
# verifying the new order of levels
levels(CCdefault_dt$Default)
## [1] "Yes" "No"
CCdefault_dt<-CCdefault_dt[,-c(1)]
Part 2
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
# data partition
set.seed(2341)
trainIndex <- createDataPartition(CCdefault_dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData_dt <- CCdefault_dt[trainIndex, ]
# 20% testing data
testData_dt <- CCdefault_dt[-trainIndex, ]
# dimension of training dataset
dim(trainData_dt)
## [1] 23681 7
# dimension of testing dataset
dim(testData_dt)
## [1] 5920 7
# proportion of defaulters in training dataset
round(prop.table(table(trainData_dt$Default))*100,2)
##
## Yes No
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData_dt$Default))*100,2)
##
## Yes No
## 22.31 77.69
KNN
library(caret)
# Set control parameters
trctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(3333)
# Run kNN Classifier in package caret
knn_fit <- train(Default ~ .,data = trainData_dt,method = "knn",trControl = trctrl,preProcess = c("center", "scale"),
tuneLength = 10)
# kNN model summary
knn_fit
## k-Nearest Neighbors
##
## 23681 samples
## 6 predictor
## 2 classes: 'Yes', 'No'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 21313, 21312, 21312, 21312, 21313, 21314, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7375254 0.037758667
## 7 0.7490676 0.026958316
## 9 0.7586814 0.030083579
## 11 0.7636500 0.022504496
## 13 0.7658883 0.014397808
## 15 0.7685909 0.010278135
## 17 0.7707867 0.008171859
## 19 0.7716875 0.003448281
## 21 0.7729544 0.002464291
## 23 0.7739960 0.001638288
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
Testing the KNN model
# predicting the test set observations
kNNPred <- predict(knn_fit, testData_dt, type = "prob")
# setting the cut-off probablity
classify20 <- ifelse(kNNPred[,1] > 0.2,"Yes","No")
# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
testData_dt$Default <- ordered(testData_dt$Default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify20, Actual = testData_dt$Default)
cm
## Actual
## Predicted Yes No
## Yes 880 2494
## No 441 2105
library(caret)
library(ROCR)
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.2
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 880 2494
## No 441 2105
##
## Accuracy : 0.5042
## 95% CI : (0.4914, 0.517)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0797
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6662
## Specificity : 0.4577
## Pos Pred Value : 0.2608
## Neg Pred Value : 0.8268
## Prevalence : 0.2231
## Detection Rate : 0.1486
## Detection Prevalence : 0.5699
## Balanced Accuracy : 0.5619
##
## 'Positive' Class : Yes
##
knnPredObj <- prediction(kNNPred[,1],testData_dt$Default)
knnPerfObj <- performance(knnPredObj, "tpr","fpr")
# plotting ROC curve
plot(knnPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# auc for knn
knn_pred<-prediction(kNNPred[,1],testData_dt$Default)
aucknn <- performance(knn_pred, measure = "auc")
aucknn <- aucknn@y.values[[1]]
aucknn
## [1] 0.4190757
Logistic Regression
# model building using caret package
set.seed(766)
# fit logistic regression model
logitModel <- glm(Default ~., data = trainData_dt,family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = Default ~ ., family = binomial(), data = trainData_dt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8122 0.4446 0.6885 0.7507 0.9207
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.186e+00 8.528e-02 13.906 < 2e-16 ***
## Male1 -2.143e-01 3.211e-02 -6.673 2.50e-11 ***
## Education2 -2.163e-01 3.602e-02 -6.004 1.92e-09 ***
## Education3 -3.023e-01 4.755e-02 -6.359 2.04e-10 ***
## Education4 9.819e-01 3.964e-01 2.477 0.0132 *
## MaritalStatus2 1.495e-01 3.614e-02 4.136 3.54e-05 ***
## MaritalStatus3 -3.373e-02 1.488e-01 -0.227 0.8206
## Age 2.719e-03 1.978e-03 1.374 0.1693
## BillOutstanding -4.428e-07 2.391e-07 -1.852 0.0640 .
## LastPayment 3.530e-05 3.061e-06 11.535 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24737 on 23671 degrees of freedom
## AIC: 24757
##
## Number of Fisher Scoring iterations: 6
Testing Logistic Regression
# predicting the test set observations
logitModelPred <- predict(logitModel, testData_dt, type = "response")
# setting the cut-off probablity
classify50 <- ifelse(logitModelPred > 0.2,"Yes","No")
# ordering the levels
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testData_dt$Default <- ordered(testData_dt$Default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify50, Actual = testData_dt$Default)
cm
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
##
## Accuracy : 0.2231
## 95% CI : (0.2126, 0.234)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.2231
## Neg Pred Value : NaN
## Prevalence : 0.2231
## Detection Rate : 0.2231
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
lgPredObj <- prediction(logitModelPred,testData_dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# auc for logistic regression
library(ROCR)
logit_pred<-prediction(logitModelPred,testData_dt$Default)
auclogit <- performance(logit_pred, measure = "auc")
auclogit <- auclogit@y.values[[1]]
auclogit
## [1] 0.6078588