# Task 3: Prepare and split data ----

# Read the credit-card default data set (CSV with a header row) from the
# current working directory.
df <- read.csv(file = "MCICreditCardDefault.csv", header = TRUE)

# Size of the data frame: number of rows, then number of columns.
dim(df)
## [1] 29601     9
# Names of the columns.
names(df)
## [1] "Id"              "CreditLimit"     "Male"            "Education"      
## [5] "MaritalStatus"   "Age"             "BillOutstanding" "LastPayment"    
## [9] "Default"
# Compact overview of every column: storage type and the first few values.
str(df)
## 'data.frame':    29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
# Convert the Default column from 0/1 integer codes into a labelled factor.

# Storage type and class before the conversion.
typeof(df$Default)
## [1] "integer"
class(df$Default)
## [1] "integer"
# Map the codes to labels in one step (0 -> "No", 1 -> "Yes").
# Going through factor() avoids overwriting part of the integer column with
# strings (which silently coerces the whole column to character) and then
# comparing those strings against the number 0. Any code other than 0 or 1
# becomes NA instead of quietly turning into an extra factor level.
df$Default <- factor(df$Default, levels = c(0, 1), labels = c("No", "Yes"))
# Storage type and class after the conversion.
typeof(df$Default)
## [1] "integer"
class(df$Default)
## [1] "factor"
# Levels of the target variable.
levels(df$Default)
## [1] "No"  "Yes"
# Put the event of interest ("Yes") first in the level order.
# Note that ordered() also turns the column into an ordered factor, i.e. the
# levels carry the ordering "Yes" < "No".
df$Default <- ordered(df$Default, levels = c("Yes", "No"))

# Confirm the new order of the levels.
levels(df$Default)
## [1] "Yes" "No"
# The demographic columns hold category codes, not quantities, so store them
# as factors. All three columns are converted in a single assignment.
df[c("Male", "Education", "MaritalStatus")] <-
  lapply(df[c("Male", "Education", "MaritalStatus")], factor)
# Re-inspect the structure to confirm the new column types.
str(df)
## 'data.frame':    29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Ord.factor w/ 2 levels "Yes"<"No": 1 1 2 2 2 2 2 2 2 2 ...
library(caTools)
# Fix the random seed so that re-running the script reproduces the same split.
set.seed(2341)

# Logical mask built from the Default labels: TRUE marks the rows assigned to
# the 80% training share, FALSE the rows left for the 20% test share.
split <- sample.split(Y = df$Default, SplitRatio = 0.80)

# Training rows are those flagged TRUE in the mask.
trainingSet <- df[split, ]

# Test rows are the remaining ones.
testSet <- df[!split, ]


# Sanity checks on the split.
# Rows and columns of the training set.
dim(trainingSet)
## [1] 23681     9
# Rows and columns of the test set.
dim(testSet)
## [1] 5920    9
# Share of each Default class in the training set, in percent.
round(prop.table(table(trainingSet$Default)) * 100, digits = 2)
## 
##   Yes    No 
## 22.31 77.69
# Share of each Default class in the test set, in percent.
round(prop.table(table(testSet$Default)) * 100, digits = 2)
## 
##   Yes    No 
## 22.31 77.69

# Run kNN and construct the Confusion Matrix ----

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
# Resampling scheme for tuning: 10-fold cross-validation repeated 3 times.
# classProbs = TRUE asks caret to compute class probabilities as well.
trctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE
)
set.seed(3333)

# Fit a kNN classifier for Default with caret.
# Id is excluded from the predictors: it is only a running row identifier
# (1, 2, 3, ...), and with `Default ~ .` it entered the distance computation
# as if it were a real feature. All remaining columns are used; they are
# centered and scaled so that no single column dominates the distances.
# tuneLength = 10 lets caret evaluate ten candidate values of k.
knn_fit <- train(
  Default ~ . - Id,
  data = trainingSet,
  method = "knn",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)
# kNN model summary
print(knn_fit)
# NOTE: the `##` output recorded below (and the confusion matrix, metrics and
# AUC recorded further down) comes from the earlier run whose formula was
# `Default ~ .`, i.e. with Id still among the predictors ("8 predictor",
# "centered (11)"). Re-run the script to refresh these numbers.
## k-Nearest Neighbors 
## 
## 23681 samples
##     8 predictor
##     2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (11), scaled (11) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.7330771  0.03989157
##    7  0.7450837  0.03248630
##    9  0.7518685  0.02514880
##   11  0.7583154  0.02630552
##   13  0.7622564  0.02329528
##   15  0.7655644  0.02232675
##   17  0.7688443  0.02203234
##   19  0.7696888  0.01772097
##   21  0.7709555  0.01348009
##   23  0.7718706  0.01127404
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
# Score the test set. type = "prob" returns one probability column per class
# (type = "raw" would return the predicted class labels instead).
kNNPred <- predict(knn_fit, newdata = testSet, type = "prob")

# Label a row "Yes" when its predicted probability of "Yes" exceeds 0.2,
# otherwise "No"; the result is an ordered factor with the same level order
# as the target ("Yes" first).
predictedLabels <- factor(
  ifelse(kNNPred$Yes > 0.2, "Yes", "No"),
  levels = c("Yes", "No"),
  ordered = TRUE
)

# Confusion matrix: predicted labels in rows, actual labels in columns.
cm <- table(Predicted = predictedLabels, Actual = testSet$Default)
print(cm)
##          Actual
## Predicted  Yes   No
##       Yes  940 2598
##       No   381 2001
# Derive the summary metrics from the confusion matrix.
# Cells are looked up by label: first index = predicted, second = actual.
tp <- cm["Yes", "Yes"]  # predicted Yes, actually Yes
fp <- cm["Yes", "No"]   # predicted Yes, actually No
fn <- cm["No", "Yes"]   # predicted No,  actually Yes
tn <- cm["No", "No"]    # predicted No,  actually No

# All four metrics are expressed in percent.
Accuracy <- 100 * (tp + tn) / (tp + fp + fn + tn)
Sensitivity <- 100 * tp / (tp + fn)
Specificity <- 100 * tn / (fp + tn)
Precision <- 100 * tp / (tp + fp)

message("type 1 errors= ", fp)
## type 1 errors= 2598
message("type 2 errors= ", fn)
## type 2 errors= 381
message("Accuracy= ", Accuracy)
## Accuracy= 49.6790540540541
message("Sensitivity= ", Sensitivity)
## Sensitivity= 71.1582134746404
message("Specificity= ", Specificity)
## Specificity= 43.5094585779517
message("Precision= ", Precision)
## Precision= 26.5686828716789
#Plot ROC Curve
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Build the ROCR prediction object from the predicted probability of "Yes".
# The positive class must be declared explicitly: testSet$Default is an
# ordered factor with "Yes" < "No", and by default ROCR treats the upper
# level ("No") as the positive class. Scoring with P(Yes) against that
# default mirrors the ROC curve and yields an AUC below 0.5.
# label.ordering is given as c(negative, positive).
lgPredObj <- prediction(
  kNNPred$Yes,
  testSet$Default,
  label.ordering = c("No", "Yes")
)
# ROC curve: true positive rate against false positive rate.
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
# Dotted diagonal = performance of a random classifier, for reference.
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

# Area under the ROC curve; performance() stores it in the y.values slot.
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
message("AUC is ", aucLR)
# Output recorded from the original run, which relied on ROCR's default
# label ordering (i.e. "No" as the positive class):
## AUC is 0.40049946677346
# With "Yes" as the positive class the same scores give the complement of
# that value, i.e. an AUC of about 0.5995.