#PART 1: Read the data

library(data.table)
# Read the credit-card default data into a data.table.
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# NOTE: the table is deliberately NOT attach()ed. The steps below always reach
# columns through the table itself (`CCdefault.dt$...`, `CCdefault.dt[, ...]`,
# or a formula with `data =`). An attached copy would be a snapshot taken
# before the factor conversions and could silently shadow or diverge from
# the real data.

# dimension of the data table
dim(CCdefault.dt)
## [1] 29601     9
# column names
colnames(CCdefault.dt)
## [1] "Id"              "CreditLimit"     "Male"            "Education"      
## [5] "MaritalStatus"   "Age"             "BillOutstanding" "LastPayment"    
## [9] "Default"

#PART 2: Verifying data type structure

# column types as read from the CSV (everything arrives as integer)
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# the 'Id' conversion is commented out, so 'Id' stays an integer
#CCdefault.dt[, Id := as.factor(Id)]
# turn the integer-coded categorical columns into factors in one pass
CCdefault.dt[, `:=`(
  Male          = as.factor(Male),
  Education     = as.factor(Education),
  MaritalStatus = as.factor(MaritalStatus),
  Default       = as.factor(Default)
)]


# Relabel the levels of 'Default': the existing levels "0" and "1" become
# "No" and "Yes" respectively (labels are matched to levels by position).
CCdefault.dt[, Default := factor(Default, labels = c("No", "Yes"))]

# confirm the new column types
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

#PART 3: Check / Reset the Levels of the target variable

# current level order of the target variable
levels(CCdefault.dt[["Default"]])
## [1] "No"  "Yes"
# Rebuild 'Default' as an ordered factor with "Yes" as the first level.
# NOTE(review): this also makes the column ordinal (Yes < No); presumably only
# the level order matters downstream - confirm whether a plain factor would do.
CCdefault.dt[, Default := factor(Default, levels = c("Yes", "No"), ordered = TRUE)]

# confirm the new level order
levels(CCdefault.dt[["Default"]])
## [1] "Yes" "No"

#PART 4: Split the data into a training set (80%) and a testing set (20%)

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
# Partition on the target column so both subsets are drawn from it;
# the seed is fixed immediately before sampling to keep the split reproducible.
set.seed(2341)
trainIndex <- createDataPartition(
  y = CCdefault.dt$Default,
  p = 0.80,
  list = FALSE
)

# rows selected by the partition: training data (80%)
trainData.dt <- CCdefault.dt[trainIndex, ]

# all remaining rows: testing data (20%)
testData.dt <- CCdefault.dt[-trainIndex, ]

#PART 5: Verify the Split

# rows x columns of the training set
dim(trainData.dt)
## [1] 23681     9
# rows x columns of the testing set
dim(testData.dt)
## [1] 5920    9
# class balance (percent, 2 decimals) of 'Default' in the training set
round(prop.table(table(trainData.dt[["Default"]])) * 100, 2)
## 
##   Yes    No 
## 22.31 77.69
# class balance (percent, 2 decimals) of 'Default' in the testing set
round(prop.table(table(testData.dt[["Default"]])) * 100, 2)
## 
##   Yes    No 
## 22.31 77.69

#Run the Machine Learning algorithm – kNN

library(caret)
# Resampling scheme: 10-fold cross-validation, repeated 3 times
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# fix the seed right before training so the resampling is reproducible
set.seed(3333)

# Fit a kNN classifier through caret on the training set.
# 'Id' is not among the predictors; predictors are centered and scaled, and
# tuneLength = 10 evaluates ten candidate values of k (see the table below).
knn_fit <- train(
  Default ~ CreditLimit + Male + Education + MaritalStatus + Age +
    BillOutstanding + LastPayment,
  data = trainData.dt,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = trctrl
)
# resampling results and the selected k
knn_fit
## k-Nearest Neighbors 
## 
## 23681 samples
##     7 predictor
##     2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.7363992  0.05716480
##    7  0.7483638  0.05526080
##    9  0.7558805  0.05114314
##   11  0.7623696  0.04975468
##   13  0.7649173  0.04025378
##   15  0.7666202  0.03245795
##   17  0.7684078  0.02854856
##   19  0.7693933  0.02367882
##   21  0.7711811  0.02270309
##   23  0.7725323  0.02163401
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
# hard class predictions for the held-out test set
kNNPred <- predict(knn_fit, newdata = testData.dt, type = "raw")
# cross-tabulate predicted vs. actual classes (confusion matrix)
actualDefault <- testData.dt$Default
table(Predicted = kNNPred, Actual = actualDefault)
##          Actual
## Predicted  Yes   No
##       Yes   24   58
##       No  1297 4541
# loading the package
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Score the test set with class-membership probabilities instead of hard class
# labels. A ROC curve needs a continuous score to sweep the decision threshold;
# feeding it the 0/1-style class predictions yields a single operating point
# and an AUC that merely restates one confusion matrix.
# (The hard predictions in `kNNPred` computed earlier are left untouched.)
kNNProb <- predict(knn_fit, testData.dt, type = "prob")

# Use P(Default = "Yes") as the score and state the positive class explicitly:
# label.ordering is c(negative, positive). Labels are passed as character so
# the factor's own level order cannot override this choice.
kNNPredObj <- prediction(
  predictions = kNNProb[, "Yes"],
  labels = as.character(testData.dt$Default),
  label.ordering = c("No", "Yes")
)

# true-positive rate vs. false-positive rate across all thresholds
kNNPerfObj <- performance(kNNPredObj, "tpr", "fpr")
# plotting ROC curve, with the chance diagonal for reference
plot(kNNPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

# area under curve
# (variable name `aucLR` is kept as-is so any later reference still resolves)
aucLR <- performance(kNNPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## NOTE: the value previously recorded here (0.5027783) was computed from hard
## class labels; re-run this section to obtain the probability-based AUC.
# trainData.dt
# testData.dt