# Part 1: Read the data

# Read the credit-card default data into a data frame called "credit.df".
# The CSV is addressed by its full path instead of calling setwd(), so the
# script no longer changes the working directory of the R session that runs it.
data_dir <- "C:/Users/adi/Downloads/MLM 2019"
credit.df <- read.csv(file.path(data_dir, "MCICreditCardDefault.csv"))

# Part 2: Data table

#install.packages("data.table")
library(data.table)
# Convert the data frame into a data.table so columns can be updated by
# reference with `:=` further down.
credit.dt <- as.data.table(credit.df)
# attach() is intentionally not called: every later step addresses columns
# explicitly through credit.dt, and attached copies would not follow the
# in-place factor conversions applied afterwards.
# Inspect the column types of the freshly created table
str(credit.dt)

## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

#dimensions

# number of rows and columns in the data table
c(nrow(credit.dt), ncol(credit.dt))

## [1] 29601     9

#Converting Data Type Structure

# 'Id' stays an integer; its conversion is left disabled:
#credit.dt[, Id := as.factor(Id)]
# Recode the categorical columns ('Male', 'Education', 'MaritalStatus',
# 'Default') as factors, by reference, in a single update
credit.dt[, `:=`(Male          = as.factor(Male),
                 Education     = as.factor(Education),
                 MaritalStatus = as.factor(MaritalStatus),
                 Default       = as.factor(Default))]

# Changing the levels of the 'Default' variable: "0" -> "No", "1" -> "Yes"
levels(credit.dt$Default) <- c("No","Yes")

# Confirm the new column types
str(credit.dt)

## Classes 'data.table' and 'data.frame':   29601 obs. of  9 variables:
##  $ Id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# show the class labels of the target variable
levels(credit.dt[["Default"]])

## [1] "No"  "Yes"

#setting the levels

# Reorder the levels so that "Yes" comes first and "No" second.
# ordered() also turns 'Default' into an ordered factor.
# NOTE(review): presumably "Yes" is placed first so that it is treated as the
# class of interest by later steps -- confirm.
credit.dt$Default <- ordered(credit.dt$Default, levels = c("Yes", "No"))
# library() stops with an error when the package is missing, whereas
# require() would only return FALSE and let the script continue.
library(ggplot2)

## Loading required package: ggplot2

# confirm that the level order is now "Yes", "No"
levels(credit.dt[["Default"]])

## [1] "Yes" "No"

#splitting the Data into Train & Test Set

library(caret)

## Loading required package: lattice

# Draw the row indices for the training set: 80% of the rows, sampled on
# 'Default' and returned as a matrix (list = FALSE).
# The seed is fixed first so the same partition is drawn on every run.
set.seed(2341)
trainIndex <- createDataPartition(y = credit.dt$Default,
                                  p = 0.80,
                                  list = FALSE)

# 80% training data
# (rows selected by the partition indices)
trainData.dt <- credit.dt[trainIndex, ]

# 20% testing data
# (every row NOT selected for training, via negative indexing)
testData.dt <- credit.dt[-trainIndex, ]

# dimension of training dataset
# (rows x columns)
dim(trainData.dt)

## [1] 23681     9

# dimension of testing dataset
# (rows x columns)
dim(testData.dt)

## [1] 5920    9

# proportion of defaulters in training dataset
# (class shares of 'Default' in percent, rounded to 2 decimals)
round(prop.table(table(trainData.dt$Default)) * 100, 2)

## 
##   Yes    No 
## 22.31 77.69

# proportion of defaulters in test dataset
# (should closely match the training shares above)
round(prop.table(table(testData.dt$Default)) * 100, 2)

## 
##   Yes    No 
## 22.31 77.69

#kNN Analysis

# Set control parameters: 10-fold cross-validation, repeated 3 times
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
# Fix the RNG state so the resampling is reproducible
set.seed(3333)
#install.packages('e1071', dependencies=TRUE)

# Run kNN Classifier in package caret.
# 'Id' appears to be a plain row identifier (1, 2, 3, ...), so it is removed
# from the formula: as a centred and scaled predictor it would only add noise
# to the distances kNN relies on.
# NOTE: the results recorded after this chunk came from the earlier
# `Default ~ .` fit, which included 'Id'; they will change on re-run.
# NOTE(review): the recorded Kappa values are close to 0 on a 22/78 class
# split, so Accuracy may be a weak selection metric here -- consider
# revisiting.
knn_fit  <- train(Default ~ . - Id,
                  data = trainData.dt,
                  method = "knn",
                  trControl = trctrl,
                  # put all predictors on a common scale before computing
                  # distances
                  preProcess = c("center", "scale"),
                  # evaluate 10 candidate values of k
                  tuneLength = 10)
# kNN model summary
knn_fit

## k-Nearest Neighbors 
## 
## 23681 samples
##     8 predictor
##     2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (11), scaled (11) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.7327261  0.04264164
##    7  0.7467312  0.04021462
##    9  0.7529805  0.02928387
##   11  0.7581885  0.02459777
##   13  0.7627492  0.02378213
##   15  0.7662680  0.02270087
##   17  0.7686751  0.01919706
##   19  0.7708850  0.01794406
##   21  0.7722223  0.01641531
##   23  0.7731231  0.01487522
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.

#memory.limit()
#gc()

#Testing the Model

# Predicted class labels for the held-out test rows
kNNPred <- predict(knn_fit, newdata = testData.dt, type = "raw")
# Confusion matrix: predicted classes in rows, actual classes in columns
table(Predicted = kNNPred,
      Actual = testData.dt$Default)

##          Actual
## Predicted  Yes   No
##       Yes   28   48
##       No  1293 4551

# loading the package

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

#Testing the Model

# Build the ROC inputs from class PROBABILITIES, not from hard class labels.
# Hard labels give a single cut-off, so the "curve" collapses to one point
# and the AUC only reflects that one threshold.
kNNProb <- predict(knn_fit, testData.dt, type = "prob")
# Score = predicted probability of "Yes". The labels are passed as character
# with an explicit (negative, positive) ordering, so "Yes" is the positive
# class whatever the factor level order of 'Default' is.
kNNPredObj <- prediction(predictions = kNNProb[["Yes"]],
                         labels = as.character(testData.dt$Default),
                         label.ordering = c("No", "Yes"))
# True-positive rate against false-positive rate across all thresholds
# NOTE: any AUC recorded further down was computed from hard labels and will
# change on re-run.
kNNPerfObj <- performance(kNNPredObj, "tpr", "fpr")

# plotting ROC curve
# (true-positive rate against false-positive rate, drawn in red)
plot(kNNPerfObj, main = "ROC Curve", col = 2, lwd = 2)
# dotted black diagonal (intercept 0, slope 1) as the chance-level reference
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

# area under curve
# NOTE: the variable is called "aucLR", but the model scored here is the kNN
# fit; the name is kept so existing references keep working.
aucLR <- performance(kNNPredObj, measure = "auc")
# the AUC value is the first element of the performance object's y.values
aucLR <- aucLR@y.values[[1]]
aucLR

## [1] 0.5053795

# Getting Started

# Aditya Kumar

# today

# Part 1: Read the data

# Part 2: Data table

# 80% training data

# 20% testing data

# dimension of training dataset

# dimension of testing dataset

# proportion of defaulters in training dataset

# proportion of defaulters in test dataset

# loading the package

# plotting ROC curve

# area under curve