# Read the external CSV into a data.frame ("credit.df") and build a
# data.table copy of it ("credit.dt") that the rest of the script works on.
# NOTE(review): the working directory below is a machine-specific absolute
# path; the script will not run elsewhere until it is adjusted.
setwd("C:/Users/adi/Downloads/MLM 2019")
credit.df <- read.csv("MCICreditCardDefault.csv")
#install.packages("data.table")
library(data.table)
credit.dt <- data.table(credit.df)
# attach(credit.dt) was removed on purpose: it places a snapshot of the
# columns on the search path, so bare names such as `Default` would keep
# pointing at the original integer columns after the factor conversions
# further down, and they can mask other objects. Columns are always accessed
# through credit.dt instead.
# Inspect the column types as read from the file
str(credit.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Dimensions of the data table (rows, columns)
dim(credit.dt)
## [1] 29601 9
# Converting the data type structure
#credit.dt[, Id := as.factor(Id)]
# Categorical columns stored as integer codes are turned into factors in one
# pass; `:=` updates credit.dt by reference.
factor_cols <- c("Male", "Education", "MaritalStatus", "Default")
credit.dt[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]
# Changing the levels of the 'Default' variable: the first existing level
# becomes "No" and the second becomes "Yes"
credit.dt[, Default := factor(Default, labels = c("No", "Yes"))]
# Verify the conversion
str(credit.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Levels of the target variable before reordering
levels(credit.dt$Default)
## [1] "No" "Yes"
# Reorder the levels so that "Yes" comes first. Default becomes an ordered
# factor with levels "Yes" < "No".
# NOTE(review): presumably "Yes" is put first so that it is treated as the
# class of interest by the modelling code further down; confirm.
credit.dt$Default <- ordered(credit.dt$Default, levels = c("Yes", "No"))
# library() stops with an error when the package is missing, whereas
# require() only returns FALSE and lets the script carry on.
library(ggplot2)
# Verify the new order of levels
levels(credit.dt$Default)
## [1] "Yes" "No"
# Split the data into a training set and a test set
library(caret)
## Loading required package: lattice
# Fix the seed so the partition is reproducible, then draw 80% of the rows
# (row indices returned as a one-column matrix because list = FALSE)
set.seed(2341)
trainIndex <- createDataPartition(y = credit.dt$Default,
                                  p = 0.8,
                                  list = FALSE)
# Rows selected by the partition form the training set; the rest are held out
trainData.dt <- credit.dt[trainIndex, ]
testData.dt <- credit.dt[-trainIndex, ]
# Size of each partition
dim(trainData.dt)
## [1] 23681 9
dim(testData.dt)
## [1] 5920 9
# Share of each target class (in percent, 2 decimals) per partition
round(100 * prop.table(table(trainData.dt$Default)), digits = 2)
##
## Yes No
## 22.31 77.69
round(100 * prop.table(table(testData.dt$Default)), digits = 2)
##
## Yes No
## 22.31 77.69
# kNN analysis
# Resampling scheme: 10-fold cross-validation, repeated 3 times
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
set.seed(3333)
#install.packages('e1071', dependencies=TRUE)
# `Id` is only a row identifier. With `Default ~ .` it was silently used as a
# predictor and, after centring/scaling, contributed to every kNN distance.
# Drop it from the modelling data so only real attributes are used.
model_cols <- setdiff(names(trainData.dt), "Id")
# Run the kNN classifier through caret: predictors are centred and scaled,
# and 10 candidate values of k are evaluated with the resampling scheme above
knn_fit <- train(Default ~ .,
                 data = trainData.dt[, model_cols, with = FALSE],
                 method = "knn",
                 trControl = trctrl,
                 preProcess = c("center", "scale"),
                 tuneLength = 10)
# kNN model summary
knn_fit
# NOTE: the results pasted below were produced while Id was still among the
# predictors ("8 predictor"); re-run the script to refresh them.
## k-Nearest Neighbors
##
## 23681 samples
## 8 predictor
## 2 classes: 'Yes', 'No'
##
## Pre-processing: centered (11), scaled (11)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7327261 0.04264164
## 7 0.7467312 0.04021462
## 9 0.7529805 0.02928387
## 11 0.7581885 0.02459777
## 13 0.7627492 0.02378213
## 15 0.7662680 0.02270087
## 17 0.7686751 0.01919706
## 19 0.7708850 0.01794406
## 21 0.7722223 0.01641531
## 23 0.7731231 0.01487522
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
#memory.limit()
#gc()
# Testing the model: predicted class labels for the held-out rows
kNNPred <- predict(knn_fit, newdata = testData.dt, type = "raw")
# Confusion matrix: predicted classes in rows, actual classes in columns
table(Predicted = kNNPred, Actual = testData.dt[["Default"]])
## Actual
## Predicted Yes No
## Yes 28 48
## No 1293 4551
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# ROC curve and AUC on the test set
# A ROC curve needs a continuous score. Hard class labels coerced with
# as.numeric() give a single operating point and an AUC close to 0.5, so the
# predicted probability of the "Yes" class is used instead.
kNNProb <- predict(knn_fit, testData.dt, type = "prob")[["Yes"]]
# label.ordering is c(negative, positive): "Yes" is scored as the positive
# class, matching the probability column used above.
kNNPredObj <- prediction(kNNProb, testData.dt$Default,
                         label.ordering = c("No", "Yes"))
kNNPerfObj <- performance(kNNPredObj, "tpr", "fpr")
plot(kNNPerfObj, main = "ROC Curve", col = 2, lwd = 2)
# Diagonal reference line
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# Area under the ROC curve (the name `aucLR` is kept although this is the
# kNN model, in case it is referenced elsewhere)
aucLR <- performance(kNNPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
# NOTE: the previously pasted value (0.5053795) came from hard class labels
# rather than class probabilities; re-run the script to obtain the AUC.