Step 1: Reading the data
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# NOTE: the table is deliberately not attach()ed. attach() places copies of the
# columns, as they are at this moment, on the search path; those copies would
# not follow the factor conversions applied to CCdefault.dt further down, so
# bare column names would silently refer to stale integer data. Columns are
# always accessed through CCdefault.dt instead.
# dimension of the data table
dim(CCdefault.dt)
## [1] 29601 9
# first few rows of the data table
head(CCdefault.dt)
## Id CreditLimit Male Education MaritalStatus Age BillOutstanding
## 1: 1 20000 0 2 1 24 3913
## 2: 2 120000 0 2 2 26 2682
## 3: 3 90000 0 2 2 34 29239
## 4: 4 50000 0 2 1 37 46990
## 5: 5 50000 1 2 1 57 8617
## 6: 6 50000 1 1 2 37 64400
## LastPayment Default
## 1: 0 1
## 2: 0 1
## 3: 1518 0
## 4: 2000 0
## 5: 2000 0
## 6: 2500 0
# structure of the freshly read table: one line per column with its storage type
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# convert the identifier and the categorical predictors to factors in one pass
factorCols <- c("Id", "Male", "Education", "MaritalStatus")
CCdefault.dt[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]
# convert 'Default' to a factor and label its levels in the same step.
# The code -> label mapping is spelled out (0 -> "No", 1 -> "Yes") instead of
# overwriting levels() by position, so the labels cannot be attached to the
# wrong codes if the column ever holds an unexpected set of values.
CCdefault.dt[, Default := factor(Default, levels = c(0, 1), labels = c("No", "Yes"))]
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : Factor w/ 29601 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# summary statistics for every column of the table; only the count, mean,
# standard deviation, median, minimum and maximum columns are displayed
library(psych)
## Warning: package 'psych' was built under R version 3.5.3
describe(CCdefault.dt)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min max
## Id* 1 29601 14801.00 8545.22 14801 1 29601
## CreditLimit 2 29601 167550.54 129944.02 140000 10000 1000000
## Male* 3 29601 1.40 0.49 1 1 2
## Education* 4 29601 1.82 0.71 2 1 4
## MaritalStatus* 5 29601 1.56 0.52 2 1 3
## Age 6 29601 35.46 9.21 34 21 79
## BillOutstanding 7 29601 50957.43 73370.24 22259 -165580 964511
## LastPayment 8 29601 5649.56 16568.26 2100 0 873552
## Default* 9 29601 1.22 0.42 1 1 2
Step 2: Checking and resetting the order of the target variable
# levels of the target variable
levels(CCdefault.dt$Default)
## [1] "No" "Yes"
# put "Yes" first so that it becomes the first level of the target.
# factor() is used rather than ordered(): 'Default' is a nominal yes/no
# outcome, and ordered() would additionally declare a ranking Yes < No.
CCdefault.dt[, Default := factor(Default, levels = c("Yes", "No"))]
# verifying the new order of levels
levels(CCdefault.dt$Default)
## [1] "Yes" "No"
Step 3: Splitting the data into Training set and Test set
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# fix the RNG seed so that the split below is reproducible
set.seed(2341)
# row indices of the training sample: 80% of the rows, drawn on 'Default'
trainIndex <- createDataPartition(
  y = CCdefault.dt$Default,
  p = 0.80,
  list = FALSE
)
# rows picked by the partition form the training set (80%)
trainData.dt <- CCdefault.dt[trainIndex]
# every remaining row goes to the test set (20%)
testData.dt <- CCdefault.dt[-trainIndex]
Step 4: Verifying Test and Training datasets
# number of rows and columns in the training set
dim(trainData.dt)
## [1] 23681 9
# number of rows and columns in the test set
dim(testData.dt)
## [1] 5920 9
# percentage of each 'Default' class in the training set, to 2 decimals
round(100 * prop.table(table(trainData.dt$Default)), digits = 2)
##
## Yes No
## 22.31 77.69
# percentage of each 'Default' class in the test set, to 2 decimals
round(100 * prop.table(table(testData.dt$Default)), digits = 2)
##
## Yes No
## 22.31 77.69
# compare the training and test sets on the count, mean, standard deviation
# and median of every column; similar values indicate a balanced split.
# (psych is already attached by the earlier library(psych) call, so it is not
# loaded again here.)
describe(trainData.dt)[, c("n", "mean", "sd", "median")]
describe(testData.dt)[, c("n", "mean", "sd", "median")]