library(DescTools)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
library(rpart) # Decision Tree models
library(rpart.plot) # Visualization
library(randomForest) # Random Forest
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071) # Naive Bayes
library(corrplot) # correlation matrix visualization
## corrplot 0.92 loaded
library(car) # regression diagnostics (Companion to Applied Regression)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
##
## Recode
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
CCD <- read.csv("Data.csv", stringsAsFactors = FALSE)
# ID: Unique identifier of customer
# LIMIT_BALANCE: Amount of the given credit in dollars,
# including both the individual consumer credit and family (supplementary) credit
# GENDER: 1 = male; 2 = female
# EDUCATION: 1 = graduate school; 2 = university; 3 = high school; 4 = others
# MARITAL_STATUS: 1 = married; 2 = single; 3 = others
# AGE: Age in years.
# PAY_1 - PAY_6: History of past payments (PAY_1 = September, PAY_6 = April).
# Represents the repayment status of the customer from April to September 2005:
# -1 = on-time payment;
# 1 = payment delay of one month;
# 2 = payment delay of two months; ...;
# 8 = payment delay of eight months;
# 9 = payment delay of nine months or more;
# -2 = the customer had no consumption that month;
# 0 = use of revolving credit (i.e., the customer paid the minimum amount)
# BILL_AMT1 - BILL_AMT6: Amount of monthly bill (BILL_AMT1 = September, BILL_AMT6 = April)
# PAY_AMT1 - PAY_AMT6: Amount paid of monthly bill (PAY_AMT1 = September, PAY_AMT6 = April)
# DEFAULT: Indicates whether the customer's credit card is in default (1) or not (0)
Data Exploration
Abstract(CCD)
## ------------------------------------------------------------------------------
## CCD
##
## data frame: 2251 obs. of 25 variables
## 2251 complete cases (100.0%)
##
## Nr ColName Class NAs Levels
## 1 ID integer .
## 2 LIMIT_BAL integer .
## 3 GENDER integer .
## 4 EDUCATION integer .
## 5 MARITAL_STATUS integer .
## 6 AGE integer .
## 7 PAY_1 integer .
## 8 PAY_2 integer .
## 9 PAY_3 integer .
## 10 PAY_4 integer .
## 11 PAY_5 integer .
## 12 PAY_6 integer .
## 13 BILL_AMT1 integer .
## 14 BILL_AMT2 integer .
## 15 BILL_AMT3 integer .
## 16 BILL_AMT4 integer .
## 17 BILL_AMT5 integer .
## 18 BILL_AMT6 integer .
## 19 PAY_AMT1 integer .
## 20 PAY_AMT2 integer .
## 21 PAY_AMT3 integer .
## 22 PAY_AMT4 integer .
## 23 PAY_AMT5 integer .
## 24 PAY_AMT6 integer .
## 25 DEFAULT integer .
Descriptive statistics
Desc(CCD, plotit = FALSE)
## ------------------------------------------------------------------------------
## Describe CCD (data.frame):
##
## data frame: 2251 obs. of 25 variables
## 2251 complete cases (100.0%)
##
## Nr ColName Class NAs Levels
## 1 ID integer .
## 2 LIMIT_BAL integer .
## 3 GENDER integer .
## 4 EDUCATION integer .
## 5 MARITAL_STATUS integer .
## 6 AGE integer .
## 7 PAY_1 integer .
## 8 PAY_2 integer .
## 9 PAY_3 integer .
## 10 PAY_4 integer .
## 11 PAY_5 integer .
## 12 PAY_6 integer .
## 13 BILL_AMT1 integer .
## 14 BILL_AMT2 integer .
## 15 BILL_AMT3 integer .
## 16 BILL_AMT4 integer .
## 17 BILL_AMT5 integer .
## 18 BILL_AMT6 integer .
## 19 PAY_AMT1 integer .
## 20 PAY_AMT2 integer .
## 21 PAY_AMT3 integer .
## 22 PAY_AMT4 integer .
## 23 PAY_AMT5 integer .
## 24 PAY_AMT6 integer .
## 25 DEFAULT integer .
##
##
## ------------------------------------------------------------------------------
## 1 - ID (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 = n 0 15'229.45 14'865.51
## 100.0% 0.0% 0.0% 15'593.38
##
## .05 .10 .25 median .75 .90 .95
## 1'488.50 3'029.00 7'485.50 15'555.00 23'054.50 27'263.00 28'662.50
##
## range sd vcoef mad IQR skew kurt
## 29'995.00 8'804.98 0.58 11'559.83 15'569.00 -0.03 -1.24
##
## lowest : 4, 5, 12, 21, 29
## highest: 29'949, 29'951, 29'955, 29'967, 29'999
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 2 - LIMIT_BAL (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 63 0 167'569.97 162'071.73
## 100.0% 0.0% 0.0% 173'068.21
##
## .05 .10 .25 median .75 .90 .95
## 20'000.00 30'000.00 50'000.00 140'000.00 240'000.00 360'000.00 440'000.00
##
## range sd vcoef mad IQR skew kurt
## 990'000.00 133'023.99 0.79 133'434.00 190'000.00 1.04 0.87
##
## lowest : 10'000 (42), 20'000 (147), 30'000 (136), 40'000 (21), 50'000 (263)
## highest: 620'000, 630'000, 680'000, 700'000, 1'000'000
##
## heap(?): remarkable frequency (11.7%) for the mode(s) (= 50000)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 3 - GENDER (integer - dichotomous)
##
## length n NAs unique
## 2'251 2'251 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 1 919 40.8% 38.8% 42.9%
## 2 1'332 59.2% 57.1% 61.2%
##
## ' 95%-CI (Wilson)
##
## ------------------------------------------------------------------------------
## 4 - EDUCATION (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 6 0 1.87 1.84
## 100.0% 0.0% 0.0% 1.91
##
## .05 .10 .25 median .75 .90 .95
## 1.00 1.00 1.00 2.00 2.00 3.00 3.00
##
## range sd vcoef mad IQR skew kurt
## 5.00 0.83 0.44 1.48 1.00 1.17 2.76
##
##
## level freq perc cumfreq cumperc
## 1 1 800 35.5% 800 35.5%
## 2 2 1'028 45.7% 1'828 81.2%
## 3 3 376 16.7% 2'204 97.9%
## 4 4 12 0.5% 2'216 98.4%
## 5 5 27 1.2% 2'243 99.6%
## 6 6 8 0.4% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 5 - MARITAL_STATUS (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 4 6 1.54 1.52
## 100.0% 0.0% 0.3% 1.56
##
## .05 .10 .25 median .75 .90 .95
## 1.00 1.00 1.00 2.00 2.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 3.00 0.52 0.34 0.00 1.00 -0.05 -1.47
##
##
## level freq perc cumfreq cumperc
## 1 0 6 0.3% 6 0.3%
## 2 1 1'046 46.5% 1'052 46.7%
## 3 2 1'183 52.6% 2'235 99.3%
## 4 3 16 0.7% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 6 - AGE (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 48 0 35.41 35.03
## 100.0% 0.0% 0.0% 35.78
##
## .05 .10 .25 median .75 .90 .95
## 23.00 25.00 28.00 34.00 41.00 48.00 52.00
##
## range sd vcoef mad IQR skew kurt
## 54.00 9.08 0.26 8.90 13.00 0.71 0.02
##
## lowest : 21 (6), 22 (41), 23 (70), 24 (70), 25 (106)
## highest: 64 (2), 65 (4), 67, 71, 75
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 7 - PAY_1 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 7 1'109 -0.02 -0.07
## 100.0% 0.0% 49.3% 0.02
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -1.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 6.00 1.09 -52.33 1.48 1.00 0.43 0.60
##
##
## level freq perc cumfreq cumperc
## 1 -2 202 9.0% 202 9.0%
## 2 -1 431 19.1% 633 28.1%
## 3 0 1'109 49.3% 1'742 77.4%
## 4 1 272 12.1% 2'014 89.5%
## 5 2 204 9.1% 2'218 98.5%
## 6 3 24 1.1% 2'242 99.6%
## 7 4 9 0.4% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 8 - PAY_2 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 8 1'187 -0.16 -0.20
## 100.0% 0.0% 52.7% -0.11
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 7.00 1.17 -7.45 0.00 1.00 0.65 0.71
##
##
## level freq perc cumfreq cumperc
## 1 -2 284 12.6% 284 12.6%
## 2 -1 462 20.5% 746 33.1%
## 3 0 1'187 52.7% 1'933 85.9%
## 4 1 4 0.2% 1'937 86.1%
## 5 2 277 12.3% 2'214 98.4%
## 6 3 30 1.3% 2'244 99.7%
## 7 4 5 0.2% 2'249 99.9%
## 8 5 2 0.1% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 9 - PAY_3 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 9 1'176 -0.16 -0.21
## 100.0% 0.0% 52.2% -0.11
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 9.00 1.19 -7.43 0.00 1.00 0.73 1.41
##
##
## level freq perc cumfreq cumperc
## 1 -2 303 13.5% 303 13.5%
## 2 -1 447 19.9% 750 33.3%
## 3 0 1'176 52.2% 1'926 85.6%
## 4 1 1 0.0% 1'927 85.6%
## 5 2 296 13.1% 2'223 98.8%
## 6 3 19 0.8% 2'242 99.6%
## 7 4 6 0.3% 2'248 99.9%
## 8 5 1 0.0% 2'249 99.9%
## 9 7 2 0.1% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 10 - PAY_4 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 9 1'227 -0.20 -0.25
## 100.0% 0.0% 54.5% -0.15
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 9.00 1.18 -5.96 0.00 1.00 0.87 2.50
##
##
## level freq perc cumfreq cumperc
## 1 -2 322 14.3% 322 14.3%
## 2 -1 419 18.6% 741 32.9%
## 3 0 1'227 54.5% 1'968 87.4%
## 4 1 1 0.0% 1'969 87.5%
## 5 2 254 11.3% 2'223 98.8%
## 6 3 14 0.6% 2'237 99.4%
## 7 4 8 0.4% 2'245 99.7%
## 8 5 3 0.1% 2'248 99.9%
## 9 7 3 0.1% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 11 - PAY_5 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 7 1'254 -0.25 -0.30
## 100.0% 0.0% 55.7% -0.20
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 9.00 1.14 -4.52 0.00 1.00 0.87 2.82
##
##
## level freq perc cumfreq cumperc
## 1 -2 337 15.0% 337 15.0%
## 2 -1 421 18.7% 758 33.7%
## 3 0 1'254 55.7% 2'012 89.4%
## 4 2 211 9.4% 2'223 98.8%
## 5 3 15 0.7% 2'238 99.4%
## 6 4 10 0.4% 2'248 99.9%
## 7 7 3 0.1% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 12 - PAY_6 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 7 1'203 -0.27 -0.31
## 100.0% 0.0% 53.4% -0.22
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 9.00 1.16 -4.35 0.00 1.00 0.80 2.29
##
##
## level freq perc cumfreq cumperc
## 1 -2 361 16.0% 361 16.0%
## 2 -1 431 19.1% 792 35.2%
## 3 0 1'203 53.4% 1'995 88.6%
## 4 2 231 10.3% 2'226 98.9%
## 5 3 17 0.8% 2'243 99.6%
## 6 4 5 0.2% 2'248 99.9%
## 7 7 3 0.1% 2'251 100.0%
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 13 - BILL_AMT1 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 2'013 150 49'061.25 45'960.73
## 100.0% 0.0% 6.7% 52'161.78
##
## .05 .10 .25 median .75 .90 .95
## 0.00 292.00 3'486.00 21'605.00 60'535.00 133'598.00 190'135.50
##
## range sd vcoef mad IQR skew kurt
## 970'211.00 75'013.88 1.53 30'977.44 57'049.00 3.49 20.16
##
## lowest : -5'700, -4'316, -3'549, -3'258, -2'012
## highest: 546'485, 548'551, 569'023, 630'458, 964'511
##
## heap(?): remarkable frequency (6.7%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 14 - BILL_AMT2 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'962 195 46'945.02 43'968.63
## 100.0% 0.0% 8.7% 49'921.41
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 2'851.50 20'195.00 58'799.50 125'746.00 188'358.00
##
## range sd vcoef mad IQR skew kurt
## 992'895.00 72'010.56 1.53 29'362.89 55'948.00 3.60 22.86
##
## lowest : -8'964, -7'334, -6'256, -5'700, -2'012
## highest: 499'489, 546'741, 562'316, 646'770, 983'931
##
## heap(?): remarkable frequency (8.7%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 15 - BILL_AMT3 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'959 206 44'699.90 41'903.94
## 100.0% 0.0% 9.2% 47'495.87
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 2'849.00 19'703.00 56'432.00 118'652.00 174'133.00
##
## range sd vcoef mad IQR skew kurt
## 704'166.00 67'645.22 1.51 28'482.23 53'583.00 3.17 14.51
##
## lowest : -11'035, -10'922, -5'700, -4'244, -3'764
## highest: 477'884, 488'166, 535'020, 535'509, 693'131
##
## heap(?): remarkable frequency (9.2%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 16 - BILL_AMT4 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'925 239 41'239.24 38'519.57
## 100.0% 0.0% 10.6% 43'958.91
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 2'526.00 18'637.00 49'857.50 111'960.00 162'010.50
##
## range sd vcoef mad IQR skew kurt
## 913'694.00 65'799.33 1.60 26'519.27 47'331.50 3.87 25.33
##
## lowest : -22'108, -15'588, -3'518, -1'944, -1'868
## highest: 488'808, 501'496, 548'020, 628'699, 891'586
##
## heap(?): remarkable frequency (10.6%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 17 - BILL_AMT5 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'904 254 38'340.36 35'758.49
## 100.0% 0.0% 11.3% 40'922.22
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 1'757.00 17'154.00 47'326.00 107'030.00 149'416.50
##
## range sd vcoef mad IQR skew kurt
## 947'425.00 62'465.34 1.63 24'467.35 45'569.00 4.14 30.71
##
## lowest : -20'254, -20'006, -19'205, -4'430, -2'300
## highest: 481'896, 508'213, 530'672, 547'880, 927'171
##
## heap(?): remarkable frequency (11.3%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 18 - BILL_AMT6 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'859 297 36'896.28 34'402.92
## 100.0% 0.0% 13.2% 39'389.63
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 1'142.00 16'667.00 46'931.00 104'809.00 146'691.00
##
## range sd vcoef mad IQR skew kurt
## 991'841.00 60'323.97 1.63 24'117.45 45'789.00 4.19 34.53
##
## lowest : -30'177, -24'920, -5'813, -4'543, -3'594
## highest: 436'172, 466'570, 496'801, 513'798, 961'664
##
## heap(?): remarkable frequency (13.2%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 19 - PAY_AMT1 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'100 401 5'364.85 4'825.51
## 100.0% 0.0% 17.8% 5'904.20
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 939.00 2'100.00 5'000.00 10'120.00 17'719.00
##
## range sd vcoef mad IQR skew kurt
## 242'247.00 13'048.87 2.43 2'816.94 4'061.00 7.64 84.41
##
## lowest : 0 (401), 1, 2, 4 (3), 6
## highest: 130'000, 135'868, 137'308, 160'812, 242'247
##
## heap(?): remarkable frequency (17.8%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 20 - PAY_AMT2 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'122 392 5'590.31 4'980.97
## 100.0% 0.0% 17.4% 6'199.64
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 981.00 2'014.00 5'000.00 10'513.00 19'772.50
##
## range sd vcoef mad IQR skew kurt
## 300'018.00 14'742.26 2.64 2'702.78 4'019.00 8.62 110.31
##
## lowest : 0 (392), 2 (3), 3, 4, 5
## highest: 140'951, 164'395, 168'499, 188'000, 300'018
##
## heap(?): remarkable frequency (17.4%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 21 - PAY_AMT3 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 1'041 457 5'637.46 4'541.71
## 100.0% 0.0% 20.3% 6'733.21
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 342.50 1'644.00 4'400.00 10'000.00 16'485.50
##
## range sd vcoef mad IQR skew kurt
## 896'040.00 26'510.50 4.70 2'437.39 4'057.50 21.86 636.59
##
## lowest : 0 (457), 1 (2), 2, 3 (2), 5
## highest: 203'000, 213'509, 325'000, 508'229, 896'040
##
## heap(?): remarkable frequency (20.3%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 22 - PAY_AMT4 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 976 485 4'498.85 3'979.70
## 100.0% 0.0% 21.5% 5'018.01
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 300.00 1'500.00 4'000.00 10'000.00 16'383.00
##
## range sd vcoef mad IQR skew kurt
## 225'616.00 12'560.42 2.79 2'223.90 3'700.00 8.97 111.03
##
## lowest : 0 (485), 2 (4), 3, 4 (2), 5
## highest: 161'110, 168'159, 171'716, 188'840, 225'616
##
## heap(?): remarkable frequency (21.5%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 23 - PAY_AMT5 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 957 504 4'806.63 4'150.61
## 100.0% 0.0% 22.4% 5'462.65
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 230.00 1'500.00 3'861.00 8'900.00 16'202.00
##
## range sd vcoef mad IQR skew kurt
## 326'889.00 15'871.77 3.30 2'223.90 3'631.00 10.91 166.99
##
## lowest : 0 (504), 1 (2), 2, 3, 4
## highest: 155'067, 163'123, 207'440, 310'135, 326'889
##
## heap(?): remarkable frequency (22.4%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 24 - PAY_AMT6 (integer)
##
## length n NAs unique 0s mean meanCI'
## 2'251 2'251 0 956 566 5'203.24 4'476.64
## 100.0% 0.0% 25.1% 5'929.85
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 1'402.00 4'000.00 9'098.00 18'000.00
##
## range sd vcoef mad IQR skew kurt
## 403'500.00 17'579.43 3.38 2'078.61 4'000.00 10.60 167.64
##
## lowest : 0 (566), 1, 3 (2), 4 (2), 5
## highest: 158'064, 158'215, 250'005, 254'000, 403'500
##
## heap(?): remarkable frequency (25.1%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
##
## ------------------------------------------------------------------------------
## 25 - DEFAULT (integer - dichotomous)
##
## length n NAs unique
## 2'251 2'251 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 1'753 77.9% 76.1% 79.5%
## 1 498 22.1% 20.5% 23.9%
##
## ' 95%-CI (Wilson)
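Note that the summaries above reveal codes the data dictionary does not document: EDUCATION contains values 5 and 6 (the published UCI version of this dataset also contains 0), and MARITAL_STATUS contains 0. A common, hedged cleanup is to fold these into the documented "others" categories; a minimal sketch on a copy (we leave CCD itself unchanged here):
# Sketch: fold undocumented codes into the documented "others" levels
CCD_clean <- CCD
CCD_clean$EDUCATION[CCD_clean$EDUCATION %in% c(0, 5, 6)] <- 4 # 4 = others
CCD_clean$MARITAL_STATUS[CCD_clean$MARITAL_STATUS == 0] <- 3 # 3 = others
table(CCD_clean$EDUCATION)
table(CCD_clean$MARITAL_STATUS)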
Prepare Target (Y) Variable:
Convert our target variable, DEFAULT, to a nominal factor variable:
CCD$DEFAULT <- factor(x = CCD$DEFAULT)
Let’s take a closer look at our dependent variable (Y), DEFAULT
Desc(CCD$DEFAULT)
## ------------------------------------------------------------------------------
## CCD$DEFAULT (factor - dichotomous)
##
## length n NAs unique
## 2'251 2'251 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 1'753 77.9% 76.1% 79.5%
## 1 498 22.1% 20.5% 23.9%
##
## ' 95%-CI (Wilson)
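For context, the 77.9%/22.1% split means a trivial classifier that always predicts "not default" would already be about 77.9% accurate, so any model below should beat this majority-class baseline. A quick sketch of that baseline:
# Majority-class baseline accuracy (~0.779 here)
max(prop.table(table(CCD$DEFAULT)))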
Prepare Predictor (X) Variables:
Nominal Variables
noms <- c("GENDER","MARITAL_STATUS") #unordered factors
Convert variables to unordered factors
CCD[, noms] <- lapply(X = CCD[, noms],
                      FUN = factor)
Ordinal Variables
ords <- c("EDUCATION", "AGE", "PAY_1", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")
Convert variables to ordered factors
CCD[,c("EDUCATION","AGE")] <- lapply(X = CCD[,c("EDUCATION","AGE")],
FUN = factor,
ordered = TRUE)
CCD[,c("PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6")] <-lapply(X = CCD[,c("PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6")],
FUN = factor,
ordered= TRUE,
levels = c("9","8","7","6","5","4","3","2","1","0","-1","-2"))
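Because factor() with an explicit levels= argument silently turns any unlisted value into NA, it is worth confirming the conversion introduced no new missing values; a quick check (this assumes the levels above cover every observed PAY_* value):
# Any value outside the specified levels would have become NA here
colSums(is.na(CCD[, pays]))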
Numeric Variables
nums <- names(CCD)[!names(CCD) %in% c("ID", "DEFAULT", noms, ords)]
nums
## [1] "LIMIT_BAL" "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4" "BILL_AMT5"
## [7] "BILL_AMT6" "PAY_AMT1" "PAY_AMT2" "PAY_AMT3" "PAY_AMT4" "PAY_AMT5"
## [13] "PAY_AMT6"
We will set up a convenience vector called preds that consists of all our predictors:
preds <- names(CCD)[-25] # all columns except DEFAULT (column 25)
preds
## [1] "ID" "LIMIT_BAL" "GENDER" "EDUCATION"
## [5] "MARITAL_STATUS" "AGE" "PAY_1" "PAY_2"
## [9] "PAY_3" "PAY_4" "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6" "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4" "PAY_AMT5" "PAY_AMT6"
Decision Trees can handle missing values (we can impute or eliminate them up front, or tell the model how to handle them) as well as irrelevant and redundant variables, and they require no rescaling or standardization.
Duplicates
sum(duplicated(CCD)) # count duplicate rows rather than printing all 2,251 logicals
## [1] 0
# No duplicate rows
Missing Values
PlotMiss(x = CCD,
main = "Missing Values by Variable")
sum(!complete.cases(CCD)) # count incomplete rows
## [1] 0
# No missing values
Checking outliers
Outlier(x = CCD$PAY_AMT1, method = "boxplot")
## [1] 21818 57087 15000 80000 12006 13647 21197 80004 100000 40000
## [11] 50784 13713 14599 28000 30000 13000 89187 20091 18000 11200
## [21] 60267 17409 18718 16920 13517 242247 30253 40000 14443 11680
## [31] 22000 14813 15980 112000 12611 137308 13106 14127 16008 43533
## [41] 12564 78130 30000 40000 28000 16611 43894 27657 99159 14596
## [51] 88444 15000 18407 20015 53563 13576 20000 12005 15395 135868
## [61] 25000 11268 18500 11406 29882 73700 60000 32254 14528 50263
## [71] 15100 50006 80648 21255 15105 18696 11684 20680 23009 12000
## [81] 21006 70000 105072 13855 15210 12700 12020 11394 69616 12000
## [91] 20000 15325 12500 36027 27800 16140 15000 18641 14651 21507
## [101] 40529 23537 25000 13953 32153 32305 15450 43018 65668 34661
## [111] 11602 15349 14773 18615 130000 12012 40991 25486 53860 12275
## [121] 50000 18089 15023 52720 28204 15000 16529 20092 14000 12277
## [131] 14805 13994 27004 89524 13000 30027 11194 35749 29658 30043
## [141] 12500 11250 37039 12651 14329 18546 160812 15000 60396 18588
## [151] 35993 21094 13000 13455 12000 65000 20052 16833 18864 50000
## [161] 15000 23565 12077 13156 12590 13437 34163 14200 13987 15000
## [171] 25853 84361 18539 13000 82392 19020 16100 68000 26232 13003
## [181] 74081 24913 15480 12701 30553 100741 30018 27821 18000 85387
## [191] 68800 14304 16780 54209 14000 17438 39032 85900
Outlier(x = CCD$PAY_2, method = "boxplot")
## ordered(0)
## Levels: 9 < 8 < 7 < 6 < 5 < 4 < 3 < 2 < 1 < 0 < -1 < -2
# PAY_2 is now an ordered factor, so the boxplot method returns no (numeric) outliers
boxplot(x = CCD$BILL_AMT2,
        main = "BILL_AMT2 Box Plot")
Distribution
hist(x = CCD$LIMIT_BAL,
main = "Limit Balance Histogram",
xlab = "",
col = "steelblue")
hist(x = CCD$BILL_AMT3,
main = "BILL_AMT3 Histogram",
xlab = "",
col = "steelblue")
Target Variable’s Distribution
plot(CCD$DEFAULT)
# Our target variable has a class-imbalance issue
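If the imbalance proves problematic, one hedged option for rpart is a loss matrix that makes a missed default costlier than a false alarm. A minimal sketch (the 1:4 cost ratio is purely illustrative, and the model line is commented out because the train set is only created below):
# rpart loss matrix: rows = true class (0, 1), columns = predicted class (0, 1);
# here a true default predicted as non-default costs 4x a false alarm
costs <- matrix(c(0, 1,
                  4, 0), nrow = 2, byrow = TRUE)
# rpart(DEFAULT ~ ., data = train, method = "class", parms = list(loss = costs))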
Correlation Matrix of continuous variables
cmatCCD <- cor(CCD[,nums])
cmatCCD
## LIMIT_BAL BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
## LIMIT_BAL 1.0000000 0.2948802 0.2863079 0.2887990 0.3001144 0.3060862 0.3070078
## BILL_AMT1 0.2948802 1.0000000 0.9542134 0.9092657 0.8829816 0.8635683 0.8296937
## BILL_AMT2 0.2863079 0.9542134 1.0000000 0.9468779 0.9181048 0.8953405 0.8640378
## BILL_AMT3 0.2887990 0.9092657 0.9468779 1.0000000 0.9212876 0.8897037 0.8516060
## BILL_AMT4 0.3001144 0.8829816 0.9181048 0.9212876 1.0000000 0.9410933 0.9083115
## BILL_AMT5 0.3060862 0.8635683 0.8953405 0.8897037 0.9410933 1.0000000 0.9465879
## BILL_AMT6 0.3070078 0.8296937 0.8640378 0.8516060 0.9083115 0.9465879 1.0000000
## PAY_AMT1 0.2262652 0.1686349 0.2975077 0.2564514 0.2464372 0.2297914 0.2150062
## PAY_AMT2 0.2355298 0.1454071 0.1583340 0.3247299 0.2476497 0.2408888 0.2204290
## PAY_AMT3 0.2477429 0.3568970 0.3855268 0.2457028 0.4213199 0.4251150 0.4349505
## PAY_AMT4 0.2180976 0.1825550 0.1770652 0.1631720 0.1916176 0.2859317 0.2352703
## PAY_AMT5 0.2162154 0.1305348 0.1353963 0.1385646 0.1507075 0.1206218 0.3031038
## PAY_AMT6 0.2515170 0.2383271 0.2162284 0.2059132 0.1990741 0.2137193 0.1889870
## PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
## LIMIT_BAL 0.2262652 0.2355298 0.2477429 0.2180976 0.2162154 0.2515170
## BILL_AMT1 0.1686349 0.1454071 0.3568970 0.1825550 0.1305348 0.2383271
## BILL_AMT2 0.2975077 0.1583340 0.3855268 0.1770652 0.1353963 0.2162284
## BILL_AMT3 0.2564514 0.3247299 0.2457028 0.1631720 0.1385646 0.2059132
## BILL_AMT4 0.2464372 0.2476497 0.4213199 0.1916176 0.1507075 0.1990741
## BILL_AMT5 0.2297914 0.2408888 0.4251150 0.2859317 0.1206218 0.2137193
## BILL_AMT6 0.2150062 0.2204290 0.4349505 0.2352703 0.3031038 0.1889870
## PAY_AMT1 1.0000000 0.2678089 0.2363814 0.1280426 0.1284044 0.1214585
## PAY_AMT2 0.2678089 1.0000000 0.1335056 0.1572967 0.1278698 0.1266298
## PAY_AMT3 0.2363814 0.1335056 1.0000000 0.1431790 0.1641548 0.1293255
## PAY_AMT4 0.1280426 0.1572967 0.1431790 1.0000000 0.1217357 0.1291814
## PAY_AMT5 0.1284044 0.1278698 0.1641548 0.1217357 1.0000000 0.1317634
## PAY_AMT6 0.1214585 0.1266298 0.1293255 0.1291814 0.1317634 1.0000000
corrplot.mixed(cmatCCD)
There is high correlation among BILL_AMT1 through BILL_AMT6.
The other continuous variables show low to moderate correlations.
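Tree-based models tolerate redundant predictors, but for methods that do not, caret's findCorrelation() can flag columns to drop above a correlation cutoff; a hedged sketch (the 0.90 cutoff is an assumption, not tuned):
# Columns whose pairwise correlations exceed the cutoff (candidates to drop)
highCorr <- findCorrelation(cmatCCD, cutoff = 0.90, names = TRUE)
highCorr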
Distribution of Limit Balance
h <- hist(CCD$LIMIT_BAL, breaks = 30, plot = FALSE)
h$density <- h$counts / sum(h$counts) * 100 # express densities as percentages
plot(h, main = "Histogram of Limit Balance",
     xlab = "Limit Balance",
     ylab = "Relative Frequency %",
     col = "steelblue", freq = FALSE,
     xaxt = "n")
axis(side=1, at=seq(0,1000000,by=50000))
Limit Balance By Sex
Preprocess Variable
CCD_dup <- CCD
levels(CCD_dup$GENDER) <- c("Male","Female")
levels(CCD_dup$MARITAL_STATUS) <- c("Not Provided","Married",
"Single","Others")
levels(CCD_dup$DEFAULT) <- c("Not Default", "Default")
Boxplot
CCD_dup %>% ggplot(aes(x=GENDER, y=LIMIT_BAL,
fill=GENDER
)) +
geom_boxplot(width=0.5,lwd=1)+
ggtitle("Credit Limit Balance by Sex")
Credit limit balances are distributed fairly evenly between the sexes.
Distribution of Default Frequency By Sex and Marital Status
Barplot
ggplot(CCD_dup, aes(x = DEFAULT, fill = GENDER)) +
geom_bar(position = "dodge")
The bar plot suggests that female customers default less often than male customers in raw counts; however, there are more female customers overall, so we should compare default rates, as in the quick check below, before drawing conclusions.
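Comparing default rates rather than raw counts corrects for the different group sizes; a quick sketch, assuming CCD_dup carries the relabeled factors from above:
prop.table(table(CCD_dup$GENDER, CCD_dup$DEFAULT), margin = 1) # row-wise default rate by sex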
Training and Testing
Splitting the data into training and testing sets using an 85/15 split rule
Initialize random seed
set.seed(41972795)
Create list of training indices
sub <- createDataPartition(y = CCD$DEFAULT, # target variable
p = 0.85, # % in training
list = FALSE)
Subset the transformed data to create the training (train) and testing (test) datasets
train <- CCD[sub, ] # create train dataframe
test <- CCD[-sub, ] # create test dataframe
The cp hyperparameter defaults to 0.01. We will set cp to 0 to grow the full tree and then ‘prune’ it by tuning the cp parameter.
Behind the scenes, the rpart() function performs cross-validation, so we initialize our random seed before modeling
set.seed(41972795)
CCD.rpart <- rpart(formula = DEFAULT ~ ., # Y ~ all other variables in dataframe
                   data = train, # training data
                   method = "class", # classification (otherwise rpart may assume regression)
                   control = rpart.control(cp = 0)) # cp = 0 grows the full tree
We can see the basic output of our Decision Tree model
CCD.rpart
## n= 1915
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1915 424 0 (0.77859008 0.22140992)
## 2) PAY_1=0,-1,-2 1492 214 0 (0.85656836 0.14343164)
## 4) PAY_AMT2>=1863.5 891 89 0 (0.90011223 0.09988777)
## 8) AGE=21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53 860 76 0 (0.91162791 0.08837209)
## 16) PAY_AMT3>=1.5 783 61 0 (0.92209451 0.07790549)
## 32) PAY_AMT1>=4409.5 365 19 0 (0.94794521 0.05205479) *
## 33) PAY_AMT1< 4409.5 418 42 0 (0.89952153 0.10047847)
## 66) PAY_AMT1< 4308.5 411 38 0 (0.90754258 0.09245742)
## 132) BILL_AMT1>=1309 382 32 0 (0.91623037 0.08376963)
## 264) BILL_AMT3< 76204.5 321 22 0 (0.93146417 0.06853583) *
## 265) BILL_AMT3>=76204.5 61 10 0 (0.83606557 0.16393443)
## 530) PAY_AMT5>=1306.5 54 6 0 (0.88888889 0.11111111) *
## 531) PAY_AMT5< 1306.5 7 3 1 (0.42857143 0.57142857) *
## 133) BILL_AMT1< 1309 29 6 0 (0.79310345 0.20689655)
## 266) PAY_AMT3< 3164 20 1 0 (0.95000000 0.05000000) *
## 267) PAY_AMT3>=3164 9 4 1 (0.44444444 0.55555556) *
## 67) PAY_AMT1>=4308.5 7 3 1 (0.42857143 0.57142857) *
## 17) PAY_AMT3< 1.5 77 15 0 (0.80519481 0.19480519)
## 34) ID>=7656 57 7 0 (0.87719298 0.12280702) *
## 35) ID< 7656 20 8 0 (0.60000000 0.40000000)
## 70) GENDER=2 11 2 0 (0.81818182 0.18181818) *
## 71) GENDER=1 9 3 1 (0.33333333 0.66666667) *
## 9) AGE=54,55,56,57,58,59,60,61,62,63,64,65,67,71,75 31 13 0 (0.58064516 0.41935484)
## 18) AGE=57,58,59,60,61,62,63,64,65,67,71,75 18 4 0 (0.77777778 0.22222222) *
## 19) AGE=21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56 13 4 1 (0.30769231 0.69230769) *
## 5) PAY_AMT2< 1863.5 601 125 0 (0.79201331 0.20798669)
## 10) PAY_6=0,-1,-2 563 108 0 (0.80817052 0.19182948)
## 20) PAY_AMT3>=1107.5 222 26 0 (0.88288288 0.11711712) *
## 21) PAY_AMT3< 1107.5 341 82 0 (0.75953079 0.24046921)
## 42) PAY_AMT1>=1421.5 145 24 0 (0.83448276 0.16551724) *
## 43) PAY_AMT1< 1421.5 196 58 0 (0.70408163 0.29591837)
## 86) PAY_AMT4>=969.5 34 4 0 (0.88235294 0.11764706) *
## 87) PAY_AMT4< 969.5 162 54 0 (0.66666667 0.33333333)
## 174) BILL_AMT1< 677 80 18 0 (0.77500000 0.22500000) *
## 175) BILL_AMT1>=677 82 36 0 (0.56097561 0.43902439)
## 350) ID< 6504.5 23 5 0 (0.78260870 0.21739130) *
## 351) ID>=6504.5 59 28 1 (0.47457627 0.52542373)
## 702) BILL_AMT2< 92 18 5 0 (0.72222222 0.27777778) *
## 703) BILL_AMT2>=92 41 15 1 (0.36585366 0.63414634)
## 1406) AGE=30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,71,75 30 14 1 (0.46666667 0.53333333)
## 2812) BILL_AMT2>=403 23 10 0 (0.56521739 0.43478261)
## 5624) PAY_AMT5< 949 16 4 0 (0.75000000 0.25000000) *
## 5625) PAY_AMT5>=949 7 1 1 (0.14285714 0.85714286) *
## 2813) BILL_AMT2< 403 7 1 1 (0.14285714 0.85714286) *
## 1407) AGE=21,22,23,24,25,26,27,28,29 11 1 1 (0.09090909 0.90909091) *
## 11) PAY_6=9,8,7,6,5,4,3,2,1 38 17 0 (0.55263158 0.44736842)
## 22) PAY_AMT3< 317 9 0 0 (1.00000000 0.00000000) *
## 23) PAY_AMT3>=317 29 12 1 (0.41379310 0.58620690)
## 46) BILL_AMT5< 13132.5 10 3 0 (0.70000000 0.30000000) *
## 47) BILL_AMT5>=13132.5 19 5 1 (0.26315789 0.73684211) *
## 3) PAY_1=9,8,7,6,5,4,3,2,1 423 210 0 (0.50354610 0.49645390)
## 6) PAY_2=-1,-2 126 28 0 (0.77777778 0.22222222)
## 12) AGE=21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50 112 18 0 (0.83928571 0.16071429) *
## 13) AGE=51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,71,75 14 4 1 (0.28571429 0.71428571) *
## 7) PAY_2=9,8,7,6,5,4,3,2,1,0 297 115 1 (0.38720539 0.61279461)
## 14) BILL_AMT3< 55830.5 207 93 1 (0.44927536 0.55072464)
## 28) PAY_6=0,-1,-2 118 52 0 (0.55932203 0.44067797)
## 56) ID< 20246 89 32 0 (0.64044944 0.35955056)
## 112) PAY_AMT4>=1050 36 7 0 (0.80555556 0.19444444) *
## 113) PAY_AMT4< 1050 53 25 0 (0.52830189 0.47169811)
## 226) PAY_AMT4< 637 33 10 0 (0.69696970 0.30303030) *
## 227) PAY_AMT4>=637 20 5 1 (0.25000000 0.75000000) *
## 57) ID>=20246 29 9 1 (0.31034483 0.68965517)
## 114) PAY_AMT3>=1515.5 10 4 0 (0.60000000 0.40000000) *
## 115) PAY_AMT3< 1515.5 19 3 1 (0.15789474 0.84210526) *
## 29) PAY_6=9,8,7,6,5,4,3,2,1 89 27 1 (0.30337079 0.69662921)
## 58) BILL_AMT2< 17109.5 42 18 1 (0.42857143 0.57142857)
## 116) ID< 11029 13 4 0 (0.69230769 0.30769231) *
## 117) ID>=11029 29 9 1 (0.31034483 0.68965517)
## 234) PAY_AMT3>=1162 9 3 0 (0.66666667 0.33333333) *
## 235) PAY_AMT3< 1162 20 3 1 (0.15000000 0.85000000) *
## 59) BILL_AMT2>=17109.5 47 9 1 (0.19148936 0.80851064) *
## 15) BILL_AMT3>=55830.5 90 22 1 (0.24444444 0.75555556)
## 30) ID< 3195 7 1 0 (0.85714286 0.14285714) *
## 31) ID>=3195 83 16 1 (0.19277108 0.80722892) *
Tree Plots
We can use either the prp() function or the rpart.plot() function in the rpart.plot package to plot our rpart object (CCD.rpart).
prp(x = CCD.rpart)
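For comparison, the rpart.plot() function mentioned above renders the same tree with richer node labels; the arguments below are illustrative choices, not the only sensible ones:
rpart.plot(x = CCD.rpart, # same fitted tree
           type = 2, # draw split labels below the node boxes
           extra = 104) # class probabilities plus % of observations per node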
We can use the variable.importance component of the rpart object to obtain variable importance
CCD.rpart$variable.importance
## PAY_1 PAY_2 PAY_3 BILL_AMT1 BILL_AMT2
## 83.534923 68.605080 45.303021 43.810753 39.930641
## PAY_4 PAY_5 PAY_AMT1 BILL_AMT4 PAY_AMT3
## 36.970878 36.296979 32.837154 26.854887 23.534350
## BILL_AMT5 BILL_AMT3 AGE ID BILL_AMT6
## 23.197538 22.876164 22.044086 21.866955 20.676526
## PAY_AMT4 PAY_AMT2 PAY_AMT5 PAY_6 MARITAL_STATUS
## 20.300272 12.449281 11.501015 11.296336 3.196864
## PAY_AMT6 GENDER LIMIT_BAL
## 2.560477 2.327273 1.098621
We use the predict() function to generate class predictions for our training set
base.trpreds <- predict(object = CCD.rpart, # DT model
                        newdata = train, # training data
                        type = "class") # class predictions
We can use the confusionMatrix() function from the caret package to obtain a confusion matrix and obtain performance measures for our model applied to the training dataset (train).
DT_train_conf <- confusionMatrix(data = base.trpreds, # predictions
reference = train$DEFAULT, # actual
positive = "1", #Default
mode = "everything")
DT_train_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1426 197
## 1 65 227
##
## Accuracy : 0.8632
## 95% CI : (0.847, 0.8783)
## No Information Rate : 0.7786
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5534
##
## Mcnemar's Test P-Value : 5.811e-16
##
## Sensitivity : 0.5354
## Specificity : 0.9564
## Pos Pred Value : 0.7774
## Neg Pred Value : 0.8786
## Precision : 0.7774
## Recall : 0.5354
## F1 : 0.6341
## Prevalence : 0.2214
## Detection Rate : 0.1185
## Detection Prevalence : 0.1525
## Balanced Accuracy : 0.7459
##
## 'Positive' Class : 1
##
We use the predict() function to generate class predictions for our testing set
base.tepreds <- predict(object = CCD.rpart, # DT model
newdata = test, # testing data
type = "class")
We can use the confusionMatrix() function from the caret package to obtain a confusion matrix and obtain performance measures for our model applied to the testing dataset (test).
DT_test_conf <- confusionMatrix(data = base.tepreds, # predictions
reference = test$DEFAULT, # actual
positive = "1",
mode = "everything")
DT_test_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 234 45
## 1 28 29
##
## Accuracy : 0.7827
## 95% CI : (0.7348, 0.8257)
## No Information Rate : 0.7798
## P-Value [Acc > NIR] : 0.47865
##
## Kappa : 0.3106
##
## Mcnemar's Test P-Value : 0.06112
##
## Sensitivity : 0.39189
## Specificity : 0.89313
## Pos Pred Value : 0.50877
## Neg Pred Value : 0.83871
## Precision : 0.50877
## Recall : 0.39189
## F1 : 0.44275
## Prevalence : 0.22024
## Detection Rate : 0.08631
## Detection Prevalence : 0.16964
## Balanced Accuracy : 0.64251
##
## 'Positive' Class : 1
##
To assess whether the model is balanced, underfitting, or overfitting, we compare its performance on the training and testing sets, using the cbind() function to place the measures side-by-side.
Overall
cbind(Training = DT_train_conf$overall,
Testing = DT_test_conf$overall)
## Training Testing
## Accuracy 8.631854e-01 0.78273810
## Kappa 5.534319e-01 0.31062395
## AccuracyLower 8.469763e-01 0.73475623
## AccuracyUpper 8.782730e-01 0.82565721
## AccuracyNull 7.785901e-01 0.77976190
## AccuracyPValue 3.181827e-21 0.47864746
## McnemarPValue 5.811397e-16 0.06111558
Class-Level
cbind(Training = DT_train_conf$byClass,
Testing = DT_test_conf$byClass)
## Training Testing
## Sensitivity 0.5353774 0.39189189
## Specificity 0.9564051 0.89312977
## Pos Pred Value 0.7773973 0.50877193
## Neg Pred Value 0.8786198 0.83870968
## Precision 0.7773973 0.50877193
## Recall 0.5353774 0.39189189
## F1 0.6340782 0.44274809
## Prevalence 0.2214099 0.22023810
## Detection Rate 0.1185379 0.08630952
## Detection Prevalence 0.1524804 0.16964286
## Balanced Accuracy 0.7458912 0.64251083
The model seems to overfit a bit, and it is not a good predictive model: sensitivity is low on both the training and testing sets. A low true positive rate means a high false negative rate, i.e., customers who actually default are often predicted as unlikely to default.
Pruning the Tree
Behind the scenes of the rpart() function, cross-validation is being used to find the optimal value of cp (complexity parameter)
printcp(x = CCD.rpart)
##
## Classification tree:
## rpart(formula = DEFAULT ~ ., data = train, method = "class",
## control = rpart.control(cp = 0))
##
## Variables actually used in tree construction:
## [1] AGE BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT5 GENDER ID
## [8] PAY_1 PAY_2 PAY_6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4
## [15] PAY_AMT5
##
## Root node error: 424/1915 = 0.22141
##
## n= 1915
##
## CP nsplit rel error xerror xstd
## 1 0.0790094 0 1.00000 1.00000 0.042852
## 2 0.0165094 2 0.84198 0.87264 0.040749
## 3 0.0141509 5 0.78302 0.88915 0.041039
## 4 0.0117925 6 0.76887 0.89858 0.041203
## 5 0.0058962 9 0.73349 0.90330 0.041284
## 6 0.0047170 12 0.71462 0.95047 0.042071
## 7 0.0043239 19 0.67689 0.95991 0.042223
## 8 0.0023585 28 0.63208 1.00236 0.042888
## 9 0.0011792 31 0.62500 1.03774 0.043418
## 10 0.0000000 37 0.61792 1.09670 0.044255
We can identify the best cp (the one with the minimum cross-validated error, xerror, in the cptable) and save it for use in the prune() function
min_cp <- CCD.rpart$cptable[which.min(CCD.rpart$cptable[,"xerror"]),"CP"]
min_cp
## [1] 0.01650943
This is the cp value that minimizes the cross-validated error (xerror)
We can use this value as the cp, and use the prune() function to stop the tree from growing beyond the corresponding number of splits
pruneTree <- prune(tree = CCD.rpart, cp = min_cp)
We can use the prp() function to view our pruned tree
prp(x = pruneTree,
extra = 2) # include prop. of correct predictions
par(mfrow = c(1,2)) # split plot window into 1 row and 2 columns
prp(x = CCD.rpart) # plot original tree
prp(x = pruneTree) # plot pruned tree (min xerror)
par(mfrow = c(1,1)) # return plot window to 1 row and 1 column
Using the 1-SE rule, we can choose the smallest tree size (nsplit + 1) that is the first from the left to fall below the dashed line (one standard error above the minimum xerror); a programmatic version of this rule follows the plot
plotcp(x = CCD.rpart, # rpart object
upper = "splits") # ID splits to match printcp output
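Equivalently, the 1-SE rule can be applied directly to the cptable; a minimal sketch:
cpt <- CCD.rpart$cptable
best <- which.min(cpt[, "xerror"]) # row with minimum cross-validated error
se_threshold <- cpt[best, "xerror"] + cpt[best, "xstd"] # dashed line: 1 SE above the minimum
cpt[cpt[, "xerror"] < se_threshold, , drop = FALSE][1, ] # smallest tree below the line (row 2)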
Based on the plot, we still prefer the trade-off value cp = 0.01650943, which is in row 2 of the cptable
min2_cp <- CCD.rpart$cptable[2,"CP"]
We can use this value as the cp, and use the prune() function to stop the tree from growing beyond the corresponding number of splits
pruneTree2 <- prune(tree = CCD.rpart, # rpart object
cp = min2_cp) # tradeoff cp value
We can use the prp() function to view our pruned tree
prp(x = pruneTree2, extra = 2) # include prop. of correct predictions
Visualize Full and Tuned (tradeoff) Tree
par(mfrow = c(1,2)) # split plot window into 1 row and 2 columns
prp(x = CCD.rpart) # plot original tree
prp(x = pruneTree2) # plot pruned tree (tradeoff)
par(mfrow = c(1,1)) # return plot window to 1 row and 1 column
We will use the less complex, trade-off-based pruned tree, pruneTree2
We use the predict() function to generate class predictions for our training data set
tune.trpreds <- predict(object = pruneTree2,
newdata = train,
type = "class")
We use the confusionMatrix() function from the caret package
DT_trtune_conf <- confusionMatrix(data = tune.trpreds, # predictions
reference = train$DEFAULT, # actual
positive = "1",
mode = "everything")
DT_trtune_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1376 242
## 1 115 182
##
## Accuracy : 0.8136
## 95% CI : (0.7954, 0.8308)
## No Information Rate : 0.7786
## P-Value [Acc > NIR] : 9.614e-05
##
## Kappa : 0.3944
##
## Mcnemar's Test P-Value : 2.582e-11
##
## Sensitivity : 0.42925
## Specificity : 0.92287
## Pos Pred Value : 0.61279
## Neg Pred Value : 0.85043
## Precision : 0.61279
## Recall : 0.42925
## F1 : 0.50485
## Prevalence : 0.22141
## Detection Rate : 0.09504
## Detection Prevalence : 0.15509
## Balanced Accuracy : 0.67606
##
## 'Positive' Class : 1
##
We use the predict() function to generate class predictions for our testing data set
tune.tepreds <- predict(object = pruneTree2,
newdata = test,
type = "class")
We use the confusionMatrix() function from the caret package
DT_tetune_conf <- confusionMatrix(data = tune.tepreds, # predictions
reference = test$DEFAULT, # actual
positive = "1",
mode = "everything")
DT_tetune_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 241 34
## 1 21 40
##
## Accuracy : 0.8363
## 95% CI : (0.7923, 0.8742)
## No Information Rate : 0.7798
## P-Value [Acc > NIR] : 0.006117
##
## Kappa : 0.4914
##
## Mcnemar's Test P-Value : 0.105645
##
## Sensitivity : 0.5405
## Specificity : 0.9198
## Pos Pred Value : 0.6557
## Neg Pred Value : 0.8764
## Precision : 0.6557
## Recall : 0.5405
## F1 : 0.5926
## Prevalence : 0.2202
## Detection Rate : 0.1190
## Detection Prevalence : 0.1815
## Balanced Accuracy : 0.7302
##
## 'Positive' Class : 1
##
To assess whether the model is balanced, underfitting, or overfitting, we compare its performance on the training and testing sets, using the cbind() function to place the measures side-by-side.
Overall
cbind(Training = DT_trtune_conf$overall,
Testing = DT_tetune_conf$overall)
## Training Testing
## Accuracy 8.135770e-01 0.836309524
## Kappa 3.943841e-01 0.491357481
## AccuracyLower 7.953979e-01 0.792327101
## AccuracyUpper 8.307903e-01 0.874241805
## AccuracyNull 7.785901e-01 0.779761905
## AccuracyPValue 9.614277e-05 0.006117034
## McnemarPValue 2.582073e-11 0.105645429
Class-Level
cbind(Training = DT_trtune_conf$byClass,
Testing = DT_tetune_conf$byClass)
## Training Testing
## Sensitivity 0.42924528 0.5405405
## Specificity 0.92287056 0.9198473
## Pos Pred Value 0.61279461 0.6557377
## Neg Pred Value 0.85043263 0.8763636
## Precision 0.61279461 0.6557377
## Recall 0.42924528 0.5405405
## F1 0.50485437 0.5925926
## Prevalence 0.22140992 0.2202381
## Detection Rate 0.09503916 0.1190476
## Detection Prevalence 0.15509138 0.1815476
## Balanced Accuracy 0.67605792 0.7301939
The model still predicts poorly whether a customer will default on their credit card (the outcome we care most about), with a sensitivity of only 54%.
Class Imbalance
First, we can evaluate if class imbalance is present in our target (Y) variable, DEFAULT.
Desc(CCD$DEFAULT)
## ------------------------------------------------------------------------------
## CCD$DEFAULT (factor - dichotomous)
##
## length n NAs unique
## 2'251 2'251 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 1'753 77.9% 76.1% 79.5%
## 1 498 22.1% 20.5% 23.9%
##
## ' 95%-CI (Wilson)
As shown, there is a substantial difference in frequency between our majority class 0 (No Default) and our minority class 1 (Default)
We use the train() function from the caret package to tune a Decision Tree model using repeated 5-Fold Cross Validation, and search for our optimal cp value across the default grid of 5 cp values.
First, we set up our control object as input to the trControl argument in the train() function.
ctrl <- trainControl(method = "repeatedcv",
number = 5, # k = 5 folds
repeats = 3) # repeated 3 times
set.seed(41972795)
DTFit <- train(x = train[ ,preds], # predictors
               y = train$DEFAULT, # target
               method = "rpart", # rpart package
               trControl = ctrl, # control object
               tuneLength = 5) # try 5 cp values
We can view the average Accuracy and Kappa values across our cp values and identify the optimal value.
DTFit
## CART
##
## 1915 samples
## 24 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 1533, 1532, 1532, 1532, 1531, 1532, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.005896226 0.8015749 0.3187212
## 0.011792453 0.8041895 0.3189088
## 0.014150943 0.8050566 0.3238480
## 0.016509434 0.8043576 0.3175738
## 0.079009434 0.7946000 0.2097132
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01415094.
We will obtain testing predictions and performance, which we will use to compare the original model against resampled models that aim to correct for the class imbalance in the target variable.
base_preds <- predict(object = DTFit,
newdata = test)
base_CM <- confusionMatrix(data = base_preds, # predictions
reference = test$DEFAULT, # actual
positive = "1",
mode = "everything")
We will use random oversampling to rebalance the training set: unlike undersampling, it does not discard majority-class rows, which matters given our modest sample size.
set.seed(41972795)
train_os <- upSample(x = train[ ,preds], # predictors
y = train$DEFAULT, # target
yname = "DEFAULT") # name of y variable
plot(train$DEFAULT)
plot(train_os$DEFAULT)
We can compare the distribution of the target variable, DEFAULT, before and after random oversampling.
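A quick tabulation makes the comparison concrete:
table(train$DEFAULT) # original training distribution (imbalanced)
table(train_os$DEFAULT) # after upSample(): both classes at the majority count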
Next, we can use our oversampled training data (train_os) to train another DT model, using all of the same arguments as in the baseline model.
set.seed(41972795)
DTFit_ROS <- train(x = train_os[,preds], # predictors
y = train_os$DEFAULT, # target
method = "rpart", # use the rpart package
trControl = ctrl, # control object
tuneLength = 5) # try 5 cp values
DTFit_ROS
## CART
##
## 2982 samples
## 24 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2386, 2386, 2386, 2385, 2385, 2384, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.008048290 0.7147308 0.4294645
## 0.009389671 0.7104821 0.4209721
## 0.010395708 0.7095880 0.4191848
## 0.029175050 0.6846570 0.3692917
## 0.364855801 0.5793992 0.1588869
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.00804829.
We obtain testing predictions and performance, which we will use to compare against the original model.
ROS_preds <- predict(object = DTFit_ROS,
newdata = test)
ROS_CM <- confusionMatrix(data = ROS_preds,
reference = test$DEFAULT,
positive = "1",
mode = "everything")
ROS_CM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 209 29
## 1 53 45
##
## Accuracy : 0.756
## 95% CI : (0.7064, 0.8009)
## No Information Rate : 0.7798
## P-Value [Acc > NIR] : 0.86785
##
## Kappa : 0.3635
##
## Mcnemar's Test P-Value : 0.01109
##
## Sensitivity : 0.6081
## Specificity : 0.7977
## Pos Pred Value : 0.4592
## Neg Pred Value : 0.8782
## Precision : 0.4592
## Recall : 0.6081
## F1 : 0.5233
## Prevalence : 0.2202
## Detection Rate : 0.1339
## Detection Prevalence : 0.2917
## Balanced Accuracy : 0.7029
##
## 'Positive' Class : 1
##
Comparing the models
cbind(Base = base_CM$overall,
Over = ROS_CM$overall)
## Base Over
## Accuracy 0.8541666667 0.75595238
## Kappa 0.5003035086 0.36351876
## AccuracyLower 0.8118236187 0.70639657
## AccuracyUpper 0.8901244638 0.80093257
## AccuracyNull 0.7797619048 0.77976190
## AccuracyPValue 0.0003828486 0.86785335
## McnemarPValue 0.0000182153 0.01108762
cbind(Base = base_CM$byClass,
Over = ROS_CM$byClass)
## Base Over
## Sensitivity 0.4594595 0.6081081
## Specificity 0.9656489 0.7977099
## Pos Pred Value 0.7906977 0.4591837
## Neg Pred Value 0.8634812 0.8781513
## Precision 0.7906977 0.4591837
## Recall 0.4594595 0.6081081
## F1 0.5811966 0.5232558
## Prevalence 0.2202381 0.2202381
## Detection Rate 0.1011905 0.1339286
## Detection Prevalence 0.1279762 0.2916667
## Balanced Accuracy 0.7125542 0.7029090
Sensitivity improved by about 15 percentage points (from 45.9% to 60.8%), although it is still low. The model is somewhat better at catching true positives, reducing the false negative rate.
cbind(Base = DT_test_conf$overall,
Tuned = DT_tetune_conf$overall,
Over = ROS_CM$overall)
## Base Tuned Over
## Accuracy 0.78273810 0.836309524 0.75595238
## Kappa 0.31062395 0.491357481 0.36351876
## AccuracyLower 0.73475623 0.792327101 0.70639657
## AccuracyUpper 0.82565721 0.874241805 0.80093257
## AccuracyNull 0.77976190 0.779761905 0.77976190
## AccuracyPValue 0.47864746 0.006117034 0.86785335
## McnemarPValue 0.06111558 0.105645429 0.01108762
cbind(Base = DT_test_conf$byClass,
Tuned = DT_tetune_conf$byClass,
Over = ROS_CM$byClass)
## Base Tuned Over
## Sensitivity 0.39189189 0.5405405 0.6081081
## Specificity 0.89312977 0.9198473 0.7977099
## Pos Pred Value 0.50877193 0.6557377 0.4591837
## Neg Pred Value 0.83870968 0.8763636 0.8781513
## Precision 0.50877193 0.6557377 0.4591837
## Recall 0.39189189 0.5405405 0.6081081
## F1 0.44274809 0.5925926 0.5232558
## Prevalence 0.22023810 0.2202381 0.2202381
## Detection Rate 0.08630952 0.1190476 0.1339286
## Detection Prevalence 0.16964286 0.1815476 0.2916667
## Balanced Accuracy 0.64251083 0.7301939 0.7029090
Across the decision tree models, we recommend the oversampled tree, which achieves the highest sensitivity at 60.8%, while accuracy and kappa are moderate at 75.6% and 0.36 respectively. Sensitivity is the measure we care about most, since it indicates how well we identify the customers who are likely to default.
Next, we fit Random Forest models. We will use the tuneRF() function to tune the mtry hyperparameter, with ntreeTry = 500, improve = 1e-5 and stepFactor = 2, using our balanced training data (train_os).
Base Model
set.seed(41972795) # initialize random seed
rf_mod <- randomForest(formula = DEFAULT ~ ., # use all other variables to predict DEFAULT
data = train, # training data
importance = TRUE, # obtain variable importance
ntree = 500) # number of trees in forest
rf_mod
##
## Call:
## randomForest(formula = DEFAULT ~ ., data = train, importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 19.11%
## Confusion matrix:
## 0 1 class.error
## 0 1412 79 0.05298457
## 1 287 137 0.67688679
varImpPlot(x = rf_mod, # randomForest object
main = "Variable Importance Plot") # title
Based on both importance criteria, the most important variable is PAY_1 (September's repayment status)
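The numeric scores behind the plot can be inspected with importance(); a short sketch sorting by mean decrease in Gini (the rf_imp name is ours):
rf_imp <- importance(rf_mod) # available because we set importance = TRUE
head(rf_imp[order(rf_imp[, "MeanDecreaseGini"], decreasing = TRUE), ]) # top variables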
base.RFpreds <- predict(object = rf_mod, # RF model
                        type = "class") # no newdata: OOB class predictions for the training data
RF_btrain_conf <- confusionMatrix(data = base.RFpreds, # predictions
reference = train$DEFAULT, # actual
positive = "1",
mode = "everything")
base.teRFpreds <- predict(object = rf_mod, # RF model
newdata = test, # testing data
type = "class")
RF_btest_conf <- confusionMatrix(data = base.teRFpreds, # predictions
reference = test$DEFAULT, # actual
positive = "1",
mode = "everything")
RF_btest_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 252 39
## 1 10 35
##
## Accuracy : 0.8542
## 95% CI : (0.8118, 0.8901)
## No Information Rate : 0.7798
## P-Value [Acc > NIR] : 0.0003828
##
## Kappa : 0.5059
##
## Mcnemar's Test P-Value : 6.334e-05
##
## Sensitivity : 0.4730
## Specificity : 0.9618
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.8660
## Precision : 0.7778
## Recall : 0.4730
## F1 : 0.5882
## Prevalence : 0.2202
## Detection Rate : 0.1042
## Detection Prevalence : 0.1339
## Balanced Accuracy : 0.7174
##
## 'Positive' Class : 1
##
On the testing set, the model has a high accuracy of 85% but a low sensitivity of 0.47. Specificity is high, meaning the model is much better at predicting the negative class than the positive class.
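As a hand check against the caret output, sensitivity can be read straight off the confusion matrix above:
# TP = 35 (predicted 1, actual 1); FN = 39 (predicted 0, actual 1)
35 / (35 + 39) # sensitivity (recall) for the positive class = 0.4730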
cbind(Training = RF_btrain_conf$overall,
Testing = RF_btest_conf$overall)
## Training Testing
## Accuracy 8.088773e-01 8.541667e-01
## Kappa 3.276396e-01 5.059417e-01
## AccuracyLower 7.905384e-01 8.118236e-01
## AccuracyUpper 8.262650e-01 8.901245e-01
## AccuracyNull 7.785901e-01 7.797619e-01
## AccuracyPValue 6.539275e-04 3.828486e-04
## McnemarPValue 2.765862e-27 6.334248e-05
cbind(Training = RF_btrain_conf$byClass,
Testing = RF_btest_conf$byClass)
## Training Testing
## Sensitivity 0.32311321 0.4729730
## Specificity 0.94701543 0.9618321
## Pos Pred Value 0.63425926 0.7777778
## Neg Pred Value 0.83107710 0.8659794
## Precision 0.63425926 0.7777778
## Recall 0.32311321 0.4729730
## F1 0.42812500 0.5882353
## Prevalence 0.22140992 0.2202381
## Detection Rate 0.07154047 0.1041667
## Detection Prevalence 0.11279373 0.1339286
## Balanced Accuracy 0.63506432 0.7174025
Training sensitivity is actually lower than testing sensitivity, which suggests underfitting on the positive class, while the other measures look balanced. Let's tune the model to see if it improves.
Note that we will use the oversampled training data for the tuned model, due to the class imbalance issue in our target variable
set.seed(41972795)
tuneR <- tuneRF(x=train_os[,preds], #Use only predictor variables
y= train_os$DEFAULT, #Target variable
ntreeTry = 500, #Use 500 trees in the forest
improve = 1e-5,
stepFactor = 2,
doBest = TRUE #create optimal model
)
## mtry = 4 OOB error = 4.59%
## Searching left ...
## mtry = 2 OOB error = 5.26%
## -0.1459854 1e-05
## Searching right ...
## mtry = 8 OOB error = 4.59%
## 0 1e-05
tuneR
##
## Call:
## randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 4.66%
## Confusion matrix:
## 0 1 class.error
## 0 1378 113 0.07578806
## 1 26 1465 0.01743796
mtry = 4 and mtry = 8 tie for the lowest OOB error (4.59%); tuneRF keeps the first minimum, so the final model uses mtry = 4
varImpPlot(x = tuneR, main = "Variable Importance Plot")
PAY_1 is still the most important variable
We use the predict() function to obtain predictions for the test data set. Then, we use the confusionMatrix() function to obtain testing performance measures.
tune.trRFpreds <- predict(object = tuneR, # Tuned RF model
type = "class") # class predictions
RF_ttrain_conf <- confusionMatrix(data = tune.trRFpreds, # predictions
reference = train_os$DEFAULT, # actual
positive = "1",
mode = "everything")
tune.teRFpreds <- predict(object = tuneR,
newdata=test,
type = "class")
We use the class predictions and actual class in the confusionMatrix() function to obtain testing performance.
RF_te_conf <- confusionMatrix(data = tune.teRFpreds, #predictions
reference = test$DEFAULT,#actual.
positive = "1",
mode = "everything")
RF_te_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 243 37
## 1 19 37
##
## Accuracy : 0.8333
## 95% CI : (0.7891, 0.8716)
## No Information Rate : 0.7798
## P-Value [Acc > NIR] : 0.00903
##
## Kappa : 0.4684
##
## Mcnemar's Test P-Value : 0.02310
##
## Sensitivity : 0.5000
## Specificity : 0.9275
## Pos Pred Value : 0.6607
## Neg Pred Value : 0.8679
## Precision : 0.6607
## Recall : 0.5000
## F1 : 0.5692
## Prevalence : 0.2202
## Detection Rate : 0.1101
## Detection Prevalence : 0.1667
## Balanced Accuracy : 0.7137
##
## 'Positive' Class : 1
##
The model is still fairly weak, although sensitivity improves slightly (from 0.47 to 0.50).
cbind(Training = RF_ttrain_conf$overall,
Testing = RF_te_conf$overall)
## Training Testing
## Accuracy 9.533870e-01 0.833333333
## Kappa 9.067740e-01 0.468354430
## AccuracyLower 9.451961e-01 0.789094294
## AccuracyUpper 9.606727e-01 0.871577720
## AccuracyNull 5.000000e-01 0.779761905
## AccuracyPValue 0.000000e+00 0.009029752
## McnemarPValue 2.999377e-13 0.023103394
cbind(Training = RF_ttrain_conf$byClass,
Testing = RF_te_conf$byClass)
## Training Testing
## Sensitivity 0.9825620 0.5000000
## Specificity 0.9242119 0.9274809
## Pos Pred Value 0.9283904 0.6607143
## Neg Pred Value 0.9814815 0.8678571
## Precision 0.9283904 0.6607143
## Recall 0.9825620 0.5000000
## F1 0.9547084 0.5692308
## Prevalence 0.5000000 0.2202381
## Detection Rate 0.4912810 0.1101190
## Detection Prevalence 0.5291751 0.1666667
## Balanced Accuracy 0.9533870 0.7137405
The model appears to overfit, especially on sensitivity and precision (near-perfect on the oversampled training data, much lower on testing)
cbind(Base_Mod = RF_btest_conf$byClass,
Tuned_Mod = RF_te_conf$byClass)
## Base_Mod Tuned_Mod
## Sensitivity 0.4729730 0.5000000
## Specificity 0.9618321 0.9274809
## Pos Pred Value 0.7777778 0.6607143
## Neg Pred Value 0.8659794 0.8678571
## Precision 0.7777778 0.6607143
## Recall 0.4729730 0.5000000
## F1 0.5882353 0.5692308
## Prevalence 0.2202381 0.2202381
## Detection Rate 0.1041667 0.1101190
## Detection Prevalence 0.1339286 0.1666667
## Balanced Accuracy 0.7174025 0.7137405
The tuned model’s sensitivity improves slightly, although it has an overfitting issue.
Let’s try feature selection to tackle the overfitting issue
imp <- varImp(DTFit_ROS)
plot(imp)
EDUCATION, PAY_6, MARITAL_STATUS, GENDER and PAY_AMT6 are the least important predictors.
We can try removing them from our list of potential predictors.
FS_preds <- preds[!preds %in% c("EDUCATION","PAY_6","MARITAL_STATUS",
"GENDER", "PAY_AMT6")]
FS_preds
## [1] "ID" "LIMIT_BAL" "AGE" "PAY_1" "PAY_2" "PAY_3"
## [7] "PAY_4" "PAY_5" "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4"
## [13] "BILL_AMT5" "BILL_AMT6" "PAY_AMT1" "PAY_AMT2" "PAY_AMT3" "PAY_AMT4"
## [19] "PAY_AMT5"
set.seed(41972795)
tuneR_FS <- tuneRF(x=train_os[,FS_preds], #Use only predictor variables
y= train_os$DEFAULT, #Target variable
ntreeTry = 500, #Use 500 trees in the forest
improve = 1e-5,
stepFactor = 2,
doBest = TRUE #create optimal model
)
## mtry = 4 OOB error = 5.26%
## Searching left ...
## mtry = 2 OOB error = 5.7%
## -0.08280255 1e-05
## Searching right ...
## mtry = 8 OOB error = 5.06%
## 0.03821656 1e-05
## mtry = 16 OOB error = 5.06%
## 0 1e-05
tuneR_FS
##
## Call:
## randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 4.86%
## Confusion matrix:
## 0 1 class.error
## 0 1370 121 0.08115359
## 1 24 1467 0.01609658
mtry = 8 and mtry = 16 tie for the lowest OOB error (5.06%); tuneRF keeps the first minimum, so the final model uses mtry = 8
varImpPlot(x = tuneR_FS, main = "Variable Importance Plot")
PAY_1 is still the most important variable
We use the predict() function to obtain predictions for the test data set. Then, we use the confusionMatrix() function to obtain testing performance measures.
tune.trRFpreds_FS <- predict(object = tuneR_FS, # Tuned RF model
type = "class") # class predictions
RF_ttrain_conf_FS <- confusionMatrix(data = tune.trRFpreds_FS, # predictions
reference = train_os$DEFAULT, # actual
positive = "1",
mode = "everything")
tune.teRFpreds_FS <- predict(object = tuneR_FS,
newdata=test,
type = "class")
We use the class predictions and actual class in the confusionMatrix() function to obtain testing performance.
RF_te_conf_FS <- confusionMatrix(data = tune.teRFpreds_FS, #predictions
reference = test$DEFAULT,#actual.
positive = "1",
mode = "everything")
RF_te_conf_FS
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 241 33
## 1 21 41
##
## Accuracy : 0.8393
## 95% CI : (0.7956, 0.8769)
## No Information Rate : 0.7798
## P-Value [Acc > NIR] : 0.004062
##
## Kappa : 0.5032
##
## Mcnemar's Test P-Value : 0.134417
##
## Sensitivity : 0.5541
## Specificity : 0.9198
## Pos Pred Value : 0.6613
## Neg Pred Value : 0.8796
## Precision : 0.6613
## Recall : 0.5541
## F1 : 0.6029
## Prevalence : 0.2202
## Detection Rate : 0.1220
## Detection Prevalence : 0.1845
## Balanced Accuracy : 0.7370
##
## 'Positive' Class : 1
##
The model is still fairly weak, although sensitivity improves slightly (from 0.50 to 0.55).
cbind(Training = RF_ttrain_conf_FS$overall,
Testing = RF_te_conf_FS$overall)
## Training Testing
## Accuracy 9.513749e-01 0.839285714
## Kappa 9.027498e-01 0.503176342
## AccuracyLower 9.430331e-01 0.795564478
## AccuracyUpper 9.588160e-01 0.876901224
## AccuracyNull 5.000000e-01 0.779761905
## AccuracyPValue 0.000000e+00 0.004061855
## McnemarPValue 1.556649e-15 0.134416574
cbind(Training = RF_ttrain_conf_FS$byClass,
Testing = RF_te_conf_FS$byClass)
## Training Testing
## Sensitivity 0.9839034 0.5540541
## Specificity 0.9188464 0.9198473
## Pos Pred Value 0.9238035 0.6612903
## Neg Pred Value 0.9827834 0.8795620
## Precision 0.9238035 0.6612903
## Recall 0.9839034 0.5540541
## F1 0.9529068 0.6029412
## Prevalence 0.5000000 0.2202381
## Detection Rate 0.4919517 0.1220238
## Detection Prevalence 0.5325285 0.1845238
## Balanced Accuracy 0.9513749 0.7369507
The model appears to overfit, especially on sensitivity and precision (near-perfect on the oversampled training data, much lower on testing)
cbind(Base_Mod = RF_btest_conf$overall,
Tuned_Mod = RF_te_conf$overall,
FS_Mod = RF_te_conf_FS$overall)
## Base_Mod Tuned_Mod FS_Mod
## Accuracy 8.541667e-01 0.833333333 0.839285714
## Kappa 5.059417e-01 0.468354430 0.503176342
## AccuracyLower 8.118236e-01 0.789094294 0.795564478
## AccuracyUpper 8.901245e-01 0.871577720 0.876901224
## AccuracyNull 7.797619e-01 0.779761905 0.779761905
## AccuracyPValue 3.828486e-04 0.009029752 0.004061855
## McnemarPValue 6.334248e-05 0.023103394 0.134416574
cbind(Base_Mod = RF_btest_conf$byClass,
Tuned_Mod = RF_te_conf$byClass,
FS_Mod = RF_te_conf_FS$byClass)
## Base_Mod Tuned_Mod FS_Mod
## Sensitivity 0.4729730 0.5000000 0.5540541
## Specificity 0.9618321 0.9274809 0.9198473
## Pos Pred Value 0.7777778 0.6607143 0.6612903
## Neg Pred Value 0.8659794 0.8678571 0.8795620
## Precision 0.7777778 0.6607143 0.6612903
## Recall 0.4729730 0.5000000 0.5540541
## F1 0.5882353 0.5692308 0.6029412
## Prevalence 0.2202381 0.2202381 0.2202381
## Detection Rate 0.1041667 0.1101190 0.1220238
## Detection Prevalence 0.1339286 0.1666667 0.1845238
## Balanced Accuracy 0.7174025 0.7137405 0.7369507
The model still has an overfitting issue; we recommend increasing the size of the dataset.
We need to identify highly correlated numeric input (X) variables and exclude them from our predictive model.
First, we obtain the correlation matrix for our numeric variables:
cor_vars <- cor(x = CCD[,nums])
We can start by looking at the symbolic correlation matrix to manually identify correlated variables
symnum(x = cor_vars,
corr = TRUE)
## L BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
## LIMIT_BAL 1
## BILL_AMT1 1
## BILL_AMT2 B 1
## BILL_AMT3 * * 1
## BILL_AMT4 . + * * 1
## BILL_AMT5 . + + + * 1
## BILL_AMT6 . + + + * * 1
## PAY_AMT1
## PAY_AMT2 .
## PAY_AMT3 . . . . .
## PAY_AMT4
## PAY_AMT5 .
## PAY_AMT6
## PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
## LIMIT_BAL
## BILL_AMT1
## BILL_AMT2
## BILL_AMT3
## BILL_AMT4
## BILL_AMT5
## BILL_AMT6
## PAY_AMT1 1
## PAY_AMT2 1
## PAY_AMT3 1
## PAY_AMT4 1
## PAY_AMT5 1
## PAY_AMT6 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
We can use the findCorrelation() function in the caret package to identify redundant variables for us. Setting names = TRUE outputs the names of the variables determined to be redundant. We can then remove them from our CCD dataset and exclude them from our analysis.
high_corrs <- findCorrelation(x = cor_vars,
cutoff = .75,
names = TRUE)
By running a code line with the name of the output object (high_corrs), we can view the names of the redundant variables
high_corrs
## [1] "BILL_AMT5" "BILL_AMT4" "BILL_AMT6" "BILL_AMT2" "BILL_AMT3"
Now, we can remove them from our variable-name vectors so that they are excluded from our list of input (X) variables
nums <- nums[!nums %in% high_corrs]
preds2 <- preds[!preds %in% high_corrs]
We use the naiveBayes() function from the e1071 package to perform NB classification
The default laplace value is 0, meaning Laplace smoothing is not applied. To determine whether we need Laplace smoothing, we look for zero-frequency categories
aggregate(train_os[ ,c(noms,ords)],
by = list(train_os$DEFAULT),
FUN = table)
## Group.1 GENDER.1 GENDER.2 MARITAL_STATUS.0 MARITAL_STATUS.1 MARITAL_STATUS.2
## 1 0 583 908 4 679 796
## 2 1 638 853 0 768 713
## MARITAL_STATUS.3 EDUCATION.1 EDUCATION.2 EDUCATION.3 EDUCATION.4 EDUCATION.5
## 1 12 546 677 230 12 21
## 2 10 442 744 289 0 5
## EDUCATION.6 AGE.21 AGE.22 AGE.23 AGE.24 AGE.25 AGE.26 AGE.27 AGE.28 AGE.29
## 1 5 6 28 44 46 71 58 71 82 72
## 2 11 0 32 54 46 89 45 66 59 38
## AGE.30 AGE.31 AGE.32 AGE.33 AGE.34 AGE.35 AGE.36 AGE.37 AGE.38 AGE.39 AGE.40
## 1 70 64 66 61 62 56 58 48 42 55 43
## 2 66 66 74 61 54 80 24 37 48 39 32
## AGE.41 AGE.42 AGE.43 AGE.44 AGE.45 AGE.46 AGE.47 AGE.48 AGE.49 AGE.50 AGE.51
## 1 45 40 35 25 35 27 34 24 22 17 11
## 2 33 37 42 36 35 35 38 34 17 24 22
## AGE.52 AGE.53 AGE.54 AGE.55 AGE.56 AGE.57 AGE.58 AGE.59 AGE.60 AGE.61 AGE.62
## 1 18 14 6 7 3 10 1 4 1 2 2
## 2 17 7 16 15 19 10 10 19 3 0 0
## AGE.63 AGE.64 AGE.65 AGE.67 AGE.71 AGE.75 PAY_1.9 PAY_1.8 PAY_1.7 PAY_1.6
## 1 2 0 2 0 1 0 0 0 0 0
## 2 0 6 3 3 0 0 0 0 0 0
## PAY_1.5 PAY_1.4 PAY_1.3 PAY_1.2 PAY_1.1 PAY_1.0 PAY_1.-1 PAY_1.-2 PAY_2.9
## 1 0 1 5 62 145 824 308 146 0
## 2 0 6 45 398 308 419 218 97 0
## PAY_2.8 PAY_2.7 PAY_2.6 PAY_2.5 PAY_2.4 PAY_2.3 PAY_2.2 PAY_2.1 PAY_2.0
## 1 0 0 0 0 2 6 105 1 850
## 2 0 0 0 4 3 57 487 4 547
## PAY_2.-1 PAY_2.-2 PAY_3.9 PAY_3.8 PAY_3.7 PAY_3.6 PAY_3.5 PAY_3.4 PAY_3.3
## 1 334 193 0 0 1 0 1 1 5
## 2 222 167 0 0 0 0 0 8 20
## PAY_3.2 PAY_3.1 PAY_3.0 PAY_3.-1 PAY_3.-2 PAY_4.9 PAY_4.8 PAY_4.7 PAY_4.6
## 1 130 0 820 328 205 0 0 1 0
## 2 445 4 623 202 189 0 0 3 0
## PAY_4.5 PAY_4.4 PAY_4.3 PAY_4.2 PAY_4.1 PAY_4.0 PAY_4.-1 PAY_4.-2 PAY_5.9
## 1 1 1 4 106 0 856 311 211 0
## 2 5 8 36 384 4 669 158 224 0
## PAY_5.8 PAY_5.7 PAY_5.6 PAY_5.5 PAY_5.4 PAY_5.3 PAY_5.2 PAY_5.1 PAY_5.0
## 1 0 1 0 0 2 5 78 0 876
## 2 0 3 0 0 12 23 355 0 697
## PAY_5.-1 PAY_5.-2 PAY_6.9 PAY_6.8 PAY_6.7 PAY_6.6 PAY_6.5 PAY_6.4 PAY_6.3
## 1 303 226 0 0 1 0 0 1 7
## 2 169 232 0 0 3 0 0 9 24
## PAY_6.2 PAY_6.1 PAY_6.0 PAY_6.-1 PAY_6.-2
## 1 92 0 843 305 242
## 2 373 0 627 222 233
Since we have categories with 0 frequency, we will set laplace = 1
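As a concrete check of what laplace = 1 does: EDUCATION = 4 never occurs in the default class of train_os, yet its fitted conditional probability (visible in the model output below) is nonzero. A minimal sketch of the smoothed calculation:
count <- 0 # EDUCATION = 4 observations in the DEFAULT = 1 class
n <- 1491 # rows in the DEFAULT = 1 class of train_os
k <- 6 # number of EDUCATION levels
(count + 1) / (n + k) # = 0.000668, matching the model output below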
nb_mod <- naiveBayes(x = train_os[ ,preds2],
y = train_os$DEFAULT,
laplace = 1)
# View the model output
nb_mod
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = train_os[, preds2], y = train_os$DEFAULT,
## laplace = 1)
##
## A-priori probabilities:
## train_os$DEFAULT
## 0 1
## 0.5 0.5
##
## Conditional probabilities:
## ID
## train_os$DEFAULT [,1] [,2]
## 0 15044.61 8834.044
## 1 15329.48 8677.338
##
## LIMIT_BAL
## train_os$DEFAULT [,1] [,2]
## 0 179470.2 137591.7
## 1 131522.5 121747.5
##
## GENDER
## train_os$DEFAULT 1 2
## 0 0.3911587 0.6088413
## 1 0.4279973 0.5720027
##
## EDUCATION
## train_os$DEFAULT 1 2 3 4
## 0 0.3653974616 0.4529058116 0.1543086172 0.0086840347
## 1 0.2959251837 0.4976619906 0.1937207749 0.0006680027
## EDUCATION
## train_os$DEFAULT 5 6
## 0 0.0146960588 0.0040080160
## 1 0.0040080160 0.0080160321
##
## MARITAL_STATUS
## train_os$DEFAULT 0 1 2 3
## 0 0.0033444816 0.4548494983 0.5331103679 0.0086956522
## 1 0.0006688963 0.5143812709 0.4775919732 0.0073578595
##
## AGE
## train_os$DEFAULT 21 22 23 24
## 0 0.0045484081 0.0188434048 0.0292397661 0.0305393112
## 1 0.0006497726 0.0214424951 0.0357374919 0.0305393112
## AGE
## train_os$DEFAULT 25 26 27 28
## 0 0.0467836257 0.0383365822 0.0467836257 0.0539311241
## 1 0.0584795322 0.0298895387 0.0435347628 0.0389863548
## AGE
## train_os$DEFAULT 29 30 31 32
## 0 0.0474333983 0.0461338532 0.0422352177 0.0435347628
## 1 0.0253411306 0.0435347628 0.0435347628 0.0487329435
## AGE
## train_os$DEFAULT 33 34 35 36
## 0 0.0402858999 0.0409356725 0.0370370370 0.0383365822
## 1 0.0402858999 0.0357374919 0.0526315789 0.0162443145
## AGE
## train_os$DEFAULT 37 38 39 40
## 0 0.0318388564 0.0279402209 0.0363872645 0.0285899935
## 1 0.0246913580 0.0318388564 0.0259909032 0.0214424951
## AGE
## train_os$DEFAULT 41 42 43 44
## 0 0.0298895387 0.0266406758 0.0233918129 0.0168940871
## 1 0.0220922677 0.0246913580 0.0279402209 0.0240415854
## AGE
## train_os$DEFAULT 45 46 47 48
## 0 0.0233918129 0.0181936322 0.0227420403 0.0162443145
## 1 0.0233918129 0.0233918129 0.0253411306 0.0227420403
## AGE
## train_os$DEFAULT 49 50 51 52
## 0 0.0149447693 0.0116959064 0.0077972710 0.0123456790
## 1 0.0116959064 0.0162443145 0.0149447693 0.0116959064
## AGE
## train_os$DEFAULT 53 54 55 56
## 0 0.0097465887 0.0045484081 0.0051981806 0.0025990903
## 1 0.0051981806 0.0110461339 0.0103963613 0.0129954516
## AGE
## train_os$DEFAULT 57 58 59 60
## 0 0.0071474984 0.0012995452 0.0032488629 0.0012995452
## 1 0.0071474984 0.0071474984 0.0129954516 0.0025990903
## AGE
## train_os$DEFAULT 61 62 63 64
## 0 0.0019493177 0.0019493177 0.0019493177 0.0006497726
## 1 0.0006497726 0.0006497726 0.0006497726 0.0045484081
## AGE
## train_os$DEFAULT 65 67 71 75
## 0 0.0019493177 0.0006497726 0.0012995452 0.0006497726
## 1 0.0025990903 0.0025990903 0.0006497726 0.0006497726
##
## PAY_1
## train_os$DEFAULT 9 8 7 6 5
## 0 0.000665336 0.000665336 0.000665336 0.000665336 0.000665336
## 1 0.000665336 0.000665336 0.000665336 0.000665336 0.000665336
## PAY_1
## train_os$DEFAULT 4 3 2 1 0
## 0 0.001330672 0.003992016 0.041916168 0.097139055 0.548902196
## 1 0.004657352 0.030605456 0.265469062 0.205588822 0.279441118
## PAY_1
## train_os$DEFAULT -1 -2
## 0 0.205588822 0.097804391
## 1 0.145708583 0.065202927
##
## PAY_2
## train_os$DEFAULT 9 8 7 6 5
## 0 0.000665336 0.000665336 0.000665336 0.000665336 0.000665336
## 1 0.000665336 0.000665336 0.000665336 0.000665336 0.003326680
## PAY_2
## train_os$DEFAULT 4 3 2 1 0
## 0 0.001996008 0.004657352 0.070525615 0.001330672 0.566200931
## 1 0.002661344 0.038589488 0.324683965 0.003326680 0.364604125
## PAY_2
## train_os$DEFAULT -1 -2
## 0 0.222887558 0.129075183
## 1 0.148369927 0.111776447
##
## PAY_3
## train_os$DEFAULT 9 8 7 6 5
## 0 0.000665336 0.000665336 0.001330672 0.000665336 0.001330672
## 1 0.000665336 0.000665336 0.000665336 0.000665336 0.000665336
## PAY_3
## train_os$DEFAULT 4 3 2 1 0
## 0 0.001330672 0.003992016 0.087159015 0.000665336 0.546240852
## 1 0.005988024 0.013972056 0.296739854 0.003326680 0.415169661
## PAY_3
## train_os$DEFAULT -1 -2
## 0 0.218895542 0.137059215
## 1 0.135063207 0.126413839
##
## PAY_4
## train_os$DEFAULT 9 8 7 6 5
## 0 0.000665336 0.000665336 0.001330672 0.000665336 0.001330672
## 1 0.000665336 0.000665336 0.002661344 0.000665336 0.003992016
## PAY_4
## train_os$DEFAULT 4 3 2 1 0
## 0 0.001330672 0.003326680 0.071190951 0.000665336 0.570192947
## 1 0.005988024 0.024617432 0.256154358 0.003326680 0.445775116
## PAY_4
## train_os$DEFAULT -1 -2
## 0 0.207584830 0.141051231
## 1 0.105788423 0.149700599
##
## PAY_5
## train_os$DEFAULT 9 8 7 6 5
## 0 0.000665336 0.000665336 0.001330672 0.000665336 0.000665336
## 1 0.000665336 0.000665336 0.002661344 0.000665336 0.000665336
## PAY_5
## train_os$DEFAULT 4 3 2 1 0
## 0 0.001996008 0.003992016 0.052561544 0.000665336 0.583499667
## 1 0.008649368 0.015968064 0.236859614 0.000665336 0.464404524
## PAY_5
## train_os$DEFAULT -1 -2
## 0 0.202262142 0.151031271
## 1 0.113107119 0.155023287
##
## PAY_6
## train_os$DEFAULT 9 8 7 6 5
## 0 0.000665336 0.000665336 0.001330672 0.000665336 0.000665336
## 1 0.000665336 0.000665336 0.002661344 0.000665336 0.000665336
## PAY_6
## train_os$DEFAULT 4 3 2 1 0
## 0 0.001330672 0.005322688 0.061876248 0.000665336 0.561543580
## 1 0.006653360 0.016633400 0.248835662 0.000665336 0.417831005
## PAY_6
## train_os$DEFAULT -1 -2
## 0 0.203592814 0.161676647
## 1 0.148369927 0.155688623
##
## BILL_AMT1
## train_os$DEFAULT [,1] [,2]
## 0 49312.21 76076.95
## 1 46954.28 72542.87
##
## PAY_AMT1
## train_os$DEFAULT [,1] [,2]
## 0 5664.924 13582.00
## 1 3992.181 11962.79
##
## PAY_AMT2
## train_os$DEFAULT [,1] [,2]
## 0 6290.263 16629.66
## 1 3075.388 5232.08
##
## PAY_AMT3
## train_os$DEFAULT [,1] [,2]
## 0 6041.785 28418.46
## 1 4431.292 25384.78
##
## PAY_AMT4
## train_os$DEFAULT [,1] [,2]
## 0 4941.826 13558.810
## 1 2659.176 5960.767
##
## PAY_AMT5
## train_os$DEFAULT [,1] [,2]
## 0 5361.818 17891.53
## 1 3120.036 10257.66
##
## PAY_AMT6
## train_os$DEFAULT [,1] [,2]
## 0 5699.572 18696.19
## 1 4290.451 17714.00
To assess the goodness of fit of the model, we compare the training and testing performance.
First, we use the predict() function to obtain the class predictions (type = “class”) for the training data based on our NB model.
nb.train <- predict(object = nb_mod, # NB model
newdata = train_os[ ,preds2], # predictors
type = "class")
We can use the confusionMatrix() function from the caret package to obtain a confusion matrix and performance measures for our model applied to the oversampled training dataset (train_os). We set mode = “everything” to obtain all available performance measures, and we identify the “1” class as positive, since that is the class we are more interested in predicting. We save the result so that we can make comparisons later.
train_conf <- confusionMatrix(data = nb.train, # predictions
reference = train_os$DEFAULT, # actual
positive = "1", # this is the class we're more interested
mode = "everything") # to get all performance measures
train_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 525 193
## 1 966 1298
##
## Accuracy : 0.6113
## 95% CI : (0.5936, 0.6289)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2227
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8706
## Specificity : 0.3521
## Pos Pred Value : 0.5733
## Neg Pred Value : 0.7312
## Precision : 0.5733
## Recall : 0.8706
## F1 : 0.6913
## Prevalence : 0.5000
## Detection Rate : 0.4353
## Detection Prevalence : 0.7592
## Balanced Accuracy : 0.6113
##
## 'Positive' Class : 1
##
To assess model performance, we focus on the performance of the model applied to the testing set. Next, we use the predict() function to obtain the class predictions (type = “class”) for the testing data based on our NB model.
nb.test <- predict(object = nb_mod, # NB model
newdata = test[ ,preds2], # predictors
type = "class")
Again, we use the confusionMatrix() function from the caret package to obtain a confusion matrix and obtain performance measures for our model, this time applied to the testing dataset (test).
test_conf <- confusionMatrix(data = nb.test,
reference = test$DEFAULT,
positive = "1",
mode = "everything")
test_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 96 10
## 1 166 64
##
## Accuracy : 0.4762
## 95% CI : (0.4217, 0.5311)
## No Information Rate : 0.7798
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1317
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8649
## Specificity : 0.3664
## Pos Pred Value : 0.2783
## Neg Pred Value : 0.9057
## Precision : 0.2783
## Recall : 0.8649
## F1 : 0.4211
## Prevalence : 0.2202
## Detection Rate : 0.1905
## Detection Prevalence : 0.6845
## Balanced Accuracy : 0.6156
##
## 'Positive' Class : 1
##
We can describe the overall performance based on our accuracy and kappa values.
test_conf$overall[c("Accuracy", "Kappa")]
## Accuracy Kappa
## 0.4761905 0.1316810
We can also describe class-level performance. Note that above we set positive = “1”, since we are more interested in predicting customers who will default on their credit card
test_conf$byClass
## Sensitivity Specificity Pos Pred Value
## 0.8648649 0.3664122 0.2782609
## Neg Pred Value Precision Recall
## 0.9056604 0.2782609 0.8648649
## F1 Prevalence Detection Rate
## 0.4210526 0.2202381 0.1904762
## Detection Prevalence Balanced Accuracy
## 0.6845238 0.6156385
To assess whether the model is balanced, underfitting, or overfitting, we compare its performance on the training and testing sets, using the cbind() function to place the measures side-by-side.
We want to see similar accuracy and kappa measures on the training and testing sets
cbind(Training = train_conf$overall,
Testing = test_conf$overall)
## Training Testing
## Accuracy 6.113347e-01 4.761905e-01
## Kappa 2.226693e-01 1.316810e-01
## AccuracyLower 5.935694e-01 4.217167e-01
## AccuracyUpper 6.288812e-01 5.310875e-01
## AccuracyNull 5.000000e-01 7.797619e-01
## AccuracyPValue 1.720275e-34 1.000000e+00
## McnemarPValue 7.647226e-114 1.546704e-31
cbind(Training = train_conf$byClass,
Testing = test_conf$byClass)
## Training Testing
## Sensitivity 0.8705567 0.8648649
## Specificity 0.3521127 0.3664122
## Pos Pred Value 0.5733216 0.2782609
## Neg Pred Value 0.7311978 0.9056604
## Precision 0.5733216 0.2782609
## Recall 0.8705567 0.8648649
## F1 0.6913449 0.4210526
## Prevalence 0.5000000 0.2202381
## Detection Rate 0.4352783 0.1904762
## Detection Prevalence 0.7592220 0.6845238
## Balanced Accuracy 0.6113347 0.6156385
As shown, performance is similar on the training and testing sets, so we can conclude that the NB model is balanced, although accuracy and kappa are low (expected, since we oversampled the training data). It also has by far the highest sensitivity (true positive rate) of our models, meaning it correctly identifies most of the positive (Default) class; however, its kappa of 0.13 indicates poor agreement between predicted and observed classes.
cbind(DT = ROS_CM$byClass,
NB = test_conf$byClass)
## DT NB
## Sensitivity 0.6081081 0.8648649
## Specificity 0.7977099 0.3664122
## Pos Pred Value 0.4591837 0.2782609
## Neg Pred Value 0.8781513 0.9056604
## Precision 0.4591837 0.2782609
## Recall 0.6081081 0.8648649
## F1 0.5232558 0.4210526
## Prevalence 0.2202381 0.2202381
## Detection Rate 0.1339286 0.1904762
## Detection Prevalence 0.2916667 0.6845238
## Balanced Accuracy 0.7029090 0.6156385
cbind(DT = ROS_CM$overall,
NB = test_conf$overall)
## DT NB
## Accuracy 0.75595238 4.761905e-01
## Kappa 0.36351876 1.316810e-01
## AccuracyLower 0.70639657 4.217167e-01
## AccuracyUpper 0.80093257 5.310875e-01
## AccuracyNull 0.77976190 7.797619e-01
## AccuracyPValue 0.86785335 1.000000e+00
## McnemarPValue 0.01108762 1.546704e-31
It seems that NB is our best model among the three (DT, RF, NB) in terms of sensitivity (true positive rate). This is also the model we recommend to the credit card company, since we are most interested in identifying customers who are likely to default. Accuracy and kappa are both modest, partly because we oversampled our training data. Therefore, we should gather more data, or use an alternative method, to predict the positive class (“Default”) better.
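One concrete alternative worth exploring (a sketch, not part of the analysis above): tune the decision threshold on the NB posterior probabilities. Raising the cut-off above the default 0.5 trades some of NB's high sensitivity for better precision; the 0.7 value below is illustrative, not tuned.
nb.prob <- predict(object = nb_mod, # fitted NB model
                   newdata = test[ ,preds2], # predictors
                   type = "raw") # posterior probabilities instead of classes
thresh.preds <- factor(ifelse(nb.prob[, "1"] > 0.7, "1", "0"),
                       levels = levels(test$DEFAULT))
confusionMatrix(data = thresh.preds, # thresholded predictions
                reference = test$DEFAULT, # actual
                positive = "1",
                mode = "everything")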
save.image(file = "Final_Group18.RData")