# caret package for ML
library(tidyverse, quietly = T)
library(dplyr, quietly = T) # used by caret
library(ggplot2, quietly = T)
library(corrplot, quietly = T)
library(caret, quietly = T)
library(Amelia, quietly = T)
library(gridExtra, quietly = T)
library(xgboost, quietly = T)
library(ROCR, quietly = T)
library(Matrix, quietly = T)
library(rpart.plot, quietly = T)
library(doParallel, quietly = T)
#Import the data
training.raw <- read_csv('AT2_credit_train_STUDENT.csv')
predict.raw <- read_csv('AT2_credit_test_STUDENT.csv')
The training data contains 23,101 observations of 17 variables (this will be further split into training and test sets); the prediction set contains 6,899 observations of 16 variables (the same columns minus default).
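A quick check of those dimensions (a sketch; assumes the two read_csv calls above succeeded):
dim(training.raw) # expect: 23101 17
dim(predict.raw)  # expect: 6899 16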
glimpse(training.raw)
## Observations: 23,101
## Variables: 17
## $ ID <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ LIMIT_BAL <dbl> 20000, 120000, 90000, 50000, 50000, 500000, 100000, ...
## $ SEX <int> 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2...
## $ EDUCATION <int> 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 3...
## $ MARRIAGE <int> 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2...
## $ AGE <int> 24, 26, 34, 37, 37, 29, 23, 28, 35, 34, 51, 41, 30, ...
## $ PAY_PC1 <dbl> 0.47746305, -1.46161240, -0.39330764, -0.39330764, -...
## $ PAY_PC2 <dbl> -3.224589961, 0.853866590, 0.175554996, 0.175554996,...
## $ PAY_PC3 <dbl> 0.145040802, -0.360863449, 0.004885522, 0.004885522,...
## $ AMT_PC1 <dbl> -1.7522077, -1.6633432, -1.1348380, -0.3971748, -0.3...
## $ AMT_PC2 <dbl> -0.22434761, -0.14388558, -0.17660872, -0.45109777, ...
## $ AMT_PC3 <dbl> -0.077841055, -0.054600404, 0.015954854, -0.09978950...
## $ AMT_PC4 <dbl> 0.006957244, -0.002851947, -0.129071306, -0.03533896...
## $ AMT_PC5 <dbl> -0.041356958, 0.043889122, 0.098245278, -0.055306582...
## $ AMT_PC6 <dbl> 0.0008865935, -0.0261897987, -0.0223825102, 0.050465...
## $ AMT_PC7 <dbl> -0.05626505, -0.09997756, -0.06898686, -0.02820475, ...
## $ default <chr> "Y", "Y", "N", "N", "N", "N", "N", "Y", "N", "N", "N...
training.raw$default <- as.factor(training.raw$default)
training.raw$SEX <- as.factor(training.raw$SEX)
training.raw$MARRIAGE <- as.factor(training.raw$MARRIAGE)
training.raw$AGE <- as.numeric(training.raw$AGE)
# remove random anomalies
# remove NA - the 2 rows where SEX was recorded as cat/dog
training.raw <- training.raw[complete.cases(training.raw), ]
# floor negative balances at 0. Max LIMIT_BAL is 760K in the test set and 800K in
# the validation set, so replace anything above 750K with the median (140K).
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL <= 0] <- 0
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL > 750000] <- 140000
# collapse EDUCATION: levels above 4, and 0, all become 4 ('Other')
training.raw$EDUCATION[training.raw$EDUCATION %in% c(0, 5, 6)] <- 4
training.raw$EDUCATION <- factor(training.raw$EDUCATION)
# clean MARRIAGE: recode 0 as 3 ('Other') before factoring so no empty level remains
training.raw$MARRIAGE[training.raw$MARRIAGE == 0] <- 3
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE)
# let's label our factors properly
training.raw$SEX <- factor(training.raw$SEX,
                           levels = c('1', '2'),
                           labels = c('Male', 'Female'))
training.raw$EDUCATION <- factor(training.raw$EDUCATION,
                                 levels = c('1', '2', '3', '4'),
                                 labels = c('Grad_School', 'Uni', 'High_School', 'Other'))
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE,
                                levels = c('1', '2', '3'),
                                labels = c('Married', 'Single', 'Other'))
summary(training.raw)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : 0 Male : 9244 Grad_School: 8192
## 1st Qu.: 7488 1st Qu.: 50000 Female:13854 Uni :10722
## Median :14989 Median :140000 High_School: 3820
## Mean :14982 Mean :167397 Other : 364
## 3rd Qu.:22453 3rd Qu.:240000
## Max. :30000 Max. :750000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Married:10507 Min. : 21.0 Min. :-11.859675 Min. :-4.42243
## Single :12304 1st Qu.: 28.0 1st Qu.: -0.393308 1st Qu.:-0.23617
## Other : 287 Median : 34.0 Median : -0.393308 Median : 0.17555
## Mean : 35.7 Mean : -0.001513 Mean :-0.00187
## 3rd Qu.: 41.0 3rd Qu.: 1.360047 3rd Qu.: 0.36042
## Max. :141.0 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50825 1st Qu.:-0.42972
## Median : 0.004886 Median :-0.86429 Median :-0.20781
## Mean : 0.000605 Mean : 0.00431 Mean : 0.00124
## 3rd Qu.: 0.093942 3rd Qu.: 0.49754 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13709 1st Qu.: -0.068216 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00396 Mean : 0.004458 Mean : 0.00144
## 3rd Qu.: 0.00325 3rd Qu.: 0.083214 3rd Qu.: 0.02637
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09262 Y: 5580
## Median : -0.00216 Median : -0.04099
## Mean : -0.00159 Mean : -0.00406
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
cplot <- training.raw %>%
  select(-default) %>%
  select_if(is.numeric)
M <- cor(cplot)
p.mat <- cor.mtest(cplot) # significance tests run on the data, not the correlation matrix
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M,
         method = "color",
         order = "hclust",
         type = "full",
         col = col(200),
         diag = F,
         title = "Correlation of Numeric Variables",
         addCoef.col = "black",
         p.mat = p.mat$p, # without this, sig.level and insig have no effect
         sig.level = 0.05,
         insig = "blank",
         mar = c(0, 0, 3, 0))
# ages of 70 and above look implausible; set them to NA so they can be imputed
training.raw$AGE <- ifelse(training.raw$AGE >= 70, NA, training.raw$AGE)
map_int(training.raw,~sum(is.na(.x)))
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1
## 0 0 0 0 0 69 0
## PAY_PC2 PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5
## 0 0 0 0 0 0 0
## AMT_PC6 AMT_PC7 default
## 0 0 0
There are 69 missing ages - less than 1% of the dataset in the scheme of things, but let's try using rpart to predict the missing ages.
# drop the ID column before modelling
train.imp <- training.raw[-1]
map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 69 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
head(train.imp)
## # A tibble: 6 x 16
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3 AMT_PC1
## <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20000 Fema~ Uni Married 24 0.477 -3.22 0.145 -1.75
## 2 120000 Fema~ Uni Single 26 -1.46 0.854 -0.361 -1.66
## 3 90000 Fema~ Uni Single 34 -0.393 0.176 0.00489 -1.13
## 4 50000 Fema~ Uni Married 37 -0.393 0.176 0.00489 -0.397
## 5 50000 Male Grad_Sch~ Single 37 -0.393 0.176 0.00489 -0.393
## 6 500000 Male Grad_Sch~ Single 29 -0.393 0.176 0.00489 15.7
## # ... with 7 more variables: AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>,
## # AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>
# run in parallel
cl <- makeCluster(detectCores())
registerDoParallel(cl)
# (release the workers at the end of the analysis with stopCluster(cl); registerDoSEQ())
age.predictors <- train.imp %>%
  filter(complete.cases(.))
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5)
rpartGrid <- data.frame(maxdepth = seq(2, 10, 1))
rpartFit_ageimputation <- train(x = age.predictors[, -5], # drop AGE itself (column 5), the imputation target
                                y = age.predictors$AGE,
                                method = 'rpart2',
                                trControl = ctrl,
                                tuneGrid = rpartGrid)
## Warning: Setting row names on a tibble is deprecated.
rpartFit_ageimputation
plot(rpartFit_ageimputation)
# Tree depth of 7 is optimal
rpart.plot(rpartFit_ageimputation$finalModel)
#save the model externally to load back in
saveRDS(rpartFit_ageimputation, file = 'rpartFit_ageimputation.rds')
#insert missing age back into the dataset
missing_age <- is.na(train.imp$AGE)
age.predicted <- predict(rpartFit_ageimputation, newdata = train.imp[missing_age,])
train.imp[missing_age, 'AGE'] <- age.predicted
summary(train.imp)
## LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 0 Male : 9244 Grad_School: 8192 Married:10507
## 1st Qu.: 50000 Female:13854 Uni :10722 Single :12304
## Median :140000 High_School: 3820 Other : 287
## Mean :167397 Other : 364
## 3rd Qu.:240000
## Max. :750000
## AGE PAY_PC1 PAY_PC2
## Min. :21.00 Min. :-11.859675 Min. :-4.42243
## 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.23617
## Median :34.00 Median : -0.393308 Median : 0.17555
## Mean :35.45 Mean : -0.001513 Mean :-0.00187
## 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.36042
## Max. :69.00 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50825 1st Qu.:-0.42972
## Median : 0.004886 Median :-0.86429 Median :-0.20781
## Mean : 0.000605 Mean : 0.00431 Mean : 0.00124
## 3rd Qu.: 0.093942 3rd Qu.: 0.49754 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13709 1st Qu.: -0.068216 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00396 Mean : 0.004458 Mean : 0.00144
## 3rd Qu.: 0.00325 3rd Qu.: 0.083214 3rd Qu.: 0.02637
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09262 Y: 5580
## Median : -0.00216 Median : -0.04099
## Mean : -0.00159 Mean : -0.00406
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
set.seed(42)
training_rows <- createDataPartition(y = train.imp$default, p=0.8, list=F)
test.imp <- train.imp[-training_rows, ]
train.imp <- train.imp[training_rows, ]
dim(train.imp)
## [1] 18479 16
dim(test.imp)
## [1] 4619 16
map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 0 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
No missing values remain.
default is the response variable: a binary Y/N variable, and therefore appropriate for our classification problem.
round(prop.table(table(train.imp$default)),2)
##
## N Y
## 0.76 0.24
round(prop.table(table(train.imp$SEX)),2)
##
## Male Female
## 0.4 0.6
round(prop.table(table(train.imp$EDUCATION)),2)
##
## Grad_School Uni High_School Other
## 0.35 0.46 0.17 0.02
round(prop.table(table(train.imp$MARRIAGE)),2)
##
## Married Single Other
## 0.45 0.53 0.01
76% of the dataset did not default and 24% defaulted - a classic imbalanced dataset. We will need to deal with this either prior to modelling or by letting caret resample for us; an explicit version is sketched below.
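caret's downSample() is the explicit version of the sampling = 'down' option passed to trainControl() later; a minimal sketch on the training split:
set.seed(42)
down.train <- downSample(x = train.imp[, setdiff(names(train.imp), 'default')],
                         y = train.imp$default,
                         yname = 'default')
round(prop.table(table(down.train$default)), 2) # classes now balanced at 0.5 / 0.5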
The first step is to look at all the available variables using the ggplot2 framework for visuals.
LIMIT_BAL: in the raw data the limit balance begins at -99, with a median of 140,000, a mean of 167,880 and a max of 1,000,000. AGE shows many outliers beyond 100: it begins at 21 with a median of 34, a mean of 35.65 and a max of 141.
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
p1 <- ggplot(data=train.imp, aes(x=LIMIT_BAL)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip() +
scale_x_continuous(labels=comma)
p2 <- ggplot(data=train.imp, aes(x=AGE)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p1,p2, nrow=1)
ggplot(data=train.imp, aes(x=LIMIT_BAL, y=AGE)) +
geom_jitter(aes(colour=default)) +
coord_flip() +
scale_x_continuous(label = comma)
p3 <- ggplot(data=train.imp, aes(x=PAY_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p4 <- ggplot(data=train.imp, aes(x=PAY_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p5 <- ggplot(data=train.imp, aes(x=PAY_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p3,p4,p5, nrow=1)
p6 <- ggplot(data=train.imp, aes(x=AMT_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p7 <- ggplot(data=train.imp, aes(x=AMT_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p8 <- ggplot(data=train.imp, aes(x=AMT_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p9 <- ggplot(data=train.imp, aes(x=AMT_PC4)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p10 <- ggplot(data=train.imp, aes(x=AMT_PC5)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p11 <- ggplot(data=train.imp, aes(x=AMT_PC6)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p12 <- ggplot(data=train.imp, aes(x=AMT_PC7)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p6,p7,p8,p9,p10,p11,p12, nrow=2)
#### Categorical Variables
SEX is roughly 40% male and 60% female. EDUCATION is dominated by Uni (46%) and Grad_School (35%), with High_School (17%) and Other (2%) making up the rest. MARRIAGE is mostly Single (53%) and Married (45%), with a small Other group.
get_legend <- function(myggplot) {
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}
p <- lapply(X = c('SEX', 'EDUCATION', 'MARRIAGE'),
FUN = function(x) ggplot(data = train.imp) +
aes_string(x=x, fill = 'default') +
geom_bar(position="dodge") +
theme(legend.position="none"))
legend <- get_legend(ggplot(data = train.imp, aes(x=SEX, fill = default)) +
geom_bar())
grid.arrange(p[[1]],p[[2]],p[[3]],
legend, layout_matrix = cbind(c(1,2,3),
c(4,5,3),
c(6,6,6)),
widths=c(3,3,3))
# read in the saved model if it's on hand; if not, run the training code below
# xgbFitD <- readRDS('xgbFitD.rds')
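One way to make that comment executable is a file-existence guard (a sketch; assumes the saved fit sits in the working directory):
if (file.exists('xgbFitD.rds')) {
  xgbFitD <- readRDS('xgbFitD.rds') # reuse the saved fit and skip retraining
}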
# control measures
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     verboseIter = F,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     sampling = 'down',
                     savePredictions = T)
# grid expansion
xgbGrid <- expand.grid(
  nrounds = seq(100, 200, 1000), # number of boosting rounds; this seq() evaluates to the single value 100
  max_depth = c(2, 6),           # the maximum depth of a tree
  eta = c(0.01, 0.1, 0.5),       # step size shrinkage (smaller is less likely to overfit)
  gamma = 0,                     # minimum loss reduction (larger is more conservative)
  colsample_bytree = c(0.5, 1),  # subsample ratio of columns when constructing each tree
  min_child_weight = 1,          # minimum sum of instance weights (hessian) needed in a child
  subsample = c(0.5, 1)          # subsample ratio of training instances; 0.5 uses half the data
)
xgbFitD <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
PAY_PC2 +
AMT_PC2 +
AMT_PC5 +
EDUCATION +
PAY_PC3 +
AMT_PC7 +
AMT_PC4 +
AMT_PC6 +
AMT_PC3,
train.imp,
method = 'xgbTree',
trControl = ctrl,
tuneGrid = xgbGrid,
metric = "ROC"
)
saveRDS(xgbFitD, file = 'xgbFitD.rds') # file name matched to the readRDS above
print(xgbFitD, details=F)
## eXtreme Gradient Boosting
##
## 18479 samples
## 13 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16632, 16632, 16631, 16631, 16630, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## eta max_depth colsample_bytree subsample ROC Sens
## 0.01 2 0.5 0.5 0.7597945 0.7739848
## 0.01 2 0.5 1.0 0.7585042 0.7664497
## 0.01 2 1.0 0.5 0.7547281 0.8184378
## 0.01 2 1.0 1.0 0.7495721 0.8163965
## 0.01 6 0.5 0.5 0.7887507 0.7761964
## 0.01 6 0.5 1.0 0.7895069 0.7809485
## 0.01 6 1.0 0.5 0.7976870 0.8087481
## 0.01 6 1.0 1.0 0.7961267 0.8089479
## 0.10 2 0.5 0.5 0.7834335 0.7660357
## 0.10 2 0.5 1.0 0.7846638 0.7705592
## 0.10 2 1.0 0.5 0.7866898 0.7725436
## 0.10 2 1.0 1.0 0.7871512 0.7801491
## 0.10 6 0.5 0.5 0.7912204 0.7659651
## 0.10 6 0.5 1.0 0.7967229 0.7781240
## 0.10 6 1.0 0.5 0.7934491 0.7655792
## 0.10 6 1.0 1.0 0.8002317 0.7827609
## 0.50 2 0.5 0.5 0.7736018 0.7379077
## 0.50 2 0.5 1.0 0.7816663 0.7559464
## 0.50 2 1.0 0.5 0.7748588 0.7401922
## 0.50 2 1.0 1.0 0.7854065 0.7596715
## 0.50 6 0.5 0.5 0.7325908 0.6821402
## 0.50 6 0.5 1.0 0.7643370 0.7198992
## 0.50 6 1.0 0.5 0.7366725 0.6864931
## 0.50 6 1.0 1.0 0.7680423 0.7217689
## Spec
## 0.6104377
## 0.6140657
## 0.5425613
## 0.5392483
## 0.6805975
## 0.6783149
## 0.6552417
## 0.6502673
## 0.6690842
## 0.6678276
## 0.6724007
## 0.6636607
## 0.6847234
## 0.6812767
## 0.6827077
## 0.6794376
## 0.6777749
## 0.6806444
## 0.6805574
## 0.6811808
## 0.6645189
## 0.6806456
## 0.6680117
## 0.6817234
##
## Tuning parameter 'nrounds' was held constant at a value of 100
##
## Tuning parameter 'gamma' was held constant at a value of 0
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 100, max_depth = 6,
## eta = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1
## and subsample = 1.
plot(xgbFitD)
xgb.importance(model = xgbFitD$finalModel) %>%
  xgb.ggplot.importance()
densityplot(xgbFitD,pch='|')
predict(xgbFitD,type = 'prob') -> train.ProbsD
histogram(~Y + N, train.ProbsD)
Elastic net ("elastinet" below) - a mixture of ridge and lasso penalties: alpha = 0 is pure ridge, alpha = 1 is pure lasso, and the grid searches both alpha and the penalty strength lambda.
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
glmnetGrid <- expand.grid(.alpha = c(0,.2,.4,.6,.8,1),
.lambda = seq(10^-10,10^-1,0.02))
glmnetFit <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
EDUCATION +
PAY_PC2 +
PAY_PC3 +
AMT_PC2 +
AMT_PC7,
train.imp,
trControl=ctrl,
method='glmnet',
tuneGrid = glmnetGrid
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(glmnetFit, file = 'glmnetFit.rds')
glmnetFit
## glmnet
##
## 18479 samples
## 9 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16632, 16631, 16631, 16631, 16630, 16632, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.0 1e-10 0.7277187 0.6679416 0.6555589
## 0.0 2e-02 0.7278027 0.6659010 0.6567684
## 0.0 4e-02 0.7279900 0.6600361 0.6618759
## 0.0 6e-02 0.7280595 0.6554838 0.6658196
## 0.0 8e-02 0.7280931 0.6521160 0.6685975
## 0.2 1e-10 0.7275294 0.6736923 0.6529174
## 0.2 2e-02 0.7275913 0.6652301 0.6586516
## 0.2 4e-02 0.7271482 0.6560683 0.6650140
## 0.2 6e-02 0.7263258 0.6514166 0.6705244
## 0.2 8e-02 0.7252698 0.6479205 0.6744220
## 0.4 1e-10 0.7275277 0.6750910 0.6513919
## 0.4 2e-02 0.7271131 0.6653734 0.6577088
## 0.4 4e-02 0.7252763 0.6561976 0.6669392
## 0.4 6e-02 0.7225615 0.6508603 0.6671627
## 0.4 8e-02 0.7194290 0.6454661 0.6712390
## 0.6 1e-10 0.7275992 0.6745193 0.6521539
## 0.6 2e-02 0.7260800 0.6619621 0.6579779
## 0.6 4e-02 0.7223813 0.6531431 0.6644299
## 0.6 6e-02 0.7181876 0.6464364 0.6724941
## 0.6 8e-02 0.7147847 0.6374173 0.6801585
## 0.8 1e-10 0.7273519 0.6742201 0.6531386
## 0.8 2e-02 0.7247687 0.6594079 0.6589631
## 0.8 4e-02 0.7192135 0.6498753 0.6677442
## 0.8 6e-02 0.7144486 0.6383590 0.6794395
## 0.8 8e-02 0.7033890 0.6223331 0.6768873
## 1.0 1e-10 0.7275313 0.6759043 0.6500482
## 1.0 2e-02 0.7235773 0.6616630 0.6574410
## 1.0 4e-02 0.7169133 0.6487198 0.6695377
## 1.0 6e-02 0.7070203 0.6302829 0.6766215
## 1.0 8e-02 0.6907202 0.6120300 0.6684638
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0 and lambda = 0.08.
densityplot(glmnetFit, pch='|')
plot(varImp(glmnetFit),15,main='Elastinet Model')
predict(glmnetFit, type = 'prob') -> train.glmnet.Probs
histogram(~Y + N, train.glmnet.Probs)
k-nearest neighbours - expected to be the weakest of the candidate models.
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
knnGrid <- expand.grid(k=seq(3,23,2))
knnFit <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
EDUCATION +
PAY_PC2 +
PAY_PC3 +
AMT_PC2 +
AMT_PC7,
train.imp,
method = 'knn',
trControl = ctrl,
tuneGrid = knnGrid
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(knnFit, file = 'knnFit.rds')
knnFit
## k-Nearest Neighbors
##
## 18479 samples
## 9 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16631, 16631, 16632, 16631, 16631, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 3 0.6896864 0.6966531 0.6066241
## 5 0.7069371 0.7267068 0.5935506
## 7 0.7111725 0.7438599 0.5825244
## 9 0.7150697 0.7600292 0.5735652
## 11 0.7176105 0.7689184 0.5653230
## 13 0.7188659 0.7763688 0.5599450
## 15 0.7196457 0.7812484 0.5542553
## 17 0.7198720 0.7859155 0.5549256
## 19 0.7207763 0.7881274 0.5490135
## 21 0.7201811 0.7908104 0.5466847
## 23 0.7196569 0.7920513 0.5412158
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.
plot(knnFit)
densityplot(knnFit,pch='|')
predict(knnFit, type = 'prob') -> train.Probs
histogram(~Y+N, train.Probs)
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
svmFit <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
EDUCATION +
PAY_PC2 +
PAY_PC3 +
AMT_PC2 +
AMT_PC7,
train.imp,
method = 'svmRadial',
trControl = ctrl,
tuneGrid = expand.grid(C=c(0.05,0.1,0.2,0.3), sigma=c(0.001,0.005,0.01,0.015))
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(svmFit, file = 'svmFit.rds')
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 18479 samples
## 9 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16631, 16631, 16631, 16632, 16631, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## C sigma ROC Sens Spec
## 0.05 0.001 0.7265791 0.5840472 0.7300963
## 0.05 0.005 0.7305104 0.6621624 0.6605688
## 0.05 0.010 0.7345025 0.6683700 0.6599393
## 0.05 0.015 0.7379832 0.6864644 0.6537131
## 0.10 0.001 0.7276004 0.6501473 0.6671989
## 0.10 0.005 0.7324205 0.6674142 0.6584644
## 0.10 0.010 0.7380494 0.6955119 0.6481600
## 0.10 0.015 0.7425362 0.7190735 0.6387507
## 0.20 0.001 0.7280246 0.6516168 0.6676480
## 0.20 0.005 0.7352825 0.6875067 0.6508901
## 0.20 0.010 0.7424370 0.7232112 0.6359290
## 0.20 0.015 0.7472045 0.7412357 0.6285832
## 0.30 0.001 0.7285948 0.6544702 0.6644677
## 0.30 0.005 0.7372719 0.6984234 0.6451551
## 0.30 0.010 0.7448928 0.7349714 0.6312694
## 0.30 0.015 0.7500729 0.7465300 0.6263879
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.015 and C = 0.3.
plot(svmFit)
densityplot(svmFit, pch='|')
predict(svmFit, type = 'prob') -> train.Probs
histogram(~Y+N, train.Probs)
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
RFFit <- train(
default~
PAY_PC1 +
LIMIT_BAL +
AGE +
AMT_PC1 +
AMT_PC2 +
AMT_PC3 +
AMT_PC4 +
AMT_PC5 +
AMT_PC6 +
AMT_PC7 +
PAY_PC2 +
PAY_PC3,
train.imp,
method = 'rf',
trControl = ctrl
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(RFFit, file = 'RFFit.rds')
RFFit
## Random Forest
##
## 18479 samples
## 12 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16630, 16631, 16631, 16631, 16631, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7689348 0.7414769 0.6753137
## 7 0.7732522 0.7488111 0.6704772
## 12 0.7719981 0.7486116 0.6686388
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
plot(RFFit)
densityplot(RFFit,pch='|')
plot(varImp(RFFit),15,main='RF - Variable Importance')
predict(RFFit, type = 'prob') -> train.Probs
histogram(~Y+N, train.Probs)
# compare resampled performance across the five models
re <-
resamples(x = list(
xgb = xgbFitD,
knn = knnFit,
elastinet = glmnetFit,
svm = svmFit,
rf = RFFit
))
dotplot(re)
bwplot(re)
difValues <- diff(re)
dotplot(difValues)
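caret also provides a tabular summary of these pairwise differences, which complements the dotplot:
summary(difValues) # estimated differences (upper diagonal) and p-values (lower diagonal)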
test.imp
## # A tibble: 4,619 x 16
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3
## <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 20000 Fema~ Uni Married 24 0.477 -3.22 0.145
## 2 50000 Fema~ Uni Married 37 -0.393 0.176 0.00489
## 3 50000 Male Grad_Sch~ Single 37 -0.393 0.176 0.00489
## 4 140000 Fema~ High_Sch~ Married 28 -1.10 0.00576 -0.948
## 5 250000 Male Grad_Sch~ Single 29 -0.393 0.176 0.00489
## 6 50000 Male High_Sch~ Single 23 -0.393 0.176 0.00489
## 7 50000 Fema~ High_Sch~ Married 47 1.71 0.268 -0.140
## 8 230000 Fema~ Grad_Sch~ Single 27 1.71 0.268 -0.140
## 9 500000 Male Grad_Sch~ Married 58 3.81 0.361 -0.284
## 10 50000 Male Grad_Sch~ Single 25 2.18 -1.82 0.560
## # ... with 4,609 more rows, and 8 more variables: AMT_PC1 <dbl>,
## # AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>, AMT_PC5 <dbl>,
## # AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>
summary(test.imp)
## LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 0 Male :1849 Grad_School:1687 Married:2104
## 1st Qu.: 50000 Female:2770 Uni :2145 Single :2454
## Median :140000 High_School: 729 Other : 61
## Mean :168275 Other : 58
## 3rd Qu.:240000
## Max. :750000
## AGE PAY_PC1 PAY_PC2
## Min. :21.00 Min. :-11.85168 Min. :-4.42243
## 1st Qu.:28.00 1st Qu.: -0.39331 1st Qu.:-0.26901
## Median :34.00 Median : -0.39331 Median : 0.17555
## Mean :35.43 Mean : 0.01748 Mean :-0.01418
## 3rd Qu.:42.00 3rd Qu.: 1.36957 3rd Qu.: 0.26834
## Max. :69.00 Max. : 3.81335 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-2.705913 Min. :-1.970628 Min. :-3.98807
## 1st Qu.:-0.283941 1st Qu.:-1.531260 1st Qu.:-0.43463
## Median : 0.004886 Median :-0.898807 Median :-0.20994
## Mean : 0.011796 Mean :-0.005837 Mean :-0.01761
## 3rd Qu.: 0.110511 3rd Qu.: 0.448394 3rd Qu.: 0.05934
## Max. : 3.192124 Max. :19.598795 Max. :38.06379
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-29.377842 Min. :-15.759742 Min. :-25.10246
## 1st Qu.: -0.136882 1st Qu.: -0.064640 1st Qu.: -0.07890
## Median : -0.070445 Median : 0.018313 Median : -0.03200
## Mean : -0.016652 Mean : -0.004938 Mean : 0.00689
## 3rd Qu.: 0.006755 3rd Qu.: 0.080747 3rd Qu.: 0.02671
## Max. : 15.310976 Max. : 10.520814 Max. : 13.34686
## AMT_PC6 AMT_PC7 default
## Min. :-17.599251 Min. :-15.64263 N:3503
## 1st Qu.: -0.040710 1st Qu.: -0.09164 Y:1116
## Median : -0.002781 Median : -0.04212
## Mean : 0.000511 Mean : -0.01809
## 3rd Qu.: 0.065534 3rd Qu.: 0.02653
## Max. : 15.582944 Max. : 13.27408
map_int(test.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 0 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
test.imp <- test.imp[complete.cases(test.imp), ]
xgbPred <- predict(object = xgbFitD, newdata = test.imp)
elastinetPred <- predict(object = glmnetFit, newdata = test.imp)
knnPred <- predict(object = knnFit, newdata = test.imp)
svmPred <- predict(object = svmFit, newdata = test.imp)
rfPred <- predict(object = RFFit, newdata = test.imp)
xtab <- table(xgbPred, test.imp$default)
xgbCM <- caret::confusionMatrix(xtab)
xtab <- table(elastinetPred, test.imp$default)
elastinetCM <- caret::confusionMatrix(xtab)
xtab <- table(knnPred, test.imp$default)
knnCM <- caret::confusionMatrix(xtab)
xtab <- table(svmPred, test.imp$default)
svmCM <- caret::confusionMatrix(xtab)
xtab <- table(rfPred, test.imp$default)
rfCM <- caret::confusionMatrix(xtab)
CM_list <- list(xgbCM, elastinetCM, knnCM, svmCM, rfCM)
compiled_results <- tibble(
models = c('xgb','elastinet','knn','svm', 'rf'),
accuracy = map_dbl(CM_list,~.x$overall[1]),
kappa = map_dbl(CM_list,~.x$overall[2]),
sensitivity = map_dbl(CM_list,~.x$byClass[1]),
specificity = map_dbl(CM_list,~.x$byClass[2]),
F1 = map_dbl(CM_list,~.x$byClass[7])
)
compiled_results %>% arrange(accuracy,kappa)
## # A tibble: 5 x 6
## models accuracy kappa sensitivity specificity F1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 elastinet 0.659 0.260 0.656 0.670 0.745
## 2 svm 0.731 0.351 0.761 0.636 0.811
## 3 rf 0.736 0.379 0.752 0.687 0.812
## 4 knn 0.738 0.336 0.790 0.572 0.820
## 5 xgb 0.767 0.433 0.788 0.699 0.837
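ROCR is loaded at the top but unused above; a sketch of an ROC/AUC check for the strongest model (xgb) on the hold-out set, assuming the fitted objects are still in scope:
xgb.test.probs <- predict(xgbFitD, newdata = test.imp, type = 'prob')
rocr.pred <- prediction(xgb.test.probs$Y, test.imp$default, label.ordering = c('N', 'Y'))
plot(performance(rocr.pred, 'tpr', 'fpr'), main = 'ROC - xgb (hold-out)')
unlist(performance(rocr.pred, 'auc')@y.values) # area under the ROC curve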
library(ggrepel)
dotplot(reorder(models,accuracy)~accuracy,compiled_results, main = 'Accuracy (Test Set Performance)')
ggplot(compiled_results, aes(F1, kappa)) +
  geom_point(color = 'blue', shape = 1) +
  geom_text_repel(aes(label = models),
                  box.padding = unit(1, 'lines'),
                  max.iter = 1e2, segment.size = .3,
                  force = 1) +
  theme_bw() +
  labs(x = 'F1', y = 'Kappa', title = 'Kappa vs F1 (Test Set Performance)')
Finally, the prediction set is imported and scored (the chunk itself is not echoed; its output follows).
## Parsed with column specification:
## cols(
## ID = col_integer(),
## LIMIT_BAL = col_integer(),
## SEX = col_integer(),
## EDUCATION = col_integer(),
## MARRIAGE = col_integer(),
## AGE = col_integer(),
## PAY_PC1 = col_double(),
## PAY_PC2 = col_double(),
## PAY_PC3 = col_double(),
## AMT_PC1 = col_double(),
## AMT_PC2 = col_double(),
## AMT_PC3 = col_double(),
## AMT_PC4 = col_double(),
## AMT_PC5 = col_double(),
## AMT_PC6 = col_double(),
## AMT_PC7 = col_double()
## )
## ID LIMIT_BAL SEX EDUCATION
## Min. : 5 Min. : 10000 Male :2641 Grad_School:2393
## 1st Qu.: 7532 1st Qu.: 50000 Female:4258 Uni :3306
## Median :15062 Median :140000 High_School:1096
## Mean :15065 Mean :166217 Other : 104
## 3rd Qu.:22624 3rd Qu.:230000
## Max. :29989 Max. :800000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Married:3149 Min. :21.00 Min. :-13.302028 Min. :-4.110567
## Single :3660 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.210711
## Other : 90 Median :34.00 Median : -0.393308 Median : 0.175555
## Mean :35.47 Mean : 0.005547 Mean : 0.005926
## 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.361123
## Max. :79.00 Max. : 3.813348 Max. : 4.967609
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.341166 Min. :-2.06507 Min. :-3.535521
## 1st Qu.:-0.283941 1st Qu.:-1.50940 1st Qu.:-0.424712
## Median : 0.004886 Median :-0.85844 Median :-0.211871
## Mean :-0.002183 Mean :-0.01544 Mean :-0.004575
## 3rd Qu.: 0.057637 3rd Qu.: 0.48720 3rd Qu.: 0.067081
## Max. : 3.176797 Max. :19.26397 Max. :28.783658
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-8.19007 Min. :-19.17146 Min. :-24.108569
## 1st Qu.:-0.13215 1st Qu.: -0.06678 1st Qu.: -0.082701
## Median :-0.07044 Median : 0.01664 Median : -0.032000
## Mean :-0.01284 Mean : -0.01546 Mean : -0.004962
## 3rd Qu.:-0.00122 3rd Qu.: 0.07733 3rd Qu.: 0.020652
## Max. :17.88261 Max. : 15.07758 Max. : 12.649943
## AMT_PC6 AMT_PC7
## Min. :-19.077508 Min. :-25.90403
## 1st Qu.: -0.040660 1st Qu.: -0.09017
## Median : -0.001886 Median : -0.04039
## Mean : 0.006764 Mean : 0.01371
## 3rd Qu.: 0.069230 3rd Qu.: 0.02954
## Max. : 14.222570 Max. : 18.18085
## # A tibble: 6,899 x 2
## ID default
## <int> <chr>
## 1 5 1
## 2 17 1
## 3 19 0
## 4 23 1
## 5 27 0
## 6 30 0
## 7 32 1
## 8 34 0
## 9 37 0
## 10 38 0
## # ... with 6,889 more rows
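The scoring chunk itself is not shown above; a hypothetical reconstruction, assuming the prediction set receives the same recoding as the training data, the xgb fit does the scoring, and Y/N is mapped to 1/0 for submission (the output file name is illustrative):
predict.clean <- predict.raw %>%
  mutate(EDUCATION = replace(EDUCATION, EDUCATION %in% c(0, 5, 6), 4),
         MARRIAGE  = replace(MARRIAGE, MARRIAGE == 0, 3),
         SEX       = factor(SEX, levels = 1:2, labels = c('Male', 'Female')),
         EDUCATION = factor(EDUCATION, levels = 1:4,
                            labels = c('Grad_School', 'Uni', 'High_School', 'Other')),
         MARRIAGE  = factor(MARRIAGE, levels = 1:3,
                            labels = c('Married', 'Single', 'Other')),
         AGE       = as.numeric(AGE))
submission <- tibble(ID = predict.clean$ID,
                     default = ifelse(predict(xgbFitD, newdata = predict.clean) == 'Y', '1', '0'))
write_csv(submission, 'AT2_credit_predictions.csv') # hypothetical file name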