Objectives

  1. Develop and deploy a classification model on a credit default data set.
  2. Perform an end-to-end analysis using R.
  3. Learn the caret package for ML.
  4. Learn to present the case using R Markdown.

Read in the dataset

library(tidyverse, quietly = T)
library(dplyr, quietly = T)      #used by Caret
library(ggplot2, quietly = T)
library(corrplot, quietly =T)
library(caret, quietly = T)
library(Amelia, quietly = T)
library(gridExtra, quietly = T)

setwd("~/Documents/brookstruong")

train.raw <- read_csv("AT2_credit_train_STUDENT.csv")
test.raw <- read_csv("AT2_credit_test_STUDENT.csv")

The training set has 23,101 observations across 17 variables.

glimpse(train.raw)
## Observations: 23,101
## Variables: 17
## $ ID        <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ LIMIT_BAL <dbl> 20000, 120000, 90000, 50000, 50000, 500000, 100000, ...
## $ SEX       <int> 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2...
## $ EDUCATION <int> 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 3...
## $ MARRIAGE  <int> 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2...
## $ AGE       <int> 24, 26, 34, 37, 37, 29, 23, 28, 35, 34, 51, 41, 30, ...
## $ PAY_PC1   <dbl> 0.47746305, -1.46161240, -0.39330764, -0.39330764, -...
## $ PAY_PC2   <dbl> -3.224589961, 0.853866590, 0.175554996, 0.175554996,...
## $ PAY_PC3   <dbl> 0.145040802, -0.360863449, 0.004885522, 0.004885522,...
## $ AMT_PC1   <dbl> -1.7522077, -1.6633432, -1.1348380, -0.3971748, -0.3...
## $ AMT_PC2   <dbl> -0.22434761, -0.14388558, -0.17660872, -0.45109777, ...
## $ AMT_PC3   <dbl> -0.077841055, -0.054600404, 0.015954854, -0.09978950...
## $ AMT_PC4   <dbl> 0.006957244, -0.002851947, -0.129071306, -0.03533896...
## $ AMT_PC5   <dbl> -0.041356958, 0.043889122, 0.098245278, -0.055306582...
## $ AMT_PC6   <dbl> 0.0008865935, -0.0261897987, -0.0223825102, 0.050465...
## $ AMT_PC7   <dbl> -0.05626505, -0.09997756, -0.06898686, -0.02820475, ...
## $ default   <chr> "Y", "Y", "N", "N", "N", "N", "N", "Y", "N", "N", "N...
train.raw$default <- as.factor(train.raw$default)
train.raw$SEX <- as.factor(train.raw$SEX)
train.raw$EDUCATION <- as.factor(train.raw$EDUCATION)
train.raw$MARRIAGE <- as.factor(train.raw$MARRIAGE)
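
The same conversions can be written more compactly with a single mutate; the sketch below is equivalent to the four assignments above (across() assumes dplyr >= 1.0.0; with older versions mutate_at() does the same job). The scoring set read in earlier would eventually need the same treatment for its categorical columns.

# Sketch: convert the coded columns to factors in one step.
train.raw <- train.raw %>%
  mutate(across(c(SEX, EDUCATION, MARRIAGE, default), as.factor))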

summary(train.raw)
##        ID          LIMIT_BAL         SEX        EDUCATION MARRIAGE 
##  Min.   :    1   Min.   :    -99   1   : 9244   0:   11   0:   38  
##  1st Qu.: 7489   1st Qu.:  50000   2   :13854   1: 8192   1:10510  
##  Median :14987   Median : 140000   NA's:    3   2:10724   2:12304  
##  Mean   :14981   Mean   : 167524                3: 3821   3:  249  
##  3rd Qu.:22452   3rd Qu.: 240000                4:   88            
##  Max.   :30000   Max.   :1000000                5:  229            
##                                                 6:   36            
##       AGE           PAY_PC1              PAY_PC2        
##  Min.   : 21.0   Min.   :-11.859675   Min.   :-4.42243  
##  1st Qu.: 28.0   1st Qu.: -0.393308   1st Qu.:-0.23617  
##  Median : 34.0   Median : -0.393308   Median : 0.17555  
##  Mean   : 35.7   Mean   : -0.001656   Mean   :-0.00177  
##  3rd Qu.: 41.0   3rd Qu.:  1.360047   3rd Qu.: 0.36112  
##  Max.   :141.0   Max.   :  3.813348   Max.   : 5.44103  
##                                                         
##     PAY_PC3             AMT_PC1            AMT_PC2        
##  Min.   :-3.864638   Min.   :-3.41080   Min.   :-4.71769  
##  1st Qu.:-0.283941   1st Qu.:-1.50827   1st Qu.:-0.42961  
##  Median : 0.004886   Median :-0.86433   Median :-0.20780  
##  Mean   : 0.000652   Mean   : 0.00461   Mean   : 0.00137  
##  3rd Qu.: 0.093942   3rd Qu.: 0.49766   3rd Qu.: 0.09062  
##  Max.   : 3.364030   Max.   :37.49240   Max.   :83.52137  
##                                                           
##     AMT_PC3             AMT_PC4              AMT_PC5         
##  Min.   :-38.46500   Min.   :-21.593416   Min.   :-42.37665  
##  1st Qu.: -0.13710   1st Qu.: -0.068199   1st Qu.: -0.08239  
##  Median : -0.07044   Median :  0.018389   Median : -0.03200  
##  Mean   :  0.00383   Mean   :  0.004618   Mean   :  0.00148  
##  3rd Qu.:  0.00325   3rd Qu.:  0.083236   3rd Qu.:  0.02644  
##  Max.   : 21.98483   Max.   : 21.823749   Max.   : 17.43097  
##                                                              
##     AMT_PC6             AMT_PC7          default  
##  Min.   :-38.88504   Min.   :-41.71546   N:17518  
##  1st Qu.: -0.04241   1st Qu.: -0.09273   Y: 5583  
##  Median : -0.00216   Median : -0.04099            
##  Mean   : -0.00202   Mean   : -0.00409            
##  3rd Qu.:  0.06754   3rd Qu.:  0.03157            
##  Max.   : 20.22670   Max.   : 22.92727            
## 

Looking at how read_csv imported the data, default comes in as a character string and SEX, EDUCATION and MARRIAGE as integers rather than factors, which is why they are converted to factors above.

train.raw %>%
  ggplot(aes(x = AGE)) +
  geom_histogram()   # note the long right tail: a handful of ages above 100
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


Missing data: 3 NA values in SEX.

cplot <- train.raw %>%
  select(-default) %>%
  select_if(is.numeric)

# Pairwise correlations and significance tests on the numeric predictors
M <- cor(cplot)
p.mat <- cor.mtest(cplot)$p   # cor.mtest expects the data, not the correlation matrix
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M,
         method = "color",
         order = "hclust",
         type = "full",
         col = col(200),
         diag = FALSE,
         title = "Correlation of Numeric Variables",
         addCoef.col = "black",
         p.mat = p.mat,        # needed for sig.level/insig to take effect
         sig.level = 0.05,
         insig = "blank",
         mar = c(0, 0, 3, 0))
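
Because the PAY_* and AMT_* columns are principal components, high pairwise correlations are not expected, but caret's findCorrelation can confirm that nothing needs to be dropped; a minimal sketch:

# Sketch: flag predictors whose pairwise |correlation| exceeds 0.8 (none expected here).
findCorrelation(M, cutoff = 0.8, names = TRUE)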

# Returns zero rows: the missing values are real NAs, not the string "NA".
# filter(is.na(SEX)) finds the actual incomplete rows.
train.raw %>%
  filter(SEX == "NA")
## # A tibble: 0 x 17
## # ... with 17 variables: ID <int>, LIMIT_BAL <dbl>, SEX <fct>,
## #   EDUCATION <fct>, MARRIAGE <fct>, AGE <int>, PAY_PC1 <dbl>,
## #   PAY_PC2 <dbl>, PAY_PC3 <dbl>, AMT_PC1 <dbl>, AMT_PC2 <dbl>,
## #   AMT_PC3 <dbl>, AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>,
## #   AMT_PC7 <dbl>, default <fct>

Train-Test Split

set.seed(42)
training_rows <- createDataPartition(y = train.raw$default, p = 0.7, list = FALSE)[, 1]

# Note: this reuses the name test.raw, overwriting the unlabelled scoring set
# read in at the start; it is a 30% validation split of the labelled data.
test.raw <- train.raw[-training_rows, ]
train.raw <- train.raw[training_rows, ]
dim(train.raw)
## [1] 16172    17
dim(test.raw)
## [1] 6929   17
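
createDataPartition samples within each class, so the split should preserve the N/Y proportions on both sides; a quick sketch to confirm:

round(prop.table(table(train.raw$default)), 2)
round(prop.table(table(test.raw$default)), 2)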

Missing values analysis

missmap(train.raw, main='Missing Values Analysis using Amelia ordered by % missing', col=c('red', 'gray'), legend =F, rank.order = T)

map_int(train.raw,~sum(is.na(.x)))
##        ID LIMIT_BAL       SEX EDUCATION  MARRIAGE       AGE   PAY_PC1 
##         0         0         2         0         0         0         0 
##   PAY_PC2   PAY_PC3   AMT_PC1   AMT_PC2   AMT_PC3   AMT_PC4   AMT_PC5 
##         0         0         0         0         0         0         0 
##   AMT_PC6   AMT_PC7   default 
##         0         0         0

SEX is missing 2 values in the training split (3 across the full data set).

EDA

default is the response variable. It is a Yes/No binary variable and is therefore appropriate for a classification problem.

round(prop.table(table(train.raw$default)),2)
## 
##    N    Y 
## 0.76 0.24

76% of customers in the dataset did not default and 24% defaulted, so the classes are imbalanced roughly 3:1.
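
With a roughly 3:1 split, a model that always predicts N already achieves about 76% accuracy, so accuracy alone is a weak benchmark. A quick sketch of that no-information baseline:

# Sketch: baseline accuracy from always predicting the majority class "N".
baseline_pred <- factor(rep("N", nrow(train.raw)), levels = levels(train.raw$default))
mean(baseline_pred == train.raw$default)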


Predictor Variables

Univariate & Bivariate

The first step is to look at each available variable, using ggplot2 for the visuals.

Continuous Variables

  1. LIMIT_BAL begins at -99 (a clearly invalid credit limit) with a median of 140,000, a mean of roughly 167,880 and a maximum of 1,000,000.
  2. AGE shows a number of outliers beyond 100. It begins at 21 with a median of 34, a mean of 35.65 and a maximum of 141 (a possible treatment of these outliers is sketched after the first pair of histograms).
p1 <- ggplot(data=train.raw, aes(x=LIMIT_BAL)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p2 <- ggplot(data=train.raw, aes(x=AGE)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
grid.arrange(p1,p2, nrow=1)
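
As flagged in the AGE point above, values beyond about 100 look like data-entry errors rather than genuine customers. One hedged option, not applied in the rest of this analysis, is to winsorise (cap) the extreme ages; train.capped below is illustrative only:

# Sketch: cap implausible ages at the 99th percentile; an alternative is to
# set them to NA and impute.
age_cap <- quantile(train.raw$AGE, 0.99, na.rm = TRUE)
train.capped <- train.raw %>%
  mutate(AGE = pmin(AGE, age_cap))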

p3 <- ggplot(data=train.raw, aes(x=PAY_PC1)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p4 <- ggplot(data=train.raw, aes(x=PAY_PC2)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p5 <- ggplot(data=train.raw, aes(x=PAY_PC3)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
grid.arrange(p3,p4,p5, nrow=1)

p6 <- ggplot(data=train.raw, aes(x=AMT_PC1)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p7 <- ggplot(data=train.raw, aes(x=AMT_PC2)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p8 <- ggplot(data=train.raw, aes(x=AMT_PC3)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p9 <- ggplot(data=train.raw, aes(x=AMT_PC4)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p10 <- ggplot(data=train.raw, aes(x=AMT_PC5)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p11 <- ggplot(data=train.raw, aes(x=AMT_PC6)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p12 <- ggplot(data=train.raw, aes(x=AMT_PC7)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
grid.arrange(p6,p7,p8,p9,p10,p11,p12, nrow=2)
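
The seven AMT_PC chunks above repeat the same pattern; a purely cosmetic sketch that builds the same plots in a loop (output unchanged):

# Sketch: generate the same default-coloured histograms programmatically.
amt_vars <- paste0("AMT_PC", 1:7)
amt_plots <- lapply(amt_vars, function(v)
  ggplot(train.raw, aes_string(x = v, fill = "default")) +
    geom_histogram(bins = 40) +
    coord_flip())
grid.arrange(grobs = amt_plots, nrow = 2)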

Categorical Variables

  1. SEX has two coded levels (1 and 2), with level 2 the more common; a handful of values are missing.
  2. EDUCATION is coded 0 to 6. Most observations fall in levels 1 to 3, with small counts in the 0, 4, 5 and 6 categories (likely "other"/"unknown" codes).
  3. MARRIAGE is coded 0 to 3, with almost all observations in levels 1 and 2 and only small 0 and 3 groups.
# Helper: extract the legend grob from a ggplot so it can be placed separately.
get_legend <- function(myggplot){
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}
p <- lapply(X = c('SEX', 'EDUCATION', 'MARRIAGE'),
            FUN = function(x) ggplot(data = train.raw) +
              aes_string(x=x, fill = 'default') +
              geom_bar(position="dodge") +
              theme(legend.position="none"))
legend <- get_legend(ggplot(data = train.raw, aes(x=SEX, fill = default)) +
                       geom_bar())

grid.arrange(p[[1]],p[[2]],p[[3]],
             legend, layout_matrix = cbind(c(1,2,3),
                                           c(4,5,3),
                                           c(6,6,6)),
             widths=c(3,3,1))
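
The bar charts show raw counts per level; the default rate within each level is often more telling. A sketch of that summary for EDUCATION (the same pattern applies to SEX and MARRIAGE):

# Sketch: proportion of defaulters within each EDUCATION level.
train.raw %>%
  group_by(EDUCATION) %>%
  summarise(n = n(),
            default_rate = mean(default == "Y"))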


Data Preparation

Missing Values Imputation

summary(train.raw$SEX)
##    1    2 NA's 
## 6461 9709    2
train.imp <- train.raw
# Only 2 SEX values are missing in the training split. Here they are kept as
# their own level ("3"); the level must be added to the factor first, otherwise
# the assignment below would produce NA with a warning.
levels(train.imp$SEX) <- c(levels(train.imp$SEX), "3")
train.imp$SEX[is.na(train.imp$SEX)] <- "3"
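
Since only two rows are affected, two other reasonable options are to impute with the majority level or to drop the rows entirely. A hedged sketch of both (train.mode and train.drop are illustrative names and are not used later):

# Option A (sketch): impute the missing SEX values with the most common level, "2".
train.mode <- train.raw
train.mode$SEX[is.na(train.mode$SEX)] <- "2"

# Option B (sketch): drop the two incomplete rows.
train.drop <- train.raw %>% filter(!is.na(SEX))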

Modeling

Three model families are considered:

  1. xgboost (extreme gradient boosting),
  2. glmnet (penalised logistic regression),
  3. avNNet (a model-averaged neural network).

The xgboost fit is specified in the next section; brief sketches of the other two follow it.

Extreme Gradient Boosting

ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     verboseIter = FALSE,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,  # report ROC, sensitivity, specificity
                     # sampling = 'down',
                     savePredictions = TRUE)

xgbGrid <- expand.grid(nrounds = seq(14, 24, 2),
                       max_depth = seq(2, 8, 2),
                       eta = c(0.1, 0.2, 0.3),
                       gamma = 1,
                       colsample_bytree = 1,
                       min_child_weight = 1,
                       subsample = 1)

# Note: default ~ . includes ID as a predictor; dropping it first may be preferable.
xgbFit <- train(default ~ .,
                data = train.imp,
                method = "xgbTree",
                trControl = ctrl,
                metric = "ROC",     # optimise area under the ROC curve
                tuneGrid = xgbGrid)
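
The other two candidates can reuse the same ctrl object, and any fitted model can then be assessed on the 30% hold-out split. A minimal sketch, assuming train.imp from the imputation step and test.raw from the split above (the hold-out split would need the same SEX treatment applied before predicting):

# Sketch: penalised logistic regression and model-averaged neural network,
# reusing the trainControl object from the xgboost fit.
glmnetFit <- train(default ~ ., data = train.imp,
                   method = "glmnet",
                   trControl = ctrl,
                   metric = "ROC",
                   preProcess = c("center", "scale"))

avnnFit <- train(default ~ ., data = train.imp,
                 method = "avNNet",
                 trControl = ctrl,
                 metric = "ROC",
                 preProcess = c("center", "scale"),
                 trace = FALSE)

# Sketch: out-of-sample performance on the 30% hold-out split.
xgb_preds <- predict(xgbFit, newdata = test.raw)
confusionMatrix(xgb_preds, test.raw$default, positive = "Y")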
