Objectives

  1. Develop and deploy a classification model an a product purchase data set.
  2. End to end analysis using R
  3. Learn the caret package for ML
  4. Learn to present the case using Rmarkdown

Read in the dataset

library(tidyverse, quietly = T)
library(dplyr, quietly = T)      #used by Caret
library(ggplot2, quietly = T)
library(corrplot, quietly =T)
library(caret, quietly = T)
library(Amelia, quietly = T)
library(gridExtra, quietly = T)
library(xgboost, quietly = T)
library(ROCR, quietly = T)
library(Matrix, quietly = T)
library(rpart.plot, quietly =T)

#setwd("~/Documents/brookstruong")

#Import the data
training.raw <- read_csv('AT2_credit_train_STUDENT.csv')
predict.raw <- read_csv('AT2_credit_test_STUDENT.csv')

23,101 observations over 17 variables for the training/test split 6899 observations over 17 variables for the prediction set.

Clean the data based on new observations to make train and test the same

glimpse(training.raw)
## Observations: 23,101
## Variables: 17
## $ ID        <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ LIMIT_BAL <dbl> 20000, 120000, 90000, 50000, 50000, 500000, 100000, ...
## $ SEX       <int> 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2...
## $ EDUCATION <int> 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 3...
## $ MARRIAGE  <int> 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2...
## $ AGE       <int> 24, 26, 34, 37, 37, 29, 23, 28, 35, 34, 51, 41, 30, ...
## $ PAY_PC1   <dbl> 0.47746305, -1.46161240, -0.39330764, -0.39330764, -...
## $ PAY_PC2   <dbl> -3.224589961, 0.853866590, 0.175554996, 0.175554996,...
## $ PAY_PC3   <dbl> 0.145040802, -0.360863449, 0.004885522, 0.004885522,...
## $ AMT_PC1   <dbl> -1.7522077, -1.6633432, -1.1348380, -0.3971748, -0.3...
## $ AMT_PC2   <dbl> -0.22434761, -0.14388558, -0.17660872, -0.45109777, ...
## $ AMT_PC3   <dbl> -0.077841055, -0.054600404, 0.015954854, -0.09978950...
## $ AMT_PC4   <dbl> 0.006957244, -0.002851947, -0.129071306, -0.03533896...
## $ AMT_PC5   <dbl> -0.041356958, 0.043889122, 0.098245278, -0.055306582...
## $ AMT_PC6   <dbl> 0.0008865935, -0.0261897987, -0.0223825102, 0.050465...
## $ AMT_PC7   <dbl> -0.05626505, -0.09997756, -0.06898686, -0.02820475, ...
## $ default   <chr> "Y", "Y", "N", "N", "N", "N", "N", "Y", "N", "N", "N...
training.raw$default <- as.factor(training.raw$default)
training.raw$SEX <- as.factor(training.raw$SEX)
training.raw$MARRIAGE <- as.factor(training.raw$MARRIAGE)
training.raw$AGE <- as.numeric(training.raw$AGE)
#remove random anomilies

#remove NA - 2 rows from the sex being cat/dog
training.raw <- training.raw[complete.cases(training.raw), ]

#set negative balance values to 0. Max Limit_bal for test set is 760K and validation set for 800k. Lets set these at median value.
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL <= 0] <- 0
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL >800000] <-140000 


#all education factors greater than 4, set them to 4, also set 0 to 4
training.raw$EDUCATION[training.raw$EDUCATION == 5] <- 4
training.raw$EDUCATION[training.raw$EDUCATION == 6] <- 4
training.raw$EDUCATION[training.raw$EDUCATION == 0] <- 4
training.raw$EDUCATION <- factor(training.raw$EDUCATION)

#clean marriage
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE)
training.raw$MARRIAGE[training.raw$MARRIAGE == 0] <- 3

#lets label our factors properly
training.raw$SEX <- factor(training.raw$SEX,
                           levels =c('1', '2'),
                           labels =c('Male', 'Female'))

training.raw$EDUCATION[training.raw$EDUCATION == 2] <- 4
training.raw$EDUCATION <- factor(training.raw$EDUCATION,
                                 levels =c('1', '2', '3', '4'),
                                 labels = c('Grad_School', 'Uni', 'High_School', 'Other'))
training.raw$EDUCATION <- as.factor(training.raw$EDUCATION)
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE,
                                levels = c('1','2','3'),
                                labels = c('Married', 'Single', 'Other'))

summary(training.raw)
##        ID          LIMIT_BAL          SEX              EDUCATION    
##  Min.   :    1   Min.   :     0   Male  : 9244   Grad_School: 8192  
##  1st Qu.: 7488   1st Qu.: 50000   Female:13854   Uni        :    0  
##  Median :14989   Median :140000                  High_School: 3820  
##  Mean   :14982   Mean   :167479                  Other      :11086  
##  3rd Qu.:22453   3rd Qu.:240000                                     
##  Max.   :30000   Max.   :780000                                     
##     MARRIAGE          AGE           PAY_PC1              PAY_PC2        
##  Married:10507   Min.   : 21.0   Min.   :-11.859675   Min.   :-4.42243  
##  Single :12304   1st Qu.: 28.0   1st Qu.: -0.393308   1st Qu.:-0.23617  
##  Other  :  287   Median : 34.0   Median : -0.393308   Median : 0.17555  
##                  Mean   : 35.7   Mean   : -0.001513   Mean   :-0.00187  
##                  3rd Qu.: 41.0   3rd Qu.:  1.360047   3rd Qu.: 0.36042  
##                  Max.   :141.0   Max.   :  3.813348   Max.   : 5.44103  
##     PAY_PC3             AMT_PC1            AMT_PC2        
##  Min.   :-3.864638   Min.   :-3.41080   Min.   :-4.71769  
##  1st Qu.:-0.283941   1st Qu.:-1.50825   1st Qu.:-0.42972  
##  Median : 0.004886   Median :-0.86429   Median :-0.20781  
##  Mean   : 0.000605   Mean   : 0.00431   Mean   : 0.00124  
##  3rd Qu.: 0.093942   3rd Qu.: 0.49754   3rd Qu.: 0.09062  
##  Max.   : 3.364030   Max.   :37.49240   Max.   :83.52137  
##     AMT_PC3             AMT_PC4              AMT_PC5         
##  Min.   :-38.46500   Min.   :-21.593416   Min.   :-42.37665  
##  1st Qu.: -0.13709   1st Qu.: -0.068216   1st Qu.: -0.08239  
##  Median : -0.07044   Median :  0.018389   Median : -0.03200  
##  Mean   :  0.00396   Mean   :  0.004458   Mean   :  0.00144  
##  3rd Qu.:  0.00325   3rd Qu.:  0.083214   3rd Qu.:  0.02637  
##  Max.   : 21.98483   Max.   : 21.823749   Max.   : 17.43097  
##     AMT_PC6             AMT_PC7          default  
##  Min.   :-38.88504   Min.   :-41.71546   N:17518  
##  1st Qu.: -0.04241   1st Qu.: -0.09262   Y: 5580  
##  Median : -0.00216   Median : -0.04099            
##  Mean   : -0.00159   Mean   : -0.00406            
##  3rd Qu.:  0.06754   3rd Qu.:  0.03157            
##  Max.   : 20.22670   Max.   : 22.92727
cplot <- training.raw %>%
  select(-default) %>%
  select_if(is.numeric)

M <- cor(cplot)
p.mat <- cor.mtest(M)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M, 
         method = "color",
         order= "hclust",
         type="full",
         col=col(200),
         diag =F,
         title="Correlation of Numeric Variables",
         addCoef.col = "black",
         sig.level = 0.05,
         insig ="blank",
         mar=c(0,0,3,0))

Fix age imputation

training.raw$AGE <- ifelse(training.raw$AGE >=100, NA, training.raw$AGE)

map_int(training.raw,~sum(is.na(.x)))
##        ID LIMIT_BAL       SEX EDUCATION  MARRIAGE       AGE   PAY_PC1 
##         0         0         0         0         0        50         0 
##   PAY_PC2   PAY_PC3   AMT_PC1   AMT_PC2   AMT_PC3   AMT_PC4   AMT_PC5 
##         0         0         0         0         0         0         0 
##   AMT_PC6   AMT_PC7   default 
##         0         0         0
'we can see that there is 50 age that are missing, out of the scheme of things, it is <1% of the total dataset, however lets play with using rpart to predict the missing age'
## [1] "we can see that there is 50 age that are missing, out of the scheme of things, it is <1% of the total dataset, however lets play with using rpart to predict the missing age"

Missing Values Imputation

train.imp <- training.raw[-1]

map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL       SEX EDUCATION  MARRIAGE       AGE   PAY_PC1   PAY_PC2 
##         0         0         0         0        50         0         0 
##   PAY_PC3   AMT_PC1   AMT_PC2   AMT_PC3   AMT_PC4   AMT_PC5   AMT_PC6 
##         0         0         0         0         0         0         0 
##   AMT_PC7   default 
##         0         0
head(train.imp)
## # A tibble: 6 x 16
##   LIMIT_BAL SEX   EDUCATION MARRIAGE   AGE PAY_PC1 PAY_PC2  PAY_PC3 AMT_PC1
##       <dbl> <fct> <fct>     <fct>    <dbl>   <dbl>   <dbl>    <dbl>   <dbl>
## 1     20000 Fema~ Other     Married     24   0.477  -3.22   0.145    -1.75 
## 2    120000 Fema~ Other     Single      26  -1.46    0.854 -0.361    -1.66 
## 3     90000 Fema~ Other     Single      34  -0.393   0.176  0.00489  -1.13 
## 4     50000 Fema~ Other     Married     37  -0.393   0.176  0.00489  -0.397
## 5     50000 Male  Grad_Sch~ Single      37  -0.393   0.176  0.00489  -0.393
## 6    500000 Male  Grad_Sch~ Single      29  -0.393   0.176  0.00489  15.7  
## # ... with 7 more variables: AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>,
## #   AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>

Age Imputation.

age.predictors <- train.imp %>%
  select(LIMIT_BAL, SEX, EDUCATION, MARRIAGE, AGE, PAY_PC1) %>%
  filter(complete.cases(.))
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5)
rpartGrid <- data.frame(maxdepth = seq(2,10,1))
rpartFit_ageimputation <- train (x=age.predictors[,-3],
                                 y=age.predictors$AGE,
                                 method='rpart2',
                                 trControl = ctrl,
                                 tuneGrid = rpartGrid)
## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.
rpartFit_ageimputation
## CART 
## 
## 23048 samples
##     5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 20743, 20743, 20742, 20742, 20744, 20745, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  RMSE      Rsquared   MAE     
##    2        4.019416  0.8085220  3.332353
##    3        2.770871  0.9090355  2.243507
##    4        2.450554  0.9288340  2.080836
##    5        2.130606  0.9462219  1.782894
##    6        1.804709  0.9614172  1.464249
##    7        1.456406  0.9748723  1.179393
##    8        1.456406  0.9748723  1.179393
##    9        1.456406  0.9748723  1.179393
##   10        1.456406  0.9748723  1.179393
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 7.
plot(rpartFit_ageimputation)

# Tree depth of 7 is optimal

rpart.plot(rpartFit_ageimputation$finalModel)

#save the model externally to load back in
saveRDS(rpartFit_ageimputation, file = 'rpartFit_ageimputation.rds')
#insert missing age back into the dataset
missing_age <- is.na(train.imp$AGE)
age.predicted <- predict(rpartFit_ageimputation, newdata = train.imp[missing_age,])

train.imp[missing_age, 'AGE'] <- age.predicted

summary(train.imp)
##    LIMIT_BAL          SEX              EDUCATION        MARRIAGE    
##  Min.   :     0   Male  : 9244   Grad_School: 8192   Married:10507  
##  1st Qu.: 50000   Female:13854   Uni        :    0   Single :12304  
##  Median :140000                  High_School: 3820   Other  :  287  
##  Mean   :167479                  Other      :11086                  
##  3rd Qu.:240000                                                     
##  Max.   :780000                                                     
##       AGE           PAY_PC1              PAY_PC2        
##  Min.   :21.00   Min.   :-11.859675   Min.   :-4.42243  
##  1st Qu.:28.00   1st Qu.: -0.393308   1st Qu.:-0.23617  
##  Median :34.00   Median : -0.393308   Median : 0.17555  
##  Mean   :35.48   Mean   : -0.001513   Mean   :-0.00187  
##  3rd Qu.:41.00   3rd Qu.:  1.360047   3rd Qu.: 0.36042  
##  Max.   :75.00   Max.   :  3.813348   Max.   : 5.44103  
##     PAY_PC3             AMT_PC1            AMT_PC2        
##  Min.   :-3.864638   Min.   :-3.41080   Min.   :-4.71769  
##  1st Qu.:-0.283941   1st Qu.:-1.50825   1st Qu.:-0.42972  
##  Median : 0.004886   Median :-0.86429   Median :-0.20781  
##  Mean   : 0.000605   Mean   : 0.00431   Mean   : 0.00124  
##  3rd Qu.: 0.093942   3rd Qu.: 0.49754   3rd Qu.: 0.09062  
##  Max.   : 3.364030   Max.   :37.49240   Max.   :83.52137  
##     AMT_PC3             AMT_PC4              AMT_PC5         
##  Min.   :-38.46500   Min.   :-21.593416   Min.   :-42.37665  
##  1st Qu.: -0.13709   1st Qu.: -0.068216   1st Qu.: -0.08239  
##  Median : -0.07044   Median :  0.018389   Median : -0.03200  
##  Mean   :  0.00396   Mean   :  0.004458   Mean   :  0.00144  
##  3rd Qu.:  0.00325   3rd Qu.:  0.083214   3rd Qu.:  0.02637  
##  Max.   : 21.98483   Max.   : 21.823749   Max.   : 17.43097  
##     AMT_PC6             AMT_PC7          default  
##  Min.   :-38.88504   Min.   :-41.71546   N:17518  
##  1st Qu.: -0.04241   1st Qu.: -0.09262   Y: 5580  
##  Median : -0.00216   Median : -0.04099            
##  Mean   : -0.00159   Mean   : -0.00406            
##  3rd Qu.:  0.06754   3rd Qu.:  0.03157            
##  Max.   : 20.22670   Max.   : 22.92727

Train-Test Split

set.seed(42)
training_rows <- createDataPartition(y = train.imp$default, p=0.7, list=F)

test.imp <- train.imp %>% filter(!(rownames(.) %in% training_rows))
train.imp <- train.imp %>% filter(rownames(.) %in% training_rows)
dim(train.imp)
## [1] 16169    16
## [1] 6929   16

Missing values analysis

map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL       SEX EDUCATION  MARRIAGE       AGE   PAY_PC1   PAY_PC2 
##         0         0         0         0         0         0         0 
##   PAY_PC3   AMT_PC1   AMT_PC2   AMT_PC3   AMT_PC4   AMT_PC5   AMT_PC6 
##         0         0         0         0         0         0         0 
##   AMT_PC7   default 
##         0         0

No missing values.

EDA

default is the response variable. The response variable is a Yes/No boolean variable therefor is appropriate for our classification problem.

round(prop.table(table(train.imp$default)),2)
## 
##    N    Y 
## 0.76 0.24

76% of the dataset do not default and 24% have defaulted. This is a classic imbalanced dataset. We will need to deal with this either prior to modelling or using packages to sample for us.


Predictor Variables

Univariate & Bivariate

First step is to look at all variables available using the ggplot2 framework for visuals.

Continuous Variables

  1. LIMIT_BAL The limit Balance beings at -99 with a median of 140,000, mean of 167880 and max of 1,000,000.
  2. AGE certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
p1 <- ggplot(data=train.imp, aes(x=LIMIT_BAL)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip() +
  scale_x_continuous(labels=comma)
p2 <- ggplot(data=train.imp, aes(x=AGE)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
grid.arrange(p1,p2, nrow=1)

ggplot(data=train.imp, aes(x=LIMIT_BAL, y=AGE)) +
  geom_jitter(aes(fill=default)) +
  coord_flip()

p3 <- ggplot(data=train.imp, aes(x=PAY_PC1)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p4 <- ggplot(data=train.imp, aes(x=PAY_PC2)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p5 <- ggplot(data=train.imp, aes(x=PAY_PC3)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
grid.arrange(p3,p4,p5, nrow=1)

p6 <- ggplot(data=train.imp, aes(x=AMT_PC1)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p7 <- ggplot(data=train.imp, aes(x=AMT_PC2)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p8 <- ggplot(data=train.imp, aes(x=AMT_PC3)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p9 <- ggplot(data=train.imp, aes(x=AMT_PC4)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p10 <- ggplot(data=train.imp, aes(x=AMT_PC5)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p11 <- ggplot(data=train.imp, aes(x=AMT_PC6)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
p12 <- ggplot(data=train.imp, aes(x=AMT_PC7)) +
  geom_histogram(aes(fill=default), bins = 40) +
  coord_flip()
grid.arrange(p6,p7,p8,p9,p10,p11,p12, nrow=2)

#### Categorical Variables

  1. SEX The limit Balance beings at -99 with a median of 140,000, mean of 167880 and max of 1,000,000.
  2. EDUCATION certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.
  3. MARRIAGE certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.
get_legend<-function(myggplot){
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}
p <- lapply(X = c('SEX', 'EDUCATION', 'MARRIAGE'),
            FUN = function(x) ggplot(data = train.imp) +
              aes_string(x=x, fill = 'default') +
              geom_bar(position="dodge") +
              theme(legend.position="none"))
legend <- get_legend(ggplot(data = train.imp, aes(x=SEX, fill = default)) +
                       geom_bar())

grid.arrange(p[[1]],p[[2]],p[[3]],
             legend, layout_matrix = cbind(c(1,2,3),
                                           c(4,5,3),
                                           c(6,6,6)),
             widths=c(3,3,1))


Data Preparation

Modeling

  1. xgboost,
  2. glmnet, -removed from this file
  3. Avg NN - removed from this file.

Extreme Gradient Boosting

#read in the model if it's on hand If not, follow code from 336.
xgbFitD <- readRDS('xgbFitD.rds')

#control measures
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     verboseIter = F,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     sampling = 'down',
                     savePredictions = T
                     )

#grid expansion
xgbGrid <- expand.grid(
    nrounds=seq(100, 200, 500), #Run the model how many times?
    max_depth=6,                #The maximum depth of a tree
    eta=c(0.01, 0.1, 0.5),      #Step size shrinkage (smaller is less likely to overfit)
    gamma=0,                    #Minimum Loss reduction (the larger the more conservative the model)
    colsample_bytree=c(0.5,1),  #subsample ratio of columns when constructing each tree
    min_child_weight=1,         #minimum sum of instance weight (hessian) needed in a child
    subsample=c(0.5, 1)         #Subsample ratio of the training instance; 0.5 means half the data is used.
)
xgbFitD <- train(
    default~
      PAY_PC1 +
      AGE +
      LIMIT_BAL +
      AMT_PC1 +
      EDUCATION +
      PAY_PC2 +
      PAY_PC3 +
      AMT_PC2 +
      AMT_PC7,
    train.imp,
    method = 'xgbTree',
    trControl = ctrl,
    tuneGrid = xgbGrid,
    metric = "ROC"
)

saveRDS(xgbFitD, file = 'xgbFitD.rds')
print(xgbFitD, details=F)
## eXtreme Gradient Boosting 
## 
## 16169 samples
##     9 predictor
##     2 classes: 'N', 'Y' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 14552, 14553, 14552, 14553, 14551, 14552, ... 
## Addtional sampling using down-sampling
## 
## Resampling results across tuning parameters:
## 
##   eta   colsample_bytree  subsample  ROC        Sens       Spec     
##   0.01  0.5               0.5        0.7940388  0.7748844  0.6958986
##   0.01  0.5               1.0        0.7944374  0.7778040  0.6911871
##   0.01  1.0               0.5        0.8043237  0.8009142  0.6777680
##   0.01  1.0               1.0        0.8003286  0.8013388  0.6674766
##   0.10  0.5               0.5        0.7985907  0.7728947  0.6905190
##   0.10  0.5               1.0        0.8019819  0.7819628  0.6885730
##   0.10  1.0               0.5        0.8008873  0.7738409  0.6946682
##   0.10  1.0               1.0        0.8057312  0.7848987  0.6879580
##   0.50  0.5               0.5        0.7499764  0.7000248  0.6742929
##   0.50  0.5               1.0        0.7761141  0.7360846  0.6875997
##   0.50  1.0               0.5        0.7501419  0.6958673  0.6799233
##   0.50  1.0               1.0        0.7748297  0.7337195  0.6865296
## 
## Tuning parameter 'nrounds' was held constant at a value of 100
## 
## Tuning parameter 'gamma' was held constant at a value of 0
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 100, max_depth = 6,
##  eta = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1
##  and subsample = 1.
plot(xgbFitD)

xgb.importance(
               model = xgbFitD$finalModel) %>%
  xgb.ggplot.importance()

densityplot(xgbFitD,pch='|')

predict(xgbFitD,type = 'prob') -> train.ProbsD
histogram(~Y + N, train.ProbsD)

Test Set Evaluation

Create test set

test.imp
## # A tibble: 6,929 x 16
##    LIMIT_BAL SEX   EDUCATION MARRIAGE   AGE PAY_PC1 PAY_PC2  PAY_PC3
##        <dbl> <fct> <fct>     <fct>    <dbl>   <dbl>   <dbl>    <dbl>
##  1     50000 Fema~ Other     Married     37  -0.393   0.176  0.00489
##  2     50000 Male  Grad_Sch~ Single      37  -0.393   0.176  0.00489
##  3    250000 Male  Grad_Sch~ Single      29  -0.393   0.176  0.00489
##  4    120000 Fema~ Other     Married     39   1.71    0.268 -0.140  
##  5     50000 Male  High_Sch~ Single      23  -0.393   0.176  0.00489
##  6     50000 Fema~ High_Sch~ Married     47   1.71    0.268 -0.140  
##  7    230000 Fema~ Grad_Sch~ Single      27   1.71    0.268 -0.140  
##  8    500000 Male  Grad_Sch~ Married     58   3.81    0.361 -0.284  
##  9     50000 Male  Grad_Sch~ Single      25   2.18   -1.82   0.560  
## 10    280000 Male  Grad_Sch~ Single      31   0.271   0.358 -1.40   
## # ... with 6,919 more rows, and 8 more variables: AMT_PC1 <dbl>,
## #   AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>, AMT_PC5 <dbl>,
## #   AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>
summary(test.imp)
##    LIMIT_BAL          SEX             EDUCATION       MARRIAGE   
##  Min.   :     0   Male  :2805   Grad_School:2490   Married:3173  
##  1st Qu.: 50000   Female:4124   Uni        :   0   Single :3670  
##  Median :140000                 High_School:1099   Other  :  86  
##  Mean   :166907                 Other      :3340                 
##  3rd Qu.:240000                                                  
##  Max.   :760000                                                  
##       AGE           PAY_PC1              PAY_PC2        
##  Min.   :21.00   Min.   :-11.859675   Min.   :-4.42243  
##  1st Qu.:28.00   1st Qu.: -0.393308   1st Qu.:-0.26506  
##  Median :34.00   Median : -0.393308   Median : 0.17555  
##  Mean   :35.47   Mean   :  0.007173   Mean   :-0.00134  
##  3rd Qu.:41.00   3rd Qu.:  1.360047   3rd Qu.: 0.34434  
##  Max.   :70.00   Max.   :  3.813348   Max.   : 5.44103  
##     PAY_PC3             AMT_PC1             AMT_PC2        
##  Min.   :-3.616800   Min.   :-2.161575   Min.   :-3.47545  
##  1st Qu.:-0.283941   1st Qu.:-1.515179   1st Qu.:-0.43997  
##  Median : 0.004886   Median :-0.858795   Median :-0.21123  
##  Mean   : 0.012680   Mean   : 0.002525   Mean   :-0.01860  
##  3rd Qu.: 0.112994   3rd Qu.: 0.463515   3rd Qu.: 0.06759  
##  Max.   : 3.364030   Max.   :19.598795   Max.   :38.06379  
##     AMT_PC3              AMT_PC4              AMT_PC5          
##  Min.   :-29.377842   Min.   :-15.759742   Min.   :-25.102463  
##  1st Qu.: -0.138236   1st Qu.: -0.066254   1st Qu.: -0.082010  
##  Median : -0.070445   Median :  0.018397   Median : -0.032000  
##  Mean   : -0.008069   Mean   :  0.003631   Mean   :  0.008541  
##  3rd Qu.:  0.001689   3rd Qu.:  0.081770   3rd Qu.:  0.025525  
##  Max.   : 15.310976   Max.   : 12.231116   Max.   : 13.346865  
##     AMT_PC6              AMT_PC7          default 
##  Min.   :-17.599251   Min.   :-15.64263   N:5255  
##  1st Qu.: -0.041455   1st Qu.: -0.09317   Y:1674  
##  Median : -0.001935   Median : -0.04190           
##  Mean   :  0.000567   Mean   : -0.01526           
##  3rd Qu.:  0.066713   3rd Qu.:  0.02631           
##  Max.   : 15.582944   Max.   : 14.03629
map_int(test.imp,~sum(is.na(.x)))
## LIMIT_BAL       SEX EDUCATION  MARRIAGE       AGE   PAY_PC1   PAY_PC2 
##         0         0         0         0         0         0         0 
##   PAY_PC3   AMT_PC1   AMT_PC2   AMT_PC3   AMT_PC4   AMT_PC5   AMT_PC6 
##         0         0         0         0         0         0         0 
##   AMT_PC7   default 
##         0         0
test.imp <- test.imp[complete.cases(test.imp), ]

Predict Results

xgbPredDownF <- predict(object = xgbFitD, newdata = test.imp)
library(purrr)

xtab <- table(xgbPredDownF,test.imp$default)
xgbCM <- confusionMatrix(xtab)

#CM_list <- list(xgbCM, elastinetCM, knnCM, svmCM)

#compiled_results <- tibble(
    #models = c('xgb','elastinet','knn','svm'),
    #accuracy = map_dbl(CM_list,~.x$overall[1]),
    #kappa = map_dbl(CM_list,~.x$overall[2]),
    #sensitivity = map_dbl(CM_list,~.x$byClass[1]),
    #specificity = map_dbl(CM_list,~.x$byClass[2]),
    #F1 = map_dbl(CM_list,~.x$byClass[7])
#)
#compiled_results %>% arrange(accuracy,kappa)

xgbCM
## Confusion Matrix and Statistics
## 
##             
## xgbPredDownF    N    Y
##            N 4145  556
##            Y 1110 1118
##                                           
##                Accuracy : 0.7596          
##                  95% CI : (0.7493, 0.7696)
##     No Information Rate : 0.7584          
##     P-Value [Acc > NIR] : 0.4175          
##                                           
##                   Kappa : 0.4104          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7888          
##             Specificity : 0.6679          
##          Pos Pred Value : 0.8817          
##          Neg Pred Value : 0.5018          
##              Prevalence : 0.7584          
##          Detection Rate : 0.5982          
##    Detection Prevalence : 0.6785          
##       Balanced Accuracy : 0.7283          
##                                           
##        'Positive' Class : N               
## 

Validation Set


predict.raw <- read_csv('AT2_credit_test_STUDENT.csv')
## Parsed with column specification:
## cols(
##   ID = col_integer(),
##   LIMIT_BAL = col_integer(),
##   SEX = col_integer(),
##   EDUCATION = col_integer(),
##   MARRIAGE = col_integer(),
##   AGE = col_integer(),
##   PAY_PC1 = col_double(),
##   PAY_PC2 = col_double(),
##   PAY_PC3 = col_double(),
##   AMT_PC1 = col_double(),
##   AMT_PC2 = col_double(),
##   AMT_PC3 = col_double(),
##   AMT_PC4 = col_double(),
##   AMT_PC5 = col_double(),
##   AMT_PC6 = col_double(),
##   AMT_PC7 = col_double()
## )
val.imp <- predict.raw

val.imp$SEX <- as.factor(val.imp$SEX)
val.imp$SEX <- factor(val.imp$SEX,
                           levels =c('1', '2'),
                           labels =c('Male', 'Female'))

#all education factors greater than 4, set them to 4, also set 0 to 4
  val.imp$EDUCATION[val.imp$EDUCATION == 5] <- 4
  val.imp$EDUCATION[val.imp$EDUCATION == 6] <- 4
  val.imp$EDUCATION[val.imp$EDUCATION == 0] <- 4
  val.imp$EDUCATION <- factor(val.imp$EDUCATION,
                                 levels =c('1', '2', '3', '4'),
                                 labels = c('Grad_School', 'Uni', 'High_School', 'Other'))

  #clean marriage
val.imp$MARRIAGE[val.imp$MARRIAGE == 0] <- 3
val.imp$MARRIAGE <- as.factor(val.imp$MARRIAGE)
val.imp$MARRIAGE <- factor(val.imp$MARRIAGE,
                                levels = c('1','2','3'),
                                labels = c('Married', 'Single', 'Other'))

summary(val.imp)
##        ID          LIMIT_BAL          SEX             EDUCATION   
##  Min.   :    5   Min.   : 10000   Male  :2641   Grad_School:2393  
##  1st Qu.: 7532   1st Qu.: 50000   Female:4258   Uni        :3306  
##  Median :15062   Median :140000                 High_School:1096  
##  Mean   :15065   Mean   :166217                 Other      : 104  
##  3rd Qu.:22624   3rd Qu.:230000                                   
##  Max.   :29989   Max.   :800000                                   
##     MARRIAGE         AGE           PAY_PC1              PAY_PC2         
##  Married:3149   Min.   :21.00   Min.   :-13.302028   Min.   :-4.110567  
##  Single :3660   1st Qu.:28.00   1st Qu.: -0.393308   1st Qu.:-0.210711  
##  Other  :  90   Median :34.00   Median : -0.393308   Median : 0.175555  
##                 Mean   :35.47   Mean   :  0.005547   Mean   : 0.005926  
##                 3rd Qu.:41.00   3rd Qu.:  1.360047   3rd Qu.: 0.361123  
##                 Max.   :79.00   Max.   :  3.813348   Max.   : 4.967609  
##     PAY_PC3             AMT_PC1            AMT_PC2         
##  Min.   :-3.341166   Min.   :-2.06507   Min.   :-3.535521  
##  1st Qu.:-0.283941   1st Qu.:-1.50940   1st Qu.:-0.424712  
##  Median : 0.004886   Median :-0.85844   Median :-0.211871  
##  Mean   :-0.002183   Mean   :-0.01544   Mean   :-0.004575  
##  3rd Qu.: 0.057637   3rd Qu.: 0.48720   3rd Qu.: 0.067081  
##  Max.   : 3.176797   Max.   :19.26397   Max.   :28.783658  
##     AMT_PC3            AMT_PC4             AMT_PC5          
##  Min.   :-8.19007   Min.   :-19.17146   Min.   :-24.108569  
##  1st Qu.:-0.13215   1st Qu.: -0.06678   1st Qu.: -0.082701  
##  Median :-0.07044   Median :  0.01664   Median : -0.032000  
##  Mean   :-0.01284   Mean   : -0.01546   Mean   : -0.004962  
##  3rd Qu.:-0.00122   3rd Qu.:  0.07733   3rd Qu.:  0.020652  
##  Max.   :17.88261   Max.   : 15.07758   Max.   : 12.649943  
##     AMT_PC6              AMT_PC7         
##  Min.   :-19.077508   Min.   :-25.90403  
##  1st Qu.: -0.040660   1st Qu.: -0.09017  
##  Median : -0.001886   Median : -0.04039  
##  Mean   :  0.006764   Mean   :  0.01371  
##  3rd Qu.:  0.069230   3rd Qu.:  0.02954  
##  Max.   : 14.222570   Max.   : 18.18085
val.imp$default <- predict(xgbFitD, val.imp)

final.output <- val.imp %>%
    select(ID, default)

final.output$default <- as.character(final.output$default)

final.output$default[final.output$default == "N"] = 0 #Set N to 0 and Y to 1
final.output$default[final.output$default == "Y"] = 1 #Set Y to 1

final.output
## # A tibble: 6,899 x 2
##       ID default
##    <int> <chr>  
##  1     5 0      
##  2    17 1      
##  3    19 0      
##  4    23 1      
##  5    27 1      
##  6    30 0      
##  7    32 1      
##  8    34 0      
##  9    37 0      
## 10    38 0      
## # ... with 6,889 more rows
write.csv(final.output, file="xgb_resultsEdDown.csv", row.names=FALSE)