Meta-learning - Ensemble Methods

Collect and Prepare the Data
Train and Evaluate a Model
Customize the Tuning Process
Ensemble Models
- Random Forest
- Extreme Gradient Boosting
Compare Model Performance
Conclusion

library(tidyverse)
library(caret)
library(DMwR)
library(rpart)
library(ROCR)
library(randomForest)
library(xgboost)

Collect and Prepare the Data

loans <- read_csv("https://s3.amazonaws.com/notredame.analytics.data/lendingclub.csv")

loans$Grade <- as.factor(loans$Grade)
loans$EmploymentLength <- as.factor(loans$EmploymentLength)
loans$HomeOwnership <- as.factor(loans$HomeOwnership)
loans$IncomeVerified <- as.factor(loans$IncomeVerified)
loans$LoanPurpose <- as.factor(loans$LoanPurpose)
loans$Default <- as.factor(loans$Default)

set.seed(1234)
sample.set <- createDataPartition(loans$Default, p = 0.75, list = FALSE)
loans.train <- loans[sample.set, ]
loans.train <- SMOTE(Default ~ ., data.frame(loans.train), perc.over = 100, perc.under = 200)
loans.test <- loans[-sample.set, ]

Train and Evaluate a Model

tree.mod <- train(Default ~ ., data = loans.train, method = "rpart") 
tree.mod

CART 

19228 samples
   18 predictor
    2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 19228, 19228, 19228, 19228, 19228, 19228, ... 
Resampling results across tuning parameters:

  cp          Accuracy   Kappa    
  0.02101103  0.6439888  0.2881672
  0.02652382  0.6334887  0.2674375
  0.25119617  0.5214907  0.0482178

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02101103.

tree.pred <- predict(tree.mod, loans.test)

confusionMatrix(tree.pred, loans.test$Default, positive = "Yes")

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  6164  789
       Yes 2845  813
                                          
               Accuracy : 0.6575          
                 95% CI : (0.6484, 0.6666)
    No Information Rate : 0.849           
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.1255          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.50749         
            Specificity : 0.68420         
         Pos Pred Value : 0.22225         
         Neg Pred Value : 0.88652         
             Prevalence : 0.15098         
         Detection Rate : 0.07662         
   Detection Prevalence : 0.34474         
      Balanced Accuracy : 0.59585         
                                          
       'Positive' Class : Yes

head(predict(tree.mod, loans.test, type = "raw"))

[1] Yes No  Yes No  No  No 
Levels: No Yes

head(predict(tree.mod, loans.test, type = "prob"))

         No       Yes
1 0.3464811 0.6535189
2 0.6730191 0.3269809
3 0.3464811 0.6535189
4 0.6730191 0.3269809
5 0.5358852 0.4641148
6 0.5358852 0.4641148

Customize the Tuning Process

ctrl <-
  trainControl(method = "cv",  ### control object: cross-validation 
               number = 10, ### fold number 
               selectionFunction = "oneSE") ### 3 options

modelLookup("C5.0")

  model parameter                 label forReg forClass probModel
1  C5.0    trials # Boosting Iterations  FALSE     TRUE      TRUE
2  C5.0     model            Model Type  FALSE     TRUE      TRUE
3  C5.0    winnow                Winnow  FALSE     TRUE      TRUE

grid <-
  expand.grid(
    .model = "tree",
    .trials = c(1, 5, 10, 15, 20, 25, 30, 35),
    .winnow = FALSE
  )

grid

  .model .trials .winnow
1   tree       1   FALSE
2   tree       5   FALSE
3   tree      10   FALSE
4   tree      15   FALSE
5   tree      20   FALSE
6   tree      25   FALSE
7   tree      30   FALSE
8   tree      35   FALSE

modelLookup("rpart")

  model parameter                label forReg forClass probModel
1 rpart        cp Complexity Parameter   TRUE     TRUE      TRUE

grid <- 
  expand.grid(
    .cp = seq(from=0.0001, to=0.005, by=0.0001)
  )

grid

set.seed(1234)
tree.mod <-
  train(
    Default ~ .,
    data = loans.train,
    method = "rpart",
    metric = "Kappa", ### using kappa, not accuracy 
    trControl = ctrl,
    tuneGrid = grid ### 50 cp values we got earlier 
  )

tree.mod

CART 

19228 samples
   18 predictor
    2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 17304, 17306, 17305, 17304, 17305, 17306, ... 
Resampling results across tuning parameters:

  cp      Accuracy   Kappa    
  0.0001  0.7029326  0.4058658
  0.0002  0.7106288  0.4212586
  0.0003  0.7143202  0.4286414
  0.0004  0.7170771  0.4341562
  0.0005  0.7170251  0.4340508
  0.0006  0.7167651  0.4335292
  0.0007  0.7139563  0.4279107
  0.0008  0.7135405  0.4270801
  0.0009  0.7128643  0.4257282
  0.0010  0.7127604  0.4255207
  0.0011  0.7116688  0.4233364
  0.0012  0.7107328  0.4214643
  0.0013  0.7107323  0.4214666
  0.0014  0.7096923  0.4193864
  0.0015  0.7090164  0.4180342
  0.0016  0.7078197  0.4156382
  0.0017  0.7060518  0.4121021
  0.0018  0.7064157  0.4128322
  0.0019  0.7067279  0.4134565
  0.0020  0.7067280  0.4134573
  0.0021  0.7070922  0.4141857
  0.0022  0.7073003  0.4146029
  0.0023  0.7064682  0.4129405
  0.0024  0.7068323  0.4136678
  0.0025  0.7068323  0.4136678
  0.0026  0.7068323  0.4136678
  0.0027  0.7065724  0.4131472
  0.0028  0.7060003  0.4120047
  0.0029  0.7060003  0.4120047
  0.0030  0.7060003  0.4120047
  0.0031  0.7060003  0.4120047
  0.0032  0.7058443  0.4116925
  0.0033  0.7058443  0.4116925
  0.0034  0.7056362  0.4112764
  0.0035  0.7043877  0.4087794
  0.0036  0.7038154  0.4076341
  0.0037  0.7032435  0.4064890
  0.0038  0.7032435  0.4064890
  0.0039  0.7026714  0.4053448
  0.0040  0.7026714  0.4053461
  0.0041  0.7021507  0.4043020
  0.0042  0.7000705  0.4001436
  0.0043  0.7000705  0.4001436
  0.0044  0.6990302  0.3980636
  0.0045  0.6990302  0.3980636
  0.0046  0.6984061  0.3968157
  0.0047  0.6978341  0.3956710
  0.0048  0.6977302  0.3954631
  0.0049  0.6977302  0.3954631
  0.0050  0.6977302  0.3954631

Kappa was used to select the optimal model using  the one SE rule.
The final value used for the model was cp = 7e-04.

Ensemble Models

Random Forest

modelLookup("rf")

  model parameter                         label forReg forClass probModel
1    rf      mtry #Randomly Selected Predictors   TRUE     TRUE      TRUE

grid <- expand.grid(.mtry = c(3, 6, 9))

ctrl <-
  trainControl(method = "cv",
               number = 3, ### can change the number of the fold to try nore mtry numbers 
               selectionFunction = "best")

set.seed(1234)
rf.mod <-
  train(
    Default ~ .,
    data = loans.train,
    method = "rf",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid
  )

rf.mod

Random Forest 

19228 samples
   18 predictor
    2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (3 fold) 
Summary of sample sizes: 12818, 12819, 12819 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
  3     0.7457353  0.4914732
  6     0.7700228  0.5400467
  9     0.7726232  0.5452469

Kappa was used to select the optimal model using the largest value.
The final value used for the model was mtry = 9.

Extreme Gradient Boosting

ctrl <-
  trainControl(method = "cv",
               number = 3,
               selectionFunction = "best")

modelLookup("xgbTree")

    model        parameter                          label forReg forClass
1 xgbTree          nrounds          # Boosting Iterations   TRUE     TRUE
2 xgbTree        max_depth                 Max Tree Depth   TRUE     TRUE
3 xgbTree              eta                      Shrinkage   TRUE     TRUE
4 xgbTree            gamma         Minimum Loss Reduction   TRUE     TRUE
5 xgbTree colsample_bytree     Subsample Ratio of Columns   TRUE     TRUE
6 xgbTree min_child_weight Minimum Sum of Instance Weight   TRUE     TRUE
7 xgbTree        subsample           Subsample Percentage   TRUE     TRUE
  probModel
1      TRUE
2      TRUE
3      TRUE
4      TRUE
5      TRUE
6      TRUE
7      TRUE

grid <- expand.grid(
  nrounds = 20,
  max_depth = c(4, 6, 8),
  eta =  c(0.1, 0.3, 0.5),
  gamma = 0.01,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = c(0.5, 1)
)

set.seed(1234)
xgb.mod <-
  train(
    Default ~ .,
    data = loans.train,
    method = "xgbTree",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid
  )

xgb.mod

eXtreme Gradient Boosting 

19228 samples
   18 predictor
    2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (3 fold) 
Summary of sample sizes: 12818, 12819, 12819 
Resampling results across tuning parameters:

  eta  max_depth  subsample  Accuracy   Kappa    
  0.1  4          0.5        0.7145306  0.4290631
  0.1  4          1.0        0.7128147  0.4256306
  0.1  6          0.5        0.7252965  0.4505944
  0.1  6          1.0        0.7253486  0.4506984
  0.1  8          0.5        0.7330458  0.4660927
  0.1  8          1.0        0.7333059  0.4666126
  0.3  4          0.5        0.7345538  0.4691077
  0.3  4          1.0        0.7398584  0.4797175
  0.3  6          0.5        0.7388704  0.4777420
  0.3  6          1.0        0.7475558  0.4951131
  0.3  8          0.5        0.7418350  0.4836711
  0.3  8          1.0        0.7526527  0.5053065
  0.5  4          0.5        0.7337221  0.4674456
  0.5  4          1.0        0.7476081  0.4952181
  0.5  6          0.5        0.7389229  0.4778465
  0.5  6          1.0        0.7506765  0.5013543
  0.5  8          0.5        0.7342419  0.4684855
  0.5  8          1.0        0.7574891  0.5149794

Tuning parameter 'nrounds' was held constant at a value of 20

Tuning parameter 'colsample_bytree' was held constant at a value of
 1
Tuning parameter 'min_child_weight' was held constant at a value of 1
Kappa was used to select the optimal model using the largest value.
The final values used for the model were nrounds = 20, max_depth = 8,
 eta = 0.5, gamma = 0.01, colsample_bytree = 1, min_child_weight = 1
 and subsample = 1.

Compare Model Performance

## Logistic Regression

logit.mod <-
  glm(Default ~ ., family = binomial(link = 'logit'), data = loans.train)

logit.pred.prob <- predict(logit.mod, loans.test, type = 'response')

logit.pred <- as.factor(ifelse(logit.pred.prob > 0.5, "Yes", "No"))

test <- loans.test$Default
pred <- logit.pred
prob <- logit.pred.prob

# Plot ROC Curve
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve for Loan Default Prediction Approaches", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)

# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- tibble(approach="Logistic Regression", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 


## Classification Tree
tree.pred <- predict(tree.mod, loans.test, type = "raw")
tree.pred.prob <- predict(tree.mod, loans.test, type = "prob")

test <- loans.test$Default
pred <- tree.pred
prob <- tree.pred.prob[,c("Yes")]

# Plot ROC Curve
# dev.off()
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=3, lwd = 2, add=TRUE)

# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach="Classification Tree", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 


## Random Forest

rf.pred <- predict(rf.mod, loans.test, type = "raw")
rf.pred.prob <- predict(rf.mod, loans.test, type = "prob")

test <- loans.test$Default
pred <- rf.pred
prob <- rf.pred.prob[,c("Yes")]

# Plot ROC Curve
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=4, lwd = 2, add=TRUE)

# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach="Random Forest", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 


## XGBoost

xgb.pred <- predict(xgb.mod, loans.test, type = "raw")
xgb.pred.prob <- predict(xgb.mod, loans.test, type = "prob")

test <- loans.test$Default
pred <- xgb.pred
prob <- xgb.pred.prob[,c("Yes")]

# Plot ROC Curve
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=5, lwd = 2, add=TRUE)

# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach="Extreme Gradient Boosting", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 

legend(0.6, 0.6, c('Logistic Regression', 'Classification Tree', 'Random Forest', 'Extreme Gradient Boosting'), 2:5)

comparisons

# A tibble: 4 x 5
  approach                  accuracy fmeasure kappa   auc
  <chr>                        <dbl>    <dbl> <dbl> <dbl>
1 Logistic Regression          0.633    0.343 0.157 0.688
2 Classification Tree          0.706    0.316 0.150 0.645
3 Random Forest                0.681    0.337 0.164 0.683
4 Extreme Gradient Boosting    0.716    0.309 0.146 0.650

Conclusion

logistic regression and random forest are 2 better options, but logistic regression is the best because random forest is ensemble method which makes more assumption. When choosing the optimal model, we always go with simpler model which makes the least assumption based on Occam’s razor - Law of Parsimony.