Meta-learning - Ensemble Methods
library(tidyverse)
library(caret)
library(DMwR)
library(rpart)
library(ROCR)
library(randomForest)
library(xgboost)Collect and Prepare the Data
loans <- read_csv("https://s3.amazonaws.com/notredame.analytics.data/lendingclub.csv")
loans$Grade <- as.factor(loans$Grade)
loans$EmploymentLength <- as.factor(loans$EmploymentLength)
loans$HomeOwnership <- as.factor(loans$HomeOwnership)
loans$IncomeVerified <- as.factor(loans$IncomeVerified)
loans$LoanPurpose <- as.factor(loans$LoanPurpose)
loans$Default <- as.factor(loans$Default)
set.seed(1234)
sample.set <- createDataPartition(loans$Default, p = 0.75, list = FALSE)
loans.train <- loans[sample.set, ]
loans.train <- SMOTE(Default ~ ., data.frame(loans.train), perc.over = 100, perc.under = 200)
loans.test <- loans[-sample.set, ]Train and Evaluate a Model
CART
19228 samples
18 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 19228, 19228, 19228, 19228, 19228, 19228, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.02101103 0.6439888 0.2881672
0.02652382 0.6334887 0.2674375
0.25119617 0.5214907 0.0482178
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02101103.
tree.pred <- predict(tree.mod, loans.test)
confusionMatrix(tree.pred, loans.test$Default, positive = "Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 6164 789
Yes 2845 813
Accuracy : 0.6575
95% CI : (0.6484, 0.6666)
No Information Rate : 0.849
P-Value [Acc > NIR] : 1
Kappa : 0.1255
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.50749
Specificity : 0.68420
Pos Pred Value : 0.22225
Neg Pred Value : 0.88652
Prevalence : 0.15098
Detection Rate : 0.07662
Detection Prevalence : 0.34474
Balanced Accuracy : 0.59585
'Positive' Class : Yes
[1] Yes No Yes No No No
Levels: No Yes
No Yes
1 0.3464811 0.6535189
2 0.6730191 0.3269809
3 0.3464811 0.6535189
4 0.6730191 0.3269809
5 0.5358852 0.4641148
6 0.5358852 0.4641148
Customize the Tuning Process
ctrl <-
trainControl(method = "cv", ### control object: cross-validation
number = 10, ### fold number
selectionFunction = "oneSE") ### 3 options model parameter label forReg forClass probModel
1 C5.0 trials # Boosting Iterations FALSE TRUE TRUE
2 C5.0 model Model Type FALSE TRUE TRUE
3 C5.0 winnow Winnow FALSE TRUE TRUE
grid <-
expand.grid(
.model = "tree",
.trials = c(1, 5, 10, 15, 20, 25, 30, 35),
.winnow = FALSE
)
grid .model .trials .winnow
1 tree 1 FALSE
2 tree 5 FALSE
3 tree 10 FALSE
4 tree 15 FALSE
5 tree 20 FALSE
6 tree 25 FALSE
7 tree 30 FALSE
8 tree 35 FALSE
model parameter label forReg forClass probModel
1 rpart cp Complexity Parameter TRUE TRUE TRUE
.cp
1 0.0001
2 0.0002
3 0.0003
4 0.0004
5 0.0005
6 0.0006
7 0.0007
8 0.0008
9 0.0009
10 0.0010
11 0.0011
12 0.0012
13 0.0013
14 0.0014
15 0.0015
16 0.0016
17 0.0017
18 0.0018
19 0.0019
20 0.0020
21 0.0021
22 0.0022
23 0.0023
24 0.0024
25 0.0025
26 0.0026
27 0.0027
28 0.0028
29 0.0029
30 0.0030
31 0.0031
32 0.0032
33 0.0033
34 0.0034
35 0.0035
36 0.0036
37 0.0037
38 0.0038
39 0.0039
40 0.0040
41 0.0041
42 0.0042
43 0.0043
44 0.0044
45 0.0045
46 0.0046
47 0.0047
48 0.0048
49 0.0049
50 0.0050
set.seed(1234)
tree.mod <-
train(
Default ~ .,
data = loans.train,
method = "rpart",
metric = "Kappa", ### using kappa, not accuracy
trControl = ctrl,
tuneGrid = grid ### 50 cp values we got earlier
)
tree.modCART
19228 samples
18 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 17304, 17306, 17305, 17304, 17305, 17306, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.0001 0.7029326 0.4058658
0.0002 0.7106288 0.4212586
0.0003 0.7143202 0.4286414
0.0004 0.7170771 0.4341562
0.0005 0.7170251 0.4340508
0.0006 0.7167651 0.4335292
0.0007 0.7139563 0.4279107
0.0008 0.7135405 0.4270801
0.0009 0.7128643 0.4257282
0.0010 0.7127604 0.4255207
0.0011 0.7116688 0.4233364
0.0012 0.7107328 0.4214643
0.0013 0.7107323 0.4214666
0.0014 0.7096923 0.4193864
0.0015 0.7090164 0.4180342
0.0016 0.7078197 0.4156382
0.0017 0.7060518 0.4121021
0.0018 0.7064157 0.4128322
0.0019 0.7067279 0.4134565
0.0020 0.7067280 0.4134573
0.0021 0.7070922 0.4141857
0.0022 0.7073003 0.4146029
0.0023 0.7064682 0.4129405
0.0024 0.7068323 0.4136678
0.0025 0.7068323 0.4136678
0.0026 0.7068323 0.4136678
0.0027 0.7065724 0.4131472
0.0028 0.7060003 0.4120047
0.0029 0.7060003 0.4120047
0.0030 0.7060003 0.4120047
0.0031 0.7060003 0.4120047
0.0032 0.7058443 0.4116925
0.0033 0.7058443 0.4116925
0.0034 0.7056362 0.4112764
0.0035 0.7043877 0.4087794
0.0036 0.7038154 0.4076341
0.0037 0.7032435 0.4064890
0.0038 0.7032435 0.4064890
0.0039 0.7026714 0.4053448
0.0040 0.7026714 0.4053461
0.0041 0.7021507 0.4043020
0.0042 0.7000705 0.4001436
0.0043 0.7000705 0.4001436
0.0044 0.6990302 0.3980636
0.0045 0.6990302 0.3980636
0.0046 0.6984061 0.3968157
0.0047 0.6978341 0.3956710
0.0048 0.6977302 0.3954631
0.0049 0.6977302 0.3954631
0.0050 0.6977302 0.3954631
Kappa was used to select the optimal model using the one SE rule.
The final value used for the model was cp = 7e-04.
Ensemble Models
Random Forest
model parameter label forReg forClass probModel
1 rf mtry #Randomly Selected Predictors TRUE TRUE TRUE
grid <- expand.grid(.mtry = c(3, 6, 9))
ctrl <-
trainControl(method = "cv",
number = 3, ### can change the number of the fold to try nore mtry numbers
selectionFunction = "best")
set.seed(1234)
rf.mod <-
train(
Default ~ .,
data = loans.train,
method = "rf",
metric = "Kappa",
trControl = ctrl,
tuneGrid = grid
)
rf.modRandom Forest
19228 samples
18 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (3 fold)
Summary of sample sizes: 12818, 12819, 12819
Resampling results across tuning parameters:
mtry Accuracy Kappa
3 0.7457353 0.4914732
6 0.7700228 0.5400467
9 0.7726232 0.5452469
Kappa was used to select the optimal model using the largest value.
The final value used for the model was mtry = 9.
Extreme Gradient Boosting
model parameter label forReg forClass
1 xgbTree nrounds # Boosting Iterations TRUE TRUE
2 xgbTree max_depth Max Tree Depth TRUE TRUE
3 xgbTree eta Shrinkage TRUE TRUE
4 xgbTree gamma Minimum Loss Reduction TRUE TRUE
5 xgbTree colsample_bytree Subsample Ratio of Columns TRUE TRUE
6 xgbTree min_child_weight Minimum Sum of Instance Weight TRUE TRUE
7 xgbTree subsample Subsample Percentage TRUE TRUE
probModel
1 TRUE
2 TRUE
3 TRUE
4 TRUE
5 TRUE
6 TRUE
7 TRUE
grid <- expand.grid(
nrounds = 20,
max_depth = c(4, 6, 8),
eta = c(0.1, 0.3, 0.5),
gamma = 0.01,
colsample_bytree = 1,
min_child_weight = 1,
subsample = c(0.5, 1)
)
set.seed(1234)
xgb.mod <-
train(
Default ~ .,
data = loans.train,
method = "xgbTree",
metric = "Kappa",
trControl = ctrl,
tuneGrid = grid
)
xgb.modeXtreme Gradient Boosting
19228 samples
18 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (3 fold)
Summary of sample sizes: 12818, 12819, 12819
Resampling results across tuning parameters:
eta max_depth subsample Accuracy Kappa
0.1 4 0.5 0.7145306 0.4290631
0.1 4 1.0 0.7128147 0.4256306
0.1 6 0.5 0.7252965 0.4505944
0.1 6 1.0 0.7253486 0.4506984
0.1 8 0.5 0.7330458 0.4660927
0.1 8 1.0 0.7333059 0.4666126
0.3 4 0.5 0.7345538 0.4691077
0.3 4 1.0 0.7398584 0.4797175
0.3 6 0.5 0.7388704 0.4777420
0.3 6 1.0 0.7475558 0.4951131
0.3 8 0.5 0.7418350 0.4836711
0.3 8 1.0 0.7526527 0.5053065
0.5 4 0.5 0.7337221 0.4674456
0.5 4 1.0 0.7476081 0.4952181
0.5 6 0.5 0.7389229 0.4778465
0.5 6 1.0 0.7506765 0.5013543
0.5 8 0.5 0.7342419 0.4684855
0.5 8 1.0 0.7574891 0.5149794
Tuning parameter 'nrounds' was held constant at a value of 20
Tuning parameter 'colsample_bytree' was held constant at a value of
1
Tuning parameter 'min_child_weight' was held constant at a value of 1
Kappa was used to select the optimal model using the largest value.
The final values used for the model were nrounds = 20, max_depth = 8,
eta = 0.5, gamma = 0.01, colsample_bytree = 1, min_child_weight = 1
and subsample = 1.
Compare Model Performance
## Logistic Regression
logit.mod <-
glm(Default ~ ., family = binomial(link = 'logit'), data = loans.train)
logit.pred.prob <- predict(logit.mod, loans.test, type = 'response')
logit.pred <- as.factor(ifelse(logit.pred.prob > 0.5, "Yes", "No"))
test <- loans.test$Default
pred <- logit.pred
prob <- logit.pred.prob
# Plot ROC Curve
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve for Loan Default Prediction Approaches", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- tibble(approach="Logistic Regression", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
## Classification Tree
tree.pred <- predict(tree.mod, loans.test, type = "raw")
tree.pred.prob <- predict(tree.mod, loans.test, type = "prob")
test <- loans.test$Default
pred <- tree.pred
prob <- tree.pred.prob[,c("Yes")]
# Plot ROC Curve
# dev.off()
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=3, lwd = 2, add=TRUE)
# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
add_row(approach="Classification Tree", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
## Random Forest
rf.pred <- predict(rf.mod, loans.test, type = "raw")
rf.pred.prob <- predict(rf.mod, loans.test, type = "prob")
test <- loans.test$Default
pred <- rf.pred
prob <- rf.pred.prob[,c("Yes")]
# Plot ROC Curve
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=4, lwd = 2, add=TRUE)
# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
add_row(approach="Random Forest", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
## XGBoost
xgb.pred <- predict(xgb.mod, loans.test, type = "raw")
xgb.pred.prob <- predict(xgb.mod, loans.test, type = "prob")
test <- loans.test$Default
pred <- xgb.pred
prob <- xgb.pred.prob[,c("Yes")]
# Plot ROC Curve
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=5, lwd = 2, add=TRUE)
# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "Yes")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "Yes")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
add_row(approach="Extreme Gradient Boosting", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
legend(0.6, 0.6, c('Logistic Regression', 'Classification Tree', 'Random Forest', 'Extreme Gradient Boosting'), 2:5)# A tibble: 4 x 5
approach accuracy fmeasure kappa auc
<chr> <dbl> <dbl> <dbl> <dbl>
1 Logistic Regression 0.633 0.343 0.157 0.688
2 Classification Tree 0.706 0.316 0.150 0.645
3 Random Forest 0.681 0.337 0.164 0.683
4 Extreme Gradient Boosting 0.716 0.309 0.146 0.650
Conclusion
logistic regression and random forest are 2 better options, but logistic regression is the best because random forest is ensemble method which makes more assumption. When choosing the optimal model, we always go with simpler model which makes the least assumption based on Occam’s razor - Law of Parsimony.