In chapter 7 of John Foreman’s book, Data Smart[http://www.wiley.com/WileyCDA/WileyTitle/productCd-111866146X.html], he again predicts the pregnancy status of Retail Mart’s customers based on their shopping habits. This time he uses ensemble techniques, specifically bagging and boosting, to build his predictive models.
Since the author also provides the R code for a logistic regression and random forest solution in a later chapter of his book (chapter 10), we take a slightly different approach here. We use the well-known and powerful R caret predictive modeling framework to implement the bagging, boosting and random forest models.
The boosting model proved to have a marginally better AUC score. More importantly, we will see dramatic differences in computation time, with the boosting model being more than 6 times and 13 times faster than the random forest and bagging methods used respectively.
# An R script, using the caret package, is used to apply and compare ensemble model performance with the synthesized RetailMart dataset in chapter 7 of the book Data Smart by John Foreman. Data Smart provides excellent explanations of Data Science methods. The book's website is: http://www.wiley.com/WileyCDA/WileyTitle/productCd-111866146X.html
# Load required packages
library(caret) # See the excellent http://topepo.github.io/caret/index.html by the package developer, Max Kuhn
library(ggplot2)
library(e1071)
library(ROCR) # required for ROC curve
# Read in the training and test data
rm_train <- read.csv(".\\ensemble_training.csv", header = TRUE, check.names = FALSE) # the training data
rm_train$PREGNANT <- factor(rm_train$PREGNANT, labels = c("Yes", "No")) # using labels "1" and "0" can give rise to problems in caret
rm_test <- read.csv(".\\ensemble_test_orig.csv", header = TRUE, check.names = FALSE) # the test data
rm_test$PREGNANT <- factor(rm_test$PREGNANT, labels = c("Yes", "No"))
# Get ready to set up and use caret
# Set the control parameters for the training step
# classProbs are required to return probability of outcome (pregnant and not pregnant in this case)
# summaryFunction is set to return outcome as a set of binary classification results
# ten-fold cross validation is used by default
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE, summaryFunction = twoClassSummary)
# Train a random forest model using using the rm_train data
# method is set to 'rf', the designator for the randomForest package
# metric is set to 'ROC', so that the output data is suitable for generating an ROC curve
set.seed(88)
timeStart <- proc.time() # measure computation time
rf_fit <- train(PREGNANT ~., data = rm_train, method = "rf", trControl = ctrl, metric = "ROC", type = "Classification")
proc.time() - timeStart # time taken was measured at 6 mins 40 seconds in this example
## user system elapsed
## 292.22 6.54 310.84
plot(rf_fit) # plots the fitted results. The best AUC is achieved with 2 random predictors
rf_fit # prints the model information
## Random Forest
##
## 1000 samples
## 19 predictor
## 2 classes: 'Yes', 'No'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
##
## Summary of sample sizes: 900, 900, 900, 900, 900, 900, ...
##
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec ROC SD Sens SD
## 2 0.8939133 0.9033333 0.7360000 0.03381877 0.03679705
## 10 0.8676067 0.8826667 0.7226667 0.03944552 0.04059330
## 19 0.8621867 0.8806667 0.7106667 0.03725257 0.04050827
## Spec SD
## 0.06589438
## 0.06051123
## 0.05982350
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
plot(varImp(rf_fit)) # plots the variable importance
# Predict on the test set using the random forest model created with the training set
rf_class <- predict(rf_fit, newdata = rm_test) # classification outcome required for the confusion matrix
rf_pred <- predict(rf_fit, newdata = rm_test, type = "prob") # predicted data required for ROC curve
# Generate a confusion matrix for the random forest prediction results
rf_cm <- confusionMatrix(rf_class, rm_test$PREGNANT)
rf_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 843 13
## No 97 47
##
## Accuracy : 0.89
## 95% CI : (0.8689, 0.9087)
## No Information Rate : 0.94
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4109
## Mcnemar's Test P-Value : 2.498e-15
##
## Sensitivity : 0.8968
## Specificity : 0.7833
## Pos Pred Value : 0.9848
## Neg Pred Value : 0.3264
## Prevalence : 0.9400
## Detection Rate : 0.8430
## Detection Prevalence : 0.8560
## Balanced Accuracy : 0.8401
##
## 'Positive' Class : Yes
##
# Train a boosted classification tree model using caret and the 'gbm' package
# measure the process time
timeStart <- proc.time()
gbm_fit <- train(PREGNANT ~., data = rm_train, method = "gbm", trControl = ctrl, metric = "ROC", verbose = FALSE) # using the same trControl settings
proc.time() - timeStart # process time measured at 45 secs
## user system elapsed
## 45.79 0.25 46.47
plot(gbm_fit) # plots gbm_fit
gbm_fit # returns gbm_fit
## Stochastic Gradient Boosting
##
## 1000 samples
## 19 predictor
## 2 classes: 'Yes', 'No'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
##
## Summary of sample sizes: 900, 900, 900, 900, 900, 900, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees ROC Sens Spec ROC SD
## 1 50 0.8866733 0.8980000 0.7120000 0.02918051
## 1 100 0.8981333 0.8993333 0.7420000 0.02639375
## 1 150 0.9013867 0.8960000 0.7566667 0.02464031
## 2 50 0.8958800 0.8946667 0.7433333 0.02869702
## 2 100 0.9023000 0.8966667 0.7586667 0.02630462
## 2 150 0.9023867 0.8960000 0.7613333 0.02688160
## 3 50 0.8999200 0.8946667 0.7546667 0.02664430
## 3 100 0.9040067 0.8973333 0.7593333 0.02380261
## 3 150 0.9024867 0.8926667 0.7640000 0.02470665
## Sens SD Spec SD
## 0.04936842 0.07888883
## 0.04798946 0.06692250
## 0.05021334 0.06369828
## 0.04783112 0.07028677
## 0.05040069 0.06786057
## 0.05210070 0.06516469
## 0.04925187 0.06516469
## 0.05218887 0.06378484
## 0.05342951 0.06376682
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 100,
## interaction.depth = 3 and shrinkage = 0.1.
plot(varImp(gbm_fit)) # returns variable importance
# Predict on the test set using the gbm model
gbm_class <- predict(gbm_fit, newdata = rm_test) # returns classifications
gbm_pred <- predict(gbm_fit, newdata = rm_test, type = "prob") # returns probabilities
# Generate a confusion matrix for the gbm prediction results
gbm_cm <- confusionMatrix(gbm_class, rm_test$PREGNANT)
gbm_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 835 13
## No 105 47
##
## Accuracy : 0.882
## 95% CI : (0.8604, 0.9013)
## No Information Rate : 0.94
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.391
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8883
## Specificity : 0.7833
## Pos Pred Value : 0.9847
## Neg Pred Value : 0.3092
## Prevalence : 0.9400
## Detection Rate : 0.8350
## Detection Prevalence : 0.8480
## Balanced Accuracy : 0.8358
##
## 'Positive' Class : Yes
##
# Train a bagging model using caret and the 'bag' package
# Needed to include a bagControl() argument in the train function, which allows choice of tree types
timeStart <- proc.time()
bag_fit <- train(PREGNANT ~., data = rm_train, method = "bag", trControl = ctrl, metric = "ROC", verbose = FALSE,
bagControl= bagControl(fit = ctreeBag$fit, pred = ctreeBag$pred, aggregate = ctreeBag$aggregate ),
tuneGrid = data.frame(vars = seq(1, 15, by = 2)))
proc.time() - timeStart
## user system elapsed
## 658.64 2.59 671.98
# processing times measured as 581 secs (user) and 1.66 secs (system) respectively
plot(bag_fit) # plots bag_fit
bag_fit # returns bag_fit
## Bagged Model
##
## 1000 samples
## 19 predictor
## 2 classes: 'Yes', 'No'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
##
## Summary of sample sizes: 900, 900, 900, 900, 900, 900, ...
##
## Resampling results across tuning parameters:
##
## vars ROC Sens Spec ROC SD Sens SD
## 1 0.6315533 0.8380000 0.3760000 0.21244297 0.22732932
## 3 0.8099867 0.8486667 0.5486667 0.11276167 0.13490056
## 5 0.8508800 0.8500000 0.6206667 0.03930400 0.15229282
## 7 0.8652133 0.8766667 0.6633333 0.03007760 0.07720699
## 9 0.8789867 0.9000000 0.6740000 0.03500573 0.04954970
## 11 0.8747733 0.8760000 0.7040000 0.03370725 0.08211010
## 13 0.8767667 0.8806667 0.7160000 0.03239221 0.05570447
## 15 0.8822867 0.8846667 0.7100000 0.03643544 0.05243935
## Spec SD
## 0.31668106
## 0.18093118
## 0.15926411
## 0.08355602
## 0.08822385
## 0.08556506
## 0.07582193
## 0.06982737
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was vars = 15.
plot(varImp(bag_fit)) # returns variable importance
# Predict on the test set using the bagging model
bag_class <- predict(bag_fit, newdata = rm_test)
bag_pred <- predict(bag_fit, newdata = rm_test, type = "prob")
# Generate a confusion matrix for the bagging prediction results
bag_cm <- confusionMatrix(bag_class, rm_test$PREGNANT)
bag_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 800 12
## No 140 48
##
## Accuracy : 0.848
## 95% CI : (0.8242, 0.8697)
## No Information Rate : 0.94
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3258
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8511
## Specificity : 0.8000
## Pos Pred Value : 0.9852
## Neg Pred Value : 0.2553
## Prevalence : 0.9400
## Detection Rate : 0.8000
## Detection Prevalence : 0.8120
## Balanced Accuracy : 0.8255
##
## 'Positive' Class : Yes
##
# # Generate an ROC curve for the rf method
predRF <- prediction(rf_pred[,1], rm_test$PREGNANT)
perfRF <- performance(predRF, "tpr", "fpr")
plot(perfRF, main = "ROC curves for randomForest, gbm and bag models")
# Generate an ROC curve for the gbm method
pred_gbm <- prediction(gbm_pred[,1], rm_test$PREGNANT)
perf_gbm <- performance(pred_gbm, "tpr", "fpr")
plot(perf_gbm, add = TRUE, col = "blue")
#Generate an ROC curve for the 'bag' method
pred_bag <- prediction(bag_pred[,1], rm_test$PREGNANT)
perf_bag <- performance(pred_bag, "tpr", "fpr")
plot(perf_bag, add = TRUE, col = "red")
# Add legends to the plot
legend("right", legend = c("randomForest", "gbm", "bag"), bty = "n", cex = 1, lty = 1,
col = c("black", "blue", "red"))