What is voting and how do models vote?
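In a voting ensemble, several base classifiers each predict the same outcome and their predictions are then combined. Under hard voting, each model casts one vote for a class label and the majority wins; under soft voting, the models' predicted class probabilities are averaged and the class with the highest average probability is chosen. Below is a minimal sketch with made-up predictions (the numbers are purely hypothetical, not drawn from the HMDA data used in the rest of this exercise):
# Toy example: class labels from three hypothetical classifiers for four observations
pred_labels <- data.frame(
  m1 = c("no", "yes", "no", "yes"),
  m2 = c("no", "yes", "yes", "no"),
  m3 = c("no", "no", "yes", "yes")
)
# Hard voting: the most frequent label in each row wins
apply(pred_labels, 1, function(x) names(which.max(table(x))))
# Soft voting: average each model's predicted P(yes), then threshold at 0.5
prob_yes <- cbind(m1 = c(0.10, 0.80, 0.40, 0.70),
                  m2 = c(0.20, 0.90, 0.60, 0.30),
                  m3 = c(0.30, 0.40, 0.70, 0.60))
ifelse(rowMeans(prob_yes) >= 0.5, "yes", "no")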
library(AER) # for HMDA data
## Warning: package 'AER' was built under R version 4.4.3
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(e1071) # for SVM and naiveBayes
## Warning: package 'e1071' was built under R version 4.4.3
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(xgboost)
library(mlbench) # benchmark data sets (not used below; the HMDA data comes from AER)
# Load the HMDA data
data(HMDA)
str(HMDA)
## 'data.frame': 2380 obs. of 14 variables:
## $ deny : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ pirat : num 0.221 0.265 0.372 0.32 0.36 ...
## $ hirat : num 0.221 0.265 0.248 0.25 0.35 ...
## $ lvrat : num 0.8 0.922 0.92 0.86 0.6 ...
## $ chist : Factor w/ 6 levels "1","2","3","4",..: 5 2 1 1 1 1 1 2 2 2 ...
## $ mhist : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 1 1 2 2 2 1 ...
## $ phist : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ unemp : num 3.9 3.2 3.2 4.3 3.2 ...
## $ selfemp : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ insurance: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ condomin : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ afam : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ single : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 2 1 1 2 ...
## $ hschool : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
set.seed(123) # for reproducibility
# Do a 75-25 train-test split
split <- createDataPartition(HMDA$deny, p = 0.75, list=FALSE)
HMDA_train <- HMDA[split,]
HMDA_test <- HMDA[-split,]
# Model fitting, prediction, and accuracy
# Probit model
probit.vote <- glm(deny ~ .,
family = binomial(link = "probit"),
data = HMDA_train)
probit.pred <- predict(probit.vote, HMDA_test, type = "response") # Save for soft voting estimation
probit.pred.bin <- as.factor(ifelse(probit.pred >= 0.5, "yes", "no"))
confmat_probit <- table(Predicted = probit.pred.bin, Actual = HMDA_test$deny) # Confusion table
confmat_probit
## Actual
## Predicted no yes
## no 512 54
## yes 11 17
probit_accuracy <- (confmat_probit[1, 1] + confmat_probit[2, 2]) / sum(confmat_probit) * 100
probit_accuracy
## [1] 89.05724
# logit model
logit.vote <- glm(deny ~ .,
family = binomial(link = "logit"),
data = HMDA_train)
logit.pred <- predict(logit.vote, HMDA_test, type = "response") # Save for soft voting estimation
logit.pred.bin <-as.factor(ifelse(logit.pred >= 0.5, "yes", "no"))
confmat_logit <- table(Predicted = logit.pred.bin, Actual = HMDA_test$deny) # Confusion matrix
confmat_logit
## Actual
## Predicted no yes
## no 510 51
## yes 13 20
logit_accuracy <- (confmat_logit[1, 1] + confmat_logit[2, 2]) / sum(confmat_logit) * 100
logit_accuracy
## [1] 89.22559
# SVM
svm.vote <- svm(deny ~ .,
scale = TRUE,
kernel = "polynomial",
degree = 3, # if kernel is of type = "polynomial"
cost = 1, # The cost parameter (C)
probability = TRUE, # Whether to output probability predictions
na.action = na.omit,
data = HMDA_train)
svm.pred <- predict(svm.vote, HMDA_test)
confmat_svm <- table(Predicted = svm.pred, Actual = HMDA_test$deny) # Confusion matrix
confmat_svm
## Actual
## Predicted no yes
## no 522 71
## yes 1 0
svm_accuracy <- (confmat_svm[1, 1] + confmat_svm[2, 2]) / sum(confmat_svm) * 100
svm_accuracy
## [1] 87.87879
# Naive Bayes
nb.vote <- naiveBayes(deny ~ .,
data = HMDA_train)
nb.pred <- predict(nb.vote, HMDA_test)
confmat_nb <- table(Predicted = nb.pred, Actual = HMDA_test$deny) # Confusion matrix
confmat_nb
## Actual
## Predicted no yes
## no 501 49
## yes 22 22
nb_accuracy <- (confmat_nb[1, 1] + confmat_nb[2, 2]) / sum(confmat_nb) * 100
nb_accuracy
## [1] 88.04714
# Gradient boosting machine (GBM)
# Fine-tuning hyperparameters
hyper_grid <- expand.grid(
shrinkage = c(.01, .05),
interaction.depth = c(1, 3),
n.minobsinnode = c(5, 10, 25),
bag.fraction = c(.65, .8),
optimal_trees = 0,
min_RMSE = 0
)
# Convert deny to numeric (0, 1)
HMDA_train$deny <- ifelse(HMDA_train$deny == "yes", 1, 0)
for(i in 1:nrow(hyper_grid)) {
# train model
gbm.tune <- gbm(
deny ~ .,
distribution = "bernoulli",
data = HMDA_train,
n.trees = 500,
interaction.depth = hyper_grid$interaction.depth[i],
shrinkage = hyper_grid$shrinkage[i],
n.minobsinnode = hyper_grid$n.minobsinnode[i],
bag.fraction = hyper_grid$bag.fraction[i],
train.fraction = .75,
n.cores = NULL,
verbose = FALSE
)
# Locate the iteration that minimizes the validation error and record it in the grid
hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error) # Index of the best iteration
hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error)) # Square root of the minimum validation error (stored in the min_RMSE column)
}
# Get the best-performing hyperparameters
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
optimal_hyperparameters <- hyper_grid %>%
dplyr::arrange(min_RMSE) %>%
head(1)
# Set the optimal hyperparameter values and re-train the model
gbm.vote <- gbm(
deny ~ .,
distribution = "bernoulli",
data = HMDA_train,
n.trees = optimal_hyperparameters$optimal_trees,
interaction.depth = optimal_hyperparameters$interaction.depth,
shrinkage = optimal_hyperparameters$shrinkage,
n.minobsinnode = optimal_hyperparameters$n.minobsinnode,
bag.fraction = optimal_hyperparameters$bag.fraction,
train.fraction = 0.63,
n.cores = NULL, # Use all cores by default
verbose = FALSE
)
# Temporarily convert the "deny" variable in HMDA_test
# Convert deny to numeric (0, 1)
deny <- HMDA_test$deny
HMDA_test$deny <- ifelse(HMDA_test$deny == "yes", 1, 0)
gbm.pred <- predict(gbm.vote, HMDA_test, type = "response")
## Using 193 trees...
gbm.pred.bin <-as.factor(ifelse(gbm.pred >= 0.5, "yes", "no"))
confmat_gbm <- table(Predicted = gbm.pred.bin, Actual = HMDA_test$deny) # Confusion matrix
confmat_gbm
## Actual
## Predicted 0 1
## no 509 55
## yes 14 16
gbm_accuracy <- (confmat_gbm[1, 1] + confmat_gbm[2, 2]) / sum(confmat_gbm) * 100
gbm_accuracy
## [1] 88.38384
HMDA_test$deny <- deny # Add it back, we're gonna need it soon
# Combine predictions into a data frame
preds <- data.frame(probit.pred.bin, logit.pred.bin, svm.pred, nb.pred, gbm.pred.bin)
colnames(preds) <- c("Probit", "Logit", "SVM", "Naive Bayes", "GBM")
# Take a look at this dataframe
head(preds)
## Probit Logit SVM Naive Bayes GBM
## 3 no no no no no
## 5 no no no no no
## 7 no no no no no
## 9 yes yes no no yes
## 17 no no no no no
## 24 no no no no no
## Hard Voting (majority vote)
hard_voting <- apply(preds, 1, function(x) {
names(sort(table(x), decreasing = TRUE))[1] # Sort category with the highest freq. first
})
# Accuracy
hard_voting_accuracy <- sum(hard_voting == HMDA_test$deny) / length(HMDA_test$deny) * 100
cat("Hard Voting Accuracy:", hard_voting_accuracy, "\n")
## Hard Voting Accuracy: 89.39394
# Tabulate hard voting accuracy
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:randomForest':
##
## combine
library(grid)
tab_hard <- data.frame(Accuracy = c(probit_accuracy, logit_accuracy, svm_accuracy, nb_accuracy, gbm_accuracy, hard_voting_accuracy))
tab_hard$Accuracy <- format(round(tab_hard$Accuracy, 3), nsmall = 3) # Limit to 3 digits after the decimal
rownames(tab_hard) <- c("Probit", "Logit", "SVM", "Naive Bayes", "GBM", "Hard-Voting")
theme_1 <- ttheme_default() # Plotting scheme
grid.arrange(tableGrob(tab_hard, theme = theme_1))
# The accuracy of the hard voting model is indeed the best
## Soft Voting (Probabilities)
# Soft voting averages class probabilities, so a few adjustments are needed: each model's predictions must be expressed as two probability columns ("no", "yes") of length nrow(HMDA_test)
probit.pred_0 <- 1 - probit.pred # Also need to encode the probabilities for those 0 outcomes
probit.pred.prob <- as.matrix(data.frame(no = probit.pred_0, yes = probit.pred))
logit.pred_0 <- 1 - logit.pred
logit.pred.prob <- as.matrix(data.frame(no = logit.pred_0, yes = logit.pred))
svm.pred.prob <- attr(predict(svm.vote, HMDA_test, probability = TRUE), "probabilities")
nb.pred.prob <- predict(nb.vote, HMDA_test, type = "raw") # Set type = "raw" to get raw score for each class
gbm.pred_0 <- 1 - gbm.pred
gbm.pred.prob <- as.matrix(data.frame(no = gbm.pred_0, yes = gbm.pred))
# Average the probabilities (make sure each model's matrix has its columns in the same "no"/"yes" order before averaging)
average_prob <- (probit.pred.prob + logit.pred.prob + svm.pred.prob + nb.pred.prob + gbm.pred.prob) / 5
# Convert to class prediction
soft_voting <- colnames(average_prob)[max.col(average_prob)] # Select the class/outcome with the highest probability using max.col()
soft_voting_accuracy <- sum(soft_voting == HMDA_test$deny) / length(HMDA_test$deny) * 100
cat("Soft Voting Accuracy:", soft_voting_accuracy, "\n")
## Soft Voting Accuracy: 89.39394
# Tabulate soft voting accuracy
tab_soft <- data.frame(Accuracy = c(probit_accuracy, logit_accuracy, svm_accuracy, nb_accuracy, gbm_accuracy, soft_voting_accuracy))
tab_soft$Accuracy <- format(round(tab_soft$Accuracy, 3), nsmall = 3)
rownames(tab_soft) <- c("Probit", "Logit", "SVM", "Naive Bayes", "GBM", "Soft-Voting")
grid.arrange(tableGrob(tab_soft, theme = theme_1))
# Alas, the accuracy of the soft voting model only ranks third.
There are several existing R ensemble functions that can implement voting methods; those in \(\textsf{h2o}\) and \(\textsf{caretEnsemble}\) are two examples. However, \(\textsf{caretList}\), \(\textsf{caretEnsemble}\), and \(\textsf{caretStack}\) tend to re-scale the data, which requires additional re-scaling and conversion along the way, so I, for one, do not recommend that package. For this week's exercise, we will stick to functions from the \(\textsf{h2o}\) package, a scalable open-source machine learning platform.
It should be noted that, at the moment, the functions supported by \(\textsf{h2o}\) are somewhat limited. For example, the \(\textsf{h2o.glm()}\) function does not support the "probit" link, and \(\textsf{h2o.psvm()}\) does not allow direct specification of the degree for polynomial kernels. The cost parameter (C) is also not directly tunable, though it can be controlled via lambda (i.e., the regularization strength). For pedagogical purposes, the ensemble examples below include only three base models: logit, naive Bayes, and GBM.
We begin with hard voting, followed by soft voting. We then assess their performance, in terms of AUC, against the three base models, which the ensemble models should outperform.
# Re-load the HMDA data (just in case we've altered it somehow in our earlier data wrangling ...)
data(HMDA) # From the AER package
str(HMDA)
## 'data.frame': 2380 obs. of 14 variables:
## $ deny : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ pirat : num 0.221 0.265 0.372 0.32 0.36 ...
## $ hirat : num 0.221 0.265 0.248 0.25 0.35 ...
## $ lvrat : num 0.8 0.922 0.92 0.86 0.6 ...
## $ chist : Factor w/ 6 levels "1","2","3","4",..: 5 2 1 1 1 1 1 2 2 2 ...
## $ mhist : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 1 1 2 2 2 1 ...
## $ phist : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ unemp : num 3.9 3.2 3.2 4.3 3.2 ...
## $ selfemp : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ insurance: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ condomin : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ afam : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ single : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 2 1 1 2 ...
## $ hschool : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Check if the outcome variable "deny" is of factor class (H2O requires categorical targets as factors)
class(HMDA$deny)
## [1] "factor"
set.seed(123) # For reproducibility
# Do a 75-25 train-test split
split <- createDataPartition(HMDA$deny, p = 0.75, list=FALSE)
HMDA_train <- HMDA[split,]
HMDA_test <- HMDA[-split,]
# Initialize H2O
library(h2o)
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a804f7b77a3/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a803b71f2a/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 seconds 856 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 9 days
## H2O cluster name: H2O_started_from_R_hktse_yom614
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 9 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert data to H2O format
HMDA_train_h2o <- as.h2o(HMDA_train)
HMDA_test_h2o <- as.h2o(HMDA_test)
# Define predictor variables and target variable
Y_HMDA <- "deny"
X_HMDA <- names(HMDA)[-1] # All columns except the outcome variable assigned to X_HMDA
# Specify base models (i.e., your "voters")
base_logit.vote <- h2o.glm(
x = X_HMDA, y = Y_HMDA, training_frame = HMDA_train_h2o,
family = "binomial", # binary outcomes
link = "logit",
nfolds = 5, fold_assignment = "Modulo", # "Modulo" evenly splits the dataset into the designated folds and does not depend on the seed
keep_cross_validation_predictions = TRUE,
stopping_metric = "AUC", # Performance metric, other metrics are available
auc_type = "AUTO",
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): Stopping metric is ignored for _stopping_rounds=0..
base_nb.vote <- h2o.naiveBayes(
x = X_HMDA, y = Y_HMDA, training_frame = HMDA_train_h2o,
laplace = 0, # No smoothing
nfolds = 5, fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
seed = 123
)
# For the GBM, we will set the hyperparameters to the optimized values we tuned earlier
base_gbm.vote <- h2o.gbm(
x = X_HMDA, y = Y_HMDA, training_frame = HMDA_train_h2o,
distribution = "bernoulli", # If your response variable is of 2-class categorical input
ntrees = optimal_hyperparameters$optimal_trees,
learn_rate = optimal_hyperparameters$shrinkage, # It's just another name for learning rate
max_depth = optimal_hyperparameters$interaction.depth,
min_rows = optimal_hyperparameters$n.minobsinnode,
sample_rate = optimal_hyperparameters$bag.fraction,
nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
stopping_rounds = 3,
stopping_metric = "AUC",
auc_type = "AUTO",
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
## Hard-Voting
# Make class predictions for each model
base_logit.pred <- as.vector(h2o.predict(base_logit.vote, HMDA_test_h2o)$predict)
base_nb.pred <- as.vector(h2o.predict(base_nb.vote, HMDA_test_h2o)$predict)
base_gbm.pred <- as.vector(h2o.predict(base_gbm.vote, HMDA_test_h2o)$predict)
# Combine predictions into a data frame
predictions <- data.frame(base_logit.pred, base_nb.pred, base_gbm.pred)
# Define function for majority voting (hard voting)
majority_vote <- function(preds) {
apply(preds, 1, function(row) {
names(sort(table(row), decreasing = TRUE))[1] # Get the most frequent class
})
}
# Get final ensemble prediction
hard_voting_pred <- majority_vote(predictions)
# Convert to factor with correct levels
hard_voting_pred <- factor(hard_voting_pred, levels = levels(HMDA$deny))
# Evaluate hard-voting performance
# (hard voting returns class labels only, so we compute accuracy here rather than a true AUC)
conf_mat <- table(hard_voting_pred, HMDA_test$deny)
hard_voting_auc <- sum(diag(conf_mat)) / sum(conf_mat) # accuracy, saved for the comparison table below
print(conf_mat)
##
## hard_voting_pred no yes
## no 495 39
## yes 28 32
print(paste("Hard-Voting Ensemble Accuracy:", round(hard_voting_auc, 3)))
## [1] "Hard-Voting Ensemble Accuracy: 0.887"
## Soft-Voting
# Combine models into a list (subset each base model's id from the earlier outputs with @model_id)
base_voter_models <- c(base_logit.vote@model_id, base_nb.vote@model_id, base_gbm.vote@model_id)
# Create a stacked ensemble as a stand-in for soft voting (the metalearner learns weights for the base models' predicted probabilities rather than simply averaging them)
soft_voting_ensemble <- h2o.stackedEnsemble(
x = X_HMDA, y = Y_HMDA,
training_frame = HMDA_train_h2o,
base_models = base_voter_models
)
# Predict on the testing data
soft_voting_pred <- h2o.predict(soft_voting_ensemble, newdata = HMDA_test_h2o)
# View predictions
head(soft_voting_pred)
## predict no yes
## 1 no 0.9452028 0.05479724
## 2 no 0.9512366 0.04876341
## 3 no 0.9464035 0.05359649
## 4 yes 0.1457385 0.85426154
## 5 no 0.9533266 0.04667338
## 6 no 0.9436444 0.05635562
# Evaluate soft-voting performance
soft_voting_auc <- h2o.auc(h2o.performance(soft_voting_ensemble, newdata = HMDA_test_h2o))
print(soft_voting_auc) # AUC Score, a bit low, not good :/
## [1] 0.8047289
# Get results from base learners
# Define the AUC function
HMDA_get_auc <- function(model) {
results <- h2o.performance(model, newdata = HMDA_test_h2o)
h2o.auc(results)
}
# Get AUC
base_voter_perf <- list(base_logit.vote, base_nb.vote, base_gbm.vote) %>%
purrr::map_dbl(HMDA_get_auc) # Map the pre-defined AUC function to each of the three base models
voting_auc <- c(base_voter_perf, hard_voting_auc, soft_voting_auc) # Note: the hard-voting entry is accuracy (class labels only); the others are AUC
voting_auc <- format(round(voting_auc, 3), nsmall = 3)
# Display model AUC in a table
tab_voting_auc <- data.frame(AUC = voting_auc)
rownames(tab_voting_auc) <- c("Logit", "Naive Bayes", "GBM", "Hard-Voting", "Soft-Voting")
theme_1 <- ttheme_default()
grid.arrange(tableGrob(tab_voting_auc, theme = theme_1)) # The higher the better. Hard-Voting rocks!
# The hard-voting model scores highest among all models compared, followed by naive Bayes and the soft-voting counterpart (keep in mind the hard-voting figure is accuracy rather than AUC).
h2o.shutdown(prompt = FALSE) # Be sure to shut down the cluster once you finish the current computing session
Stacking leverages the information learned from multiple base learners to improve prediction accuracy. We will (also) demonstrate the stacking procedure with functions from the \(\textsf{h2o}\) package.
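Before handing the work over to \(\textsf{h2o}\), here is a minimal base-R sketch of the stacking mechanism itself on simulated toy data (all object names below are hypothetical): each base learner is fit on the training folds, its out-of-fold predictions become the features of a level-two (meta) model, and the meta learner then learns how to weight the base learners.
# Minimal stacking sketch on simulated data (illustration only, not the h2o workflow used below)
set.seed(1)
n <- 200
toy <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
toy$y <- 1 + 2 * toy$x1 - toy$x2 + rnorm(n)
k <- 5
folds <- sample(rep(1:k, length.out = n)) # fold assignment
z1 <- z2 <- numeric(n) # out-of-fold (level-one) predictions
for (f in 1:k) {
  m1 <- lm(y ~ x1, data = toy[folds != f, ]) # base learner 1
  m2 <- lm(y ~ x2, data = toy[folds != f, ]) # base learner 2
  z1[folds == f] <- predict(m1, toy[folds == f, ])
  z2[folds == f] <- predict(m2, toy[folds == f, ])
}
meta <- lm(y ~ z1 + z2, data = data.frame(y = toy$y, z1 = z1, z2 = z2)) # level-two (meta) learner
coef(meta) # weights the meta learner assigns to each base learner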
library(tidyverse) # for rearranging data
library(ISLR2) # example data sets (not used below; we load the housing data from Dropbox instead)
library(rsample) # initial_split(), an alternative to caret::createDataPartition()
library(recipes) # for the step_other() function
housing <- read.csv(url("https://www.dropbox.com/scl/fi/jib74gutay1cj00tfg6vh/housing.csv?rlkey=d871strkjvdgrzlnkslsac5c7&st=qcywbsbx&dl=1"))
names(housing) <- c("income", "age", "nrooms", "nbedrooms", "pop", "price", "address")
housing <- housing[, -c(7)] # remove address column
set.seed(123) # for reproducibility
# Do a 75-25 train-test split
split <- createDataPartition(housing$price, p = 0.75, list=FALSE)
housing_train <- housing[split,]
housing_test <- housing[-split,]
# Initialize the h2o
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\Rtmpg1lrMi\file38a036cc16e1/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\Rtmpg1lrMi\file38a0163b6d10/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 6 seconds 332 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 8 days
## H2O cluster name: H2O_started_from_R_hktse_dwm061
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 8 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Collapse infrequent categorical levels
collapse_labels <- recipe(price ~ ., data = housing_train) %>%
step_other(all_nominal(), threshold = 0.005) # creates a specification of a recipe step that will pool infrequently occurring values into an "other" category.
# Need to convert the train and test dataframes to h2o format. Might take a while, be patient!
train_h2o <- prep(collapse_labels, training = housing_train, retain = TRUE) %>%
juice() %>%
as.h2o()
test_h2o <- prep(collapse_labels, training = housing_train) %>%
bake(new_data = housing_test) %>%
as.h2o()
# You can also use h2o.splitFrame(data = , ratios = ) to create train-test split and assign the first element of the output [[1]] to training data, the second element [[2]] to testing data.
# Check out data class
class(train_h2o)
## [1] "H2OFrame"
class(test_h2o)
## [1] "H2OFrame"
# Split predictors/features (x) and the outcome variable (y) names. Only "names" are needed
Y_house <- "price"
X_house <- names(train_h2o[, -c(6)])
### Define base learners ###
## Must apply the same nfolds to all base learners!
# Train & cross-validate an elastic net model via h2o.glm()
base_glm <- h2o.glm(
x = X_house, y = Y_house, training_frame = train_h2o,
family = "gaussian", # continuous outcomes
alpha = 0.5, # elastic net
lambda = 0.02, # penalty parameter
remove_collinear_columns = FALSE, # When lambda != 0, set this to FALSE
nfolds = 5, fold_assignment = "Modulo", # "Modulo" evenly splits the dataset into the designated folds and does not depend on the seed
keep_cross_validation_predictions = TRUE,
seed = 123
)
# Train & cross-validate a random forest model via h2o.randomForest()
base_rf <- h2o.randomForest(
x = X_house, y = Y_house, training_frame = train_h2o,
ntrees = 50, mtries = 5, # The number of trees and the number of random variables used in each tree
max_depth = 50, min_rows = 1, # Maximum tree depth (complexity of the trees) and minimum number of observations in a leaf in order to split
sample_rate = 0.5, nfolds = 5, # Row sampling rate (without replacement), between 0 and 1. In GBM and XGBoost, this value defaults to 1.
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
stopping_rounds = 50, stopping_metric = "RMSE", # Stop training early if the stopping_metric fails to improve over stopping_rounds consecutive scoring events
stopping_tolerance = 0, # The tolerance value by which a model must improve before training stops
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train & cross-validate a GBM via h2o.gbm()
base_gbm <- h2o.gbm(
x = X_house, y = Y_house, training_frame = train_h2o,
ntrees = 200, learn_rate = 0.05, # The rate at which GBM learns when building a model. Lower learning rates are preferable, but more trees will be required
max_depth = 10, min_rows = 10,
sample_rate = 0.8, nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
stopping_rounds = 50, stopping_metric = "RMSE",
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train a stacked ensemble using the above base models
ensemble_tree <- h2o.stackedEnsemble(
x = X_house, y = Y_house,
training_frame = train_h2o,
model_id = "ensemble_tree",
base_models = list(base_glm, base_rf, base_gbm), # Enter your base models here
metalearner_algorithm = "AUTO" # The algorithm learns the optimal combination of base learners. Here we use the default setting: GLM with non negative weights w/ standardization turned off
)
# Get results from base learners
# We first define the RMSE function
get_rmse <- function(model) {
results <- h2o.performance(model, newdata = test_h2o)
results@metrics$RMSE
}
# Get RMSE
base_rmse <- list(base_glm, base_rf, base_gbm) %>%
purrr::map_dbl(get_rmse) # Map the function you defined above to the three base models
# Get RMSE for the Stacked results
ensemble_rmse <- h2o.performance(ensemble_tree, newdata = test_h2o)@metrics$RMSE
# Combine the RMSE of the base learners with that of the meta learner
rmse <- c(base_rmse, ensemble_rmse)
rmse <- format(round(rmse, 3), nsmall = 3)
# Display model RMSE in a table
library(gridExtra)
library(grid)
tab_rmse <- data.frame(RMSE = rmse)
rownames(tab_rmse) <- c("GLM", "RandomForest", "GBM", "Ensemble")
theme_1 <- ttheme_default()
grid.arrange(tableGrob(tab_rmse, theme = theme_1))
# The ensemble model (a.k.a. the super learner) does have a relatively low RMSE; however, the improvement is marginal, and the base GLM's RMSE is still about 0.03% lower than the ensemble's.
(ensemble_rmse - base_rmse[1]) / ensemble_rmse
## [1] 0.0003274046
# Perhaps the correlation across the base learners, especially between the two tree-based learners, is to blame. Let's find out.
data.frame(
GLM_pred = as.vector(h2o.getFrame(base_glm@model$cross_validation_holdout_predictions_frame_id$name)),
RF_pred = as.vector(h2o.getFrame(base_rf@model$cross_validation_holdout_predictions_frame_id$name)),
GBM_pred = as.vector(h2o.getFrame(base_gbm@model$cross_validation_holdout_predictions_frame_id$name))
) %>% cor() # Get correlation between model predictions
## GLM_pred RF_pred GBM_pred
## GLM_pred 1.0000000 0.9794124 0.9883014
## RF_pred 0.9794124 1.0000000 0.9902745
## GBM_pred 0.9883014 0.9902745 1.0000000
# It turns out they are indeed highly correlated, which should be avoided.
# Let's take a further look at the ensemble model estimates. Since the meta learner (the ensemble model) is the optimal combination of its base learners, what about the "weight" of these level-one models? Does the ensemble model assign higher (or lower) weights to any of the base models? Unfortunately, h2o's Stacked Ensemble model does not directly supply such information in the traditional sense (like linear regression coefficients). However, you can estimate their relative importance using the metalearner coefficients if you use a linear metalearner. Let's do that.
# Extract the metalearner model, lots of subsetting to do ...
metalearner <- h2o.getModel(ensemble_tree@model$metalearner$name)
# Get the coefficients (i.e., weights of base models)
metalearner_coeffs <- h2o.coef(metalearner)
print(metalearner_coeffs)
## Intercept GLM_model_R_1745909452877_1
## -1.723607e+04 9.775170e-01
## DRF_model_R_1745909452877_2 GBM_model_R_1745909452877_3
## 2.293963e-02 1.355666e-02
# The meta learner gives a much higher weight to the GLM. In fact, if you tweak the code above, you will see that the performance of the meta learner is also almost entirely driven by the GLM base model.
# We can also get a glimpse of its relative importance using:
h2o.varimp(metalearner)
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 GLM_model_R_1745909452877_1 324096.312500 1.000000 0.965538
## 2 DRF_model_R_1745909452877_2 7115.145020 0.021954 0.021197
## 3 GBM_model_R_1745909452877_3 4452.344727 0.013738 0.013264
h2o.shutdown(prompt = FALSE) # Shut down h2o
Let's try using the ensemble approach to model data with a dichotomous outcome.
# Load cancer data
cancer <- read.csv(url("https://www.dropbox.com/scl/fi/ot23dm6h3immb9sv7l248/Cancer_Data.csv?rlkey=vv62t6hi34795v11epc72tpvo&st=7utqlcb3&dl=1"))
# remove id column
cancer <- cancer[, -c(1)]
# Examine the data structure and variable classes
names(cancer)
## [1] "diagnosis" "radius_mean"
## [3] "texture_mean" "perimeter_mean"
## [5] "area_mean" "smoothness_mean"
## [7] "compactness_mean" "concavity_mean"
## [9] "concave.points_mean" "symmetry_mean"
## [11] "fractal_dimension_mean" "radius_se"
## [13] "texture_se" "perimeter_se"
## [15] "area_se" "smoothness_se"
## [17] "compactness_se" "concavity_se"
## [19] "concave.points_se" "symmetry_se"
## [21] "fractal_dimension_se" "radius_worst"
## [23] "texture_worst" "perimeter_worst"
## [25] "area_worst" "smoothness_worst"
## [27] "compactness_worst" "concavity_worst"
## [29] "concave.points_worst" "symmetry_worst"
## [31] "fractal_dimension_worst"
str(cancer)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
head(cancer)
## diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 M 17.99 10.38 122.80 1001.0 0.11840
## 2 M 20.57 17.77 132.90 1326.0 0.08474
## 3 M 19.69 21.25 130.00 1203.0 0.10960
## 4 M 11.42 20.38 77.58 386.1 0.14250
## 5 M 20.29 14.34 135.10 1297.0 0.10030
## 6 M 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1 0.27760 0.3001 0.14710 0.2419
## 2 0.07864 0.0869 0.07017 0.1812
## 3 0.15990 0.1974 0.12790 0.2069
## 4 0.28390 0.2414 0.10520 0.2597
## 5 0.13280 0.1980 0.10430 0.1809
## 6 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.07871 1.0950 0.9053 8.589 153.40
## 2 0.05667 0.5435 0.7339 3.398 74.08
## 3 0.05999 0.7456 0.7869 4.585 94.03
## 4 0.09744 0.4956 1.1560 3.445 27.23
## 5 0.05883 0.7572 0.7813 5.438 94.44
## 6 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1 0.006399 0.04904 0.05373 0.01587 0.03003
## 2 0.005225 0.01308 0.01860 0.01340 0.01389
## 3 0.006150 0.04006 0.03832 0.02058 0.02250
## 4 0.009110 0.07458 0.05661 0.01867 0.05963
## 5 0.011490 0.02461 0.05688 0.01885 0.01756
## 6 0.007510 0.03345 0.03672 0.01137 0.02165
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1 0.006193 25.38 17.33 184.60 2019.0
## 2 0.003532 24.99 23.41 158.80 1956.0
## 3 0.004571 23.57 25.53 152.50 1709.0
## 4 0.009208 14.91 26.50 98.87 567.7
## 5 0.005115 22.54 16.67 152.20 1575.0
## 6 0.005082 15.47 23.75 103.40 741.6
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## 1 0.1622 0.6656 0.7119 0.2654
## 2 0.1238 0.1866 0.2416 0.1860
## 3 0.1444 0.4245 0.4504 0.2430
## 4 0.2098 0.8663 0.6869 0.2575
## 5 0.1374 0.2050 0.4000 0.1625
## 6 0.1791 0.5249 0.5355 0.1741
## symmetry_worst fractal_dimension_worst
## 1 0.4601 0.11890
## 2 0.2750 0.08902
## 3 0.3613 0.08758
## 4 0.6638 0.17300
## 5 0.2364 0.07678
## 6 0.3985 0.12440
# Check out the freq. dist of the outcome variable "diagnosis"
table(cancer$diagnosis)
##
## B M
## 357 212
# Do a 75-25 train-test split
set.seed(123) # for reproducibility
split_ind <- createDataPartition(cancer$diagnosis, p = 0.75, list=FALSE)
cancer_train <- cancer[split_ind,]
cancer_test <- cancer[-split_ind,]
# Initialize the h2o
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a80404039f9/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a80538452ad/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 seconds 960 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 9 days
## H2O cluster name: H2O_started_from_R_hktse_uyg267
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 9 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Collapse infrequent categorical levels
cancer_collapse_labels <- recipe(diagnosis ~ ., data = cancer_train) %>%
step_other(all_nominal(), threshold = 0.005)
# Convert the outcome variable to a factor (do this on the train/test frames created above; converting the full "cancer" data at this point would not carry over to them)
cancer_train$diagnosis <- as.factor(cancer_train$diagnosis)
cancer_test$diagnosis <- as.factor(cancer_test$diagnosis)
# Need to convert the train and test dataframes to h2o format. Might take a while, be patient!
cancer_train_h2o <- prep(cancer_collapse_labels, training = cancer_train, retain = TRUE) %>%
juice() %>%
as.h2o()
cancer_test_h2o <- prep(cancer_collapse_labels, training = cancer_train) %>%
bake(new_data = cancer_test) %>%
as.h2o()
# Check out data class
class(cancer_train_h2o)
## [1] "H2OFrame"
class(cancer_test_h2o)
## [1] "H2OFrame"
# Split predictors/features (x) and the outcome variable (y) names. Only "names" are needed
Y_cancer <- "diagnosis"
X_cancer <- names(cancer_train_h2o[, -c(31)]) # "diagnosis" is in the 31st column
### Define the base learners ###
## Again, you must apply the same nfolds to all base learners
# Train & cross-validate an elastic net model via h2o.glm
cancer_base_glm <- h2o.glm(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o,
family = "binomial", # logit model on binary outcomes
alpha = 0.75, # elastic net
lambda = 0.01, # penalty parameter
remove_collinear_columns = FALSE, # when lambda != 0, set this to FALSE
nfolds = 5, fold_assignment = "Modulo", # "Modulo" evenly splits the dataset into the designated folds and does not depend on the seed
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
seed = 123
)
# Train & cross-validate a random forest model via h2o.randomForest()
# If your outcome variable is binary and coded as a factor, H2O's random forest will automatically recognize this as a classification problem.
cancer_base_rf <- h2o.randomForest(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o,
ntrees = 10, mtries = 5,
max_depth = 20, min_rows = 2,
sample_rate = 0.8, nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
stopping_rounds = 3, stopping_metric = "AUC", # stopping_rounds must be enabled for stopping_metric or stopping_tolerance to work
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train & cross-validate a GBM via h2o.gbm()
cancer_base_gbm <- h2o.gbm(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o,
distribution = "bernoulli", # If your response variable is of 2-class categorical input
ntrees = 50, learn_rate = 0.01,
max_depth = 5, min_rows = 5,
sample_rate = 0.5, nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
stopping_rounds = 3, stopping_metric = "AUC",
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train a stacked tree ensemble using the above base models
cancer_ensemble_tree <- h2o.stackedEnsemble(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o, model_id = "cancer_ensemble_tree",
base_models = list(cancer_base_glm, cancer_base_rf, cancer_base_gbm),
metalearner_algorithm = "AUTO"
)
# Get AUC from base learners
# Define the AUC function
cancer_get_auc <- function(model) {
results <- h2o.performance(model, newdata = cancer_test_h2o)
h2o.auc(results)
}
# Get AUC
cancer_base_auc <- list(cancer_base_glm, cancer_base_rf, cancer_base_gbm) %>%
purrr::map_dbl(cancer_get_auc)
# Get AUC for Stacked results
ensemble_auc <- h2o.auc(h2o.performance(cancer_ensemble_tree, newdata = cancer_test_h2o))
auc <- c(cancer_base_auc, ensemble_auc)
auc <- format(round(auc, 3), nsmall = 3)
# Display model AUC in a table
tab_auc <- data.frame(AUC = auc)
rownames(tab_auc) <- c("GLM", "RandomForest", "GBM", "Ensemble")
theme_1 <- ttheme_default()
grid.arrange(tableGrob(tab_auc, theme = theme_1)) # The higher the better
# The performance of the ensemble model is very close to the GLM but not higher
h2o.shutdown(prompt = FALSE) # Shut down
We can also apply the concept of stacking to grid search. By stacking the results generated from models with unique combinations of hyperparameters, we can find the best model according to the chosen metric (e.g., lowest RMSE for regression, highest AUC for classification).
In the two examples shown earlier, the GBM base models were not optimized. Perhaps we should do a bit of hyperparameter tuning. The example below demonstrates this stacking approach to grid search using GBM.
# h2o can very easily lose track of objects (i.e., your data), so let's restart h2o and reload everything!
cancer <- read.csv(url("https://www.dropbox.com/scl/fi/ot23dm6h3immb9sv7l248/Cancer_Data.csv?rlkey=vv62t6hi34795v11epc72tpvo&st=7utqlcb3&dl=1"))
# Remove id column
cancer <- cancer[, -c(1)]
# Convert categorical variables to factors
cancer$diagnosis <- as.factor(cancer$diagnosis)
# Train-test split
set.seed(123)
split_ind <- sample(1:nrow(cancer), size = 0.75 * nrow(cancer))
cancer_train <- cancer[split_ind, ]
cancer_test <- cancer[-split_ind, ]
# Re-initiate h2o
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\RtmpsFijOh\file2c0c715a1fff/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\RtmpsFijOh\file2c0c2f382900/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 seconds 169 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 7 days
## H2O cluster name: H2O_started_from_R_hktse_hku862
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 7 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert to H2O format
cancer_train_h2o <- as.h2o(cancer_train)
cancer_test_h2o <- as.h2o(cancer_test)
# Define X and Y
Y_cancer <- "diagnosis"
X_cancer <- setdiff(names(cancer_train_h2o), Y_cancer)
# Confirm everything is (still) in h2o
h2o.ls()
## key
## 1 cancer_test_sid_b980_20
## 2 cancer_train_sid_b980_18
# As usual, and as a minimal example, we first define the hyperparameter grid
hyper_grid <- list(
max_depth = c(5, 10, 20, 50, 100, 200), # Maximum depth for each tree to be built
min_rows = c(1, 3, 5), # Minimum number of obs in a leaf for splitting
learn_rate = c(0.3, 0.1, 0.05, 0.01, 0.005),
learn_rate_annealing = c(0.98, 1), # Multiply the learning rate by this factor after each tree
sample_rate = c(0.5, 0.75, 1), # Row sampling
col_sample_rate = c(0.5, 0.75, 1) # Col sampling for each split
)
# Define random grid search criteria
search_criteria <- list(
strategy = "RandomDiscrete", # Tell the model when to stop the search by the following criteria: max number of models, max time, and/or metric-based early stopping
max_models = 50
)
# Perform random grid search via h2o.grid()
gbm_grid <- h2o.grid(
algorithm = "gbm", grid_id = "gbm_grid", x = X_cancer, y = Y_cancer,
training_frame = cancer_train_h2o,
hyper_params = hyper_grid,
search_criteria = search_criteria,
ntrees = 50,
stopping_metric = "AUC",
stopping_rounds = 5,
stopping_tolerance = 0,
nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
seed = 123
)
# Sort results by AUC
random_grid_perf <- h2o.getGrid(
grid_id = "gbm_grid",
sort_by = "AUC",
decreasing = TRUE # Sort by AUC in decreasing order
)
random_grid_perf
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - learn_rate_annealing
## - max_depth
## - min_rows
## - sample_rate
## Number of models: 50
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing AUC
## col_sample_rate learn_rate learn_rate_annealing max_depth min_rows
## 1 0.75000 0.30000 0.98000 20.00000 5.00000
## 2 0.75000 0.30000 1.00000 10.00000 3.00000
## 3 0.50000 0.10000 0.98000 10.00000 5.00000
## 4 0.75000 0.05000 1.00000 10.00000 5.00000
## 5 0.50000 0.30000 0.98000 50.00000 1.00000
## sample_rate model_ids auc
## 1 1.00000 gbm_grid_model_28 0.98388
## 2 0.75000 gbm_grid_model_4 0.98230
## 3 0.75000 gbm_grid_model_11 0.97972
## 4 0.75000 gbm_grid_model_10 0.97835
## 5 0.50000 gbm_grid_model_5 0.97722
##
## ---
## col_sample_rate learn_rate learn_rate_annealing max_depth min_rows
## 45 0.50000 0.00500 0.98000 10.00000 1.00000
## 46 0.50000 0.00500 0.98000 200.00000 5.00000
## 47 0.50000 0.00500 0.98000 20.00000 5.00000
## 48 1.00000 0.01000 0.98000 10.00000 5.00000
## 49 0.75000 0.00500 0.98000 50.00000 3.00000
## 50 1.00000 0.10000 0.98000 50.00000 1.00000
## sample_rate model_ids auc
## 45 0.50000 gbm_grid_model_18 0.95141
## 46 0.50000 gbm_grid_model_14 0.95096
## 47 0.50000 gbm_grid_model_26 0.95096
## 48 1.00000 gbm_grid_model_32 0.94809
## 49 0.50000 gbm_grid_model_16 0.94378
## 50 1.00000 gbm_grid_model_49 0.92001
# Get the model_id of the top-performing model based on AUC (subset the best performer's id with @model_ids)
best_model_id <- random_grid_perf@model_ids[[1]]
best_model <- h2o.getModel(best_model_id) # Look for the model indexed by this id
h2o.performance(best_model, newdata = cancer_test_h2o)
## H2OBinomialMetrics: gbm
##
## MSE: 0.04225112
## RMSE: 0.2055508
## LogLoss: 0.1410415
## Mean Per-Class Error: 0.04880952
## AUC: 0.9902778
## AUCPR: 0.988111
## Gini: 0.9805556
## R^2: 0.8285728
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## B M Error Rate
## B 76 4 0.050000 =4/80
## M 3 60 0.047619 =3/63
## Totals 79 64 0.048951 =7/143
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.524929 0.944882 63
## 2 max f2 0.044910 0.966258 73
## 3 max f0point5 0.796560 0.966102 57
## 4 max accuracy 0.796560 0.951049 57
## 5 max precision 0.994307 1.000000 0
## 6 max recall 0.044910 1.000000 73
## 7 max specificity 0.994307 1.000000 0
## 8 max absolute_mcc 0.796560 0.902161 57
## 9 max min_per_class_accuracy 0.524929 0.950000 63
## 10 max mean_per_class_accuracy 0.524929 0.951190 63
## 11 max tns 0.994307 80.000000 0
## 12 max fns 0.994307 62.000000 0
## 13 max fps 0.002966 80.000000 142
## 14 max tps 0.044910 63.000000 73
## 15 max tnr 0.994307 1.000000 0
## 16 max fnr 0.994307 0.984127 0
## 17 max fpr 0.002966 1.000000 142
## 18 max tpr 0.044910 1.000000 73
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# The AUC is very high!
# We can also stack the top-performing models from grid search to train the meta learner.
# Train a stacked ensemble using the GBM grid
ensemble_gbm_grid <- h2o.stackedEnsemble(
x = X_cancer, y = Y_cancer,
training_frame = cancer_train_h2o,
model_id = "ensemble_gbm_grid",
base_models = gbm_grid@model_ids, # The GBM grid you trained earlier with h2o.grid(); @model_ids includes every model in the grid
metalearner_algorithm = "gbm"
)
# Evaluate ensemble performance on test data
h2o.performance(ensemble_gbm_grid, newdata = cancer_test_h2o)
## H2OBinomialMetrics: stackedensemble
##
## MSE: 0.0469671
## RMSE: 0.2167189
## LogLoss: 0.1691067
## Mean Per-Class Error: 0.05555556
## AUC: 0.9793651
## AUCPR: 0.9831875
## Gini: 0.9587302
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## B M Error Rate
## B 80 0 0.000000 =0/80
## M 7 56 0.111111 =7/63
## Totals 87 56 0.048951 =7/143
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.981867 0.941176 14
## 2 max f2 0.065958 0.953125 26
## 3 max f0point5 0.981867 0.975610 14
## 4 max accuracy 0.981867 0.951049 14
## 5 max precision 0.995783 1.000000 0
## 6 max recall 0.007972 1.000000 46
## 7 max specificity 0.995783 1.000000 0
## 8 max absolute_mcc 0.981867 0.904085 14
## 9 max min_per_class_accuracy 0.176520 0.937500 23
## 10 max mean_per_class_accuracy 0.176520 0.944940 23
## 11 max tns 0.995783 80.000000 0
## 12 max fns 0.995783 62.000000 0
## 13 max fps 0.003539 80.000000 57
## 14 max tps 0.007972 63.000000 46
## 15 max tnr 0.995783 1.000000 0
## 16 max fnr 0.995783 0.984127 0
## 17 max fpr 0.003539 1.000000 57
## 18 max tpr 0.007972 1.000000 46
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# However, the AUC is lower than the best-performing gbm_grid model (random_grid_perf@model_ids[[1]])
h2o.shutdown(prompt = FALSE)
In the previous example, we saw that prediction accuracy may actually be compromised by stacking when some base models already provide accurate predictions. Residual Stacking (RS) has been proposed to mitigate this problem. In RS, all information learned by the level-one base models is retained. In the stacking stage, each outcome's residual is modeled using the fitted values of all the other outcomes (excluding the outcome itself) as predictors. Finally, the level-one predictions are revised using the predicted residuals.
We will demonstrate this below using the \(\textsf{MTPS}\) package.
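To make the idea concrete before turning to \(\textsf{MTPS}\), here is a minimal base-R sketch of residual stacking for two correlated continuous outcomes on simulated toy data (purely illustrative; MTPS itself adds cross-validation and supports many other base and meta learners):
# Toy residual-stacking sketch (not the MTPS implementation)
set.seed(1)
n <- 300
x <- matrix(rnorm(n * 3), n, 3)
y1 <- x[, 1] + 0.5 * x[, 2] + rnorm(n)
y2 <- 0.8 * y1 + x[, 3] + rnorm(n) # outcome 2 is related to outcome 1
# Step 1: fit a separate level-one model for each outcome
fit1 <- lm(y1 ~ x); yhat1 <- fitted(fit1)
fit2 <- lm(y2 ~ x); yhat2 <- fitted(fit2)
# Step 2: model the residual of outcome 2 using the fitted values of the other outcome
step2 <- lm(I(y2 - yhat2) ~ yhat1)
# Revise the level-one prediction for outcome 2 with the predicted residual
yhat2_rs <- yhat2 + fitted(step2)
# In-sample comparison (illustrative only)
c(level_one = sqrt(mean((y2 - yhat2)^2)), residual_stacking = sqrt(mean((y2 - yhat2_rs)^2)))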
library(MTPS)
library(caret)
library(ggplot2) # for plotting
library(reshape2) # for data wrangling
# We'll be using the HIV Drug Resistance Database in the package. In this database, resistance to five Nucleoside RT Inhibitor (NRTI) drugs is used as the multivariate outcome: Lamivudine (3TC), Abacavir (ABC), Zidovudine (AZT), Stavudine (D4T), and Didanosine (DDI). The outcome data "YY" is a 1246 x 5 matrix, and the predictor data "XX" is a 1246 x 228 matrix.
data("HIV")
# Take a look at the data
head(YY)
## ABC 3TC AZT D4T DDI
## 1 0.63346846 2.3010300 0.59106461 0.14612804 0.07918125
## 3 0.04139269 0.1461280 1.44715803 0.00000000 -0.09691001
## 4 0.17609126 0.2552725 0.85125835 0.07918125 0.04139269
## 6 0.85125835 2.3010300 -0.09691001 0.11394335 0.27875360
## 7 0.71600334 0.4065402 0.82607480 0.73239376 0.72427587
## 8 0.77085201 2.3010300 0.27875360 0.11394335 0.23044892
XX[1:4, 5:10]
## X.8I X.8V X.11K X.11R X.20K X.20R
## 1 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 1 0 0
## 6 0 0 0 0 0 0
dim(YY)
## [1] 1246 5
dim(XX)
## [1] 1246 228
# This is a rather large database
# First, convert the data to matrix format
set.seed(123)
xmat <- as.matrix(XX)
ymat <- as.matrix(YY)
nobs <- nrow(xmat)
# Do a 75-25 train-test split
split_ind <- createDataPartition(ymat[,1], p = 0.75, list=FALSE) # This is doable since XX and YY contain the same number of rows
y_train <- ymat[split_ind, ]
y_test <- ymat[-split_ind, ]
x_train <- xmat[split_ind, ]
x_test <- xmat[-split_ind, ]
# The multiFit() function is a non-stacking algorithm that fits individual models for each outcome separately. The following code fits generalized linear models with ridge regression on each outcome.
multi.fit <- multiFit(xmat = x_train, ymat = y_train, method = glmnet.ridge, family = "gaussian")
# Use the base R's predict() function to return predicted values
multi.pred <- predict(multi.fit, x_test)
# To set up the stacking algorithm, we need to specify the base (step 1) and meta (step 2) learner algorithms. As we are dealing with continuous outcomes, we set family = "gaussian". The cv and residual arguments specify whether we want to use cross-validation stacking (CVS), residual stacking (RS), or their combination. The default value of cv is FALSE and of residual is TRUE, denoting residual stacking.
# The code below fits models using standard stacking (SS), cross-validation stacking (CVS), residual stacking (RS), and the combination of the latter two, cross-validation residual stacking (CVRS), via the MTPS() function. Note how the settings of the cv and residual arguments vary.
# The following models use ridge regression as the base models in Step 1 and a tree method as the meta model in Step 2.
# Standard stacking
ss.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = FALSE, residual = FALSE, # cv and residual all set to FALSE
method.step1 = glmnet.ridge, # Base learner
method.step2 = rpart1) # Meta learner
# Predict on testing data
ss.pred <- predict(ss.fit, x_test)
# Cross-validation Stacking
cv.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = TRUE, residual = FALSE, # cv set to TRUE
method.step1 = glmnet.ridge,
method.step2 = rpart1)
# Predict on testing data
cv.pred <- predict(cv.fit, x_test)
# Residual Stacking
rs.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = FALSE, residual = TRUE, # residual set to TRUE
method.step1 = glmnet.ridge,
method.step2 = rpart1)
# Predict on testing data
rs.pred <- predict(rs.fit, x_test)
# Cross-validation residual stacking
cvrs.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = TRUE, residual = TRUE, # all set to TRUE
method.step1 = glmnet.ridge,
method.step2 = rpart1)
# Predict on testing data
cvrs.pred <- predict(cvrs.fit, x_test)
n_test <- nrow(x_test) # Calculate the dimension of the test data
ctn.plot.data_matrix <- cbind(rbind(multi.pred, ss.pred, cv.pred, rs.pred, cvrs.pred), y_test[rep(seq_len(n_test), 5), ]) # Stack the five methods' predictions row-wise and append the matching actual values (5 * n_test rows by 10 columns)
ctn.plot.data <- data.frame(rep(c("No-Stacking", "SS", "CV", "RS", "CVRS"), each = n_test), ctn.plot.data_matrix) # Prepend a method label column of length 5 * n_test
colnames(ctn.plot.data) <- c("method", paste0(rep("pred.", ncol(y_test)), colnames(y_test)), colnames(y_test)) # Rename the columns
# Reshape the actual values into long format (method, outcome, value)
dm1 <- melt(ctn.plot.data[,c("method","ABC","3TC","AZT","D4T", "DDI")], id=c("method"))
# Reshape the predicted values into long format
dm2 <- melt(ctn.plot.data[,c("method","pred.ABC","pred.3TC","pred.AZT","pred.D4T", "pred.DDI")], id=c("method"))
# Combine dm1 and dm2
ctn.plot.data <- cbind(dm1, dm2[, -1])
# Rename the columns
colnames(ctn.plot.data) <- c("method", "Y", "yVal", "predict", "predictVal")
# Convert the values within the dataframe: method name to factor class, actual and predicted values to numeric
ctn.plot.data$method <- factor(ctn.plot.data$method, unique(as.character(ctn.plot.data$method)))
ctn.plot.data$yVal <- as.numeric(ctn.plot.data$yVal)
ctn.plot.data$predictVal <- as.numeric(ctn.plot.data$predictVal)
# Render the actual and predicted values into a series of 5 x 5 bivariate plots
ggplot(ctn.plot.data) +
geom_point(aes(x=predictVal, y=yVal, color=method), size = 0.5, alpha = 0.6) +
geom_abline(slope=1, alpha = 0.2) + # 45 degree line
coord_fixed() +
ylab("Testing Data Outcome") + xlab("Predicted Outcome on Testing Data") +
scale_x_discrete(breaks = NULL) +
scale_y_discrete(breaks = NULL) +
theme_bw() +
theme(axis.text = element_blank(),
strip.placement = "outside",
strip.background = element_blank()) +
facet_grid(Y~method) # display subplots by method
# We see that the RS and CVRS models' prediction patterns are very similar to those of the non-stacking model, while the prediction patterns of SS and CV are nearly identical to each other.
The \(\textsf{MTPS}\) package also allows simultaneous modeling of mixed outcomes, such as a combination of continuous and binary outcomes. One only needs to specify each outcome's type in the family argument. The following example demonstrates this using ridge regression as the base model for the first two (continuous) outcomes and a logit model for the third (binary) outcome, followed by a tree method as the meta model.
Before we begin, a couple of conversion and pre-processing tasks need to be done.
# Dichotomize the last outcome variable to binary scale.
cutoff <- 1.5 # Define the cutoff value
ymat.bin <- ymat
xmat.bin <- xmat
ymat.bin[,5] <- (10^ymat[,5] < cutoff) * 1
# Combine the dichotomized last outcome column with the first two original outcome columns
ymat.mix <- cbind(ymat[,1:2], ymat.bin[,5])
xmat.mix <- xmat # X variables remain the same
# Use the previously-generated split indicator (split_ind) for train-test split
y.train.mix <- ymat.mix[split_ind, ]
y.test.mix <- ymat.mix[-split_ind, ]
x.train.mix <- xmat.mix[split_ind, ]
x.test.mix <- xmat.mix[-split_ind, ]
# Model fitting: ridge models as Step 1 learners, tree method as Step 2 learner
# Use residual stacking (RS)
rs.mix.fit <- MTPS(xmat = x.train.mix, ymat = y.train.mix,
family=c("gaussian","gaussian","binomial"), # The first two outcome variables are continuous, the last one is binomial
cv = FALSE, residual = TRUE,
method.step1 = glmnet.ridge,
method.step2 = rpart1)
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
## Warning in log(yhat): NaNs produced
## Warning in sqrt(-2 * log(yhat)): NaNs produced
## Warning in log(1 - yhat): NaNs produced
## Warning in sqrt(-2 * (log(1 - yhat))): NaNs produced
# Predict on testing data
rs.mix.pred <- predict(rs.mix.fit, x.test.mix)
## Plot the predicted values against the actual values for the first two outcomes
mix.plot.data <- cbind(rep(colnames(y.test.mix)[1:2], each = nrow(y.test.mix)), # Repeat each of the first two outcome names nrow(y.test.mix) times as a label column
rbind(cbind(rs.mix.pred[, 1], y.test.mix[, 1]), # Append predicted values and actual values of the test data
cbind(rs.mix.pred[, 2], y.test.mix[, 2])))
colnames(mix.plot.data) <- c("Y", "predict", "outcome") # Rename columns
mix.plot.data <- as.data.frame(mix.plot.data) # Convert to data frame
# Convert the predicted and actual values within the dataframe to numeric class
mix.plot.data$predict <- as.numeric(as.character(mix.plot.data$predict))
mix.plot.data$outcome <- as.numeric(as.character(mix.plot.data$outcome))
ggplot(mix.plot.data) +
geom_point(aes(x=predict, y=outcome, color=Y), size = 0.5, alpha = 0.6) +
ylab("Outcome of Testing Data") + xlab("Predicted Outcome of Testing Data") +
scale_x_discrete(breaks = NULL) +
scale_y_discrete(breaks = NULL) +
geom_abline(slope=1, alpha = 0.2) +
coord_fixed() +
theme_bw() +
theme(legend.title = element_blank(),
axis.text = element_blank(),
strip.placement = "outside",
strip.background = element_blank()) +
facet_grid(~Y) # Render by outcome variable
# Print and calculate the prediction accuracy of the third outcome variable
print(colnames(y.test.mix)[3])
## [1] ""
print(table((rs.mix.pred[,3] > 0.5) * 1, y.test.mix[,3]))
##
## 0 1
## 0 130 40
## 1 19 121
# Accuracy
mix.accuracy <- table((rs.mix.pred[,3] > 0.5) * 1, y.test.mix[,3])
(mix.accuracy[1, 1] + mix.accuracy[2, 2]) / sum(mix.accuracy)
## [1] 0.8096774