What is voting and how do models vote?
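In a voting ensemble, several base classifiers each predict the same outcome and their predictions are then combined. Under hard voting, each model casts one vote for a class label and the majority wins; under soft voting, the models' predicted class probabilities are averaged and the class with the highest average probability is chosen. Below is a minimal sketch with made-up predictions (the numbers are purely hypothetical, not drawn from the HMDA data used in the rest of this exercise):
# Toy example: class labels from three hypothetical classifiers for four observations
pred_labels <- data.frame(
  m1 = c("no", "yes", "no", "yes"),
  m2 = c("no", "yes", "yes", "no"),
  m3 = c("no", "no", "yes", "yes")
)
# Hard voting: the most frequent label in each row wins
apply(pred_labels, 1, function(x) names(which.max(table(x))))
# Soft voting: average each model's predicted P(yes), then threshold at 0.5
prob_yes <- cbind(m1 = c(0.10, 0.80, 0.40, 0.70),
                  m2 = c(0.20, 0.90, 0.60, 0.30),
                  m3 = c(0.30, 0.40, 0.70, 0.60))
ifelse(rowMeans(prob_yes) >= 0.5, "yes", "no")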
library(AER) # for HMDA data
## Warning: package 'AER' was built under R version 4.4.3
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(e1071) # for SVM and naiveBayes
## Warning: package 'e1071' was built under R version 4.4.3
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(xgboost)
library(mlbench) # benchmark data sets (not used below; the HMDA data comes from AER)
# Load the HMDA data
data(HMDA)
str(HMDA)
## 'data.frame': 2380 obs. of 14 variables:
## $ deny : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ pirat : num 0.221 0.265 0.372 0.32 0.36 ...
## $ hirat : num 0.221 0.265 0.248 0.25 0.35 ...
## $ lvrat : num 0.8 0.922 0.92 0.86 0.6 ...
## $ chist : Factor w/ 6 levels "1","2","3","4",..: 5 2 1 1 1 1 1 2 2 2 ...
## $ mhist : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 1 1 2 2 2 1 ...
## $ phist : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ unemp : num 3.9 3.2 3.2 4.3 3.2 ...
## $ selfemp : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ insurance: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ condomin : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ afam : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ single : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 2 1 1 2 ...
## $ hschool : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
set.seed(123) # for reproducibility
# Do a 75-25 train-test split
split <- createDataPartition(HMDA$deny, p = 0.75, list=FALSE)
HMDA_train <- HMDA[split,]
HMDA_test <- HMDA[-split,]
# Model fitting, prediction, and accuracy
# Probit model
probit.vote <- glm(deny ~ .,
family = binomial(link = "probit"),
data = HMDA_train)
probit.pred <- predict(probit.vote, HMDA_test, type = "response") # Save for soft voting estimation
probit.pred.bin <- as.factor(ifelse(probit.pred >= 0.5, "yes", "no"))
confmat_probit <- table(Predicted = probit.pred.bin, Actual = HMDA_test$deny) # Confusion table
confmat_probit
## Actual
## Predicted no yes
## no 512 54
## yes 11 17
probit_accuracy <- (confmat_probit[1, 1] + confmat_probit[2, 2]) / sum(confmat_probit) * 100
probit_accuracy
## [1] 89.05724
# logit model
logit.vote <- glm(deny ~ .,
family = binomial(link = "logit"),
data = HMDA_train)
logit.pred <- predict(logit.vote, HMDA_test, type = "response") # Save for soft voting estimation
logit.pred.bin <-as.factor(ifelse(logit.pred >= 0.5, "yes", "no"))
confmat_logit <- table(Predicted = logit.pred.bin, Actual = HMDA_test$deny) # Confusion matrix
confmat_logit
## Actual
## Predicted no yes
## no 510 51
## yes 13 20
logit_accuracy <- (confmat_logit[1, 1] + confmat_logit[2, 2]) / sum(confmat_logit) * 100
logit_accuracy
## [1] 89.22559
# SVM
svm.vote <- svm(deny ~ .,
scale = TRUE,
kernel = "polynomial",
degree = 3, # if kernel is of type = "polynomial"
cost = 1, # The cost parameter (C)
probability = TRUE, # Whether to output probability predictions
na.action = na.omit,
data = HMDA_train)
svm.pred <- predict(svm.vote, HMDA_test)
confmat_svm <- table(Predicted = svm.pred, Actual = HMDA_test$deny) # Confusion matrix
confmat_svm
## Actual
## Predicted no yes
## no 522 71
## yes 1 0
svm_accuracy <- (confmat_svm[1, 1] + confmat_svm[2, 2]) / sum(confmat_svm) * 100
svm_accuracy
## [1] 87.87879
# Naive Bayes
nb.vote <- naiveBayes(deny ~ .,
data = HMDA_train)
nb.pred <- predict(nb.vote, HMDA_test)
confmat_nb <- table(Predicted = nb.pred, Actual = HMDA_test$deny) # Confusion matrix
confmat_nb
## Actual
## Predicted no yes
## no 501 49
## yes 22 22
nb_accuracy <- (confmat_nb[1, 1] + confmat_nb[2, 2]) / sum(confmat_nb) * 100
nb_accuracy
## [1] 88.04714
# Gradient boosting machine (GBM)
# Fine-tuning hyperparameters
hyper_grid <- expand.grid(
shrinkage = c(.01, .05),
interaction.depth = c(1, 3),
n.minobsinnode = c(5, 10, 25),
bag.fraction = c(.65, .8),
optimal_trees = 0,
min_RMSE = 0
)
# Convert deny to numeric (0, 1)
HMDA_train$deny <- ifelse(HMDA_train$deny == "yes", 1, 0)
for(i in 1:nrow(hyper_grid)) {
# train model
gbm.tune <- gbm(
deny ~ .,
distribution = "bernoulli",
data = HMDA_train,
n.trees = 500,
interaction.depth = hyper_grid$interaction.depth[i],
shrinkage = hyper_grid$shrinkage[i],
n.minobsinnode = hyper_grid$n.minobsinnode[i],
bag.fraction = hyper_grid$bag.fraction[i],
train.fraction = .75,
n.cores = NULL,
verbose = FALSE
)
# Locate the iteration that minimizes the validation error and record it in the grid
hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error) # Index of the best iteration
hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error)) # Square root of the minimum validation error (stored in the min_RMSE column)
}
# Get the best-performing hyperparameters
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
optimal_hyperparameters <- hyper_grid %>%
dplyr::arrange(min_RMSE) %>%
head(1)
# Set the optimal hyperparameter values and re-train the model
gbm.vote <- gbm(
deny ~ .,
distribution = "bernoulli",
data = HMDA_train,
n.trees = optimal_hyperparameters$optimal_trees,
interaction.depth = optimal_hyperparameters$interaction.depth,
shrinkage = optimal_hyperparameters$shrinkage,
n.minobsinnode = optimal_hyperparameters$n.minobsinnode,
bag.fraction = optimal_hyperparameters$bag.fraction,
train.fraction = 0.63,
n.cores = NULL, # Use all cores by default
verbose = FALSE
)
# Temporarily convert the "deny" variable in HMDA_test
# Convert deny to numeric (0, 1)
deny <- HMDA_test$deny
HMDA_test$deny <- ifelse(HMDA_test$deny == "yes", 1, 0)
gbm.pred <- predict(gbm.vote, HMDA_test, type = "response")
## Using 193 trees...
gbm.pred.bin <-as.factor(ifelse(gbm.pred >= 0.5, "yes", "no"))
confmat_gbm <- table(Predicted = gbm.pred.bin, Actual = HMDA_test$deny) # Confusion matrix
confmat_gbm
## Actual
## Predicted 0 1
## no 509 55
## yes 14 16
gbm_accuracy <- (confmat_gbm[1, 1] + confmat_gbm[2, 2]) / sum(confmat_gbm) * 100
gbm_accuracy
## [1] 88.38384
HMDA_test$deny <- deny # Add it back, we're gonna need it soon
# Combine predictions into a data frame
preds <- data.frame(probit.pred.bin, logit.pred.bin, svm.pred, nb.pred, gbm.pred.bin)
colnames(preds) <- c("Probit", "Logit", "SVM", "Naive Bayes", "GBM")
# Take a look at this dataframe
head(preds)
## Probit Logit SVM Naive Bayes GBM
## 3 no no no no no
## 5 no no no no no
## 7 no no no no no
## 9 yes yes no no yes
## 17 no no no no no
## 24 no no no no no
## Hard Voting (majority vote)
hard_voting <- apply(preds, 1, function(x) {
names(sort(table(x), decreasing = TRUE))[1] # Sort category with the highest freq. first
})
# Accuracy
hard_voting_accuracy <- sum(hard_voting == HMDA_test$deny) / length(HMDA_test$deny) * 100
cat("Hard Voting Accuracy:", hard_voting_accuracy, "\n")
## Hard Voting Accuracy: 89.39394
# Tabulate hard voting accuracy
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:randomForest':
##
## combine
library(grid)
tab_hard <- data.frame(Accuracy = c(probit_accuracy, logit_accuracy, svm_accuracy, nb_accuracy, gbm_accuracy, hard_voting_accuracy))
tab_hard$Accuracy <- format(round(tab_hard$Accuracy, 3), nsmall = 3) # Limit to 3 digits after the decimal
rownames(tab_hard) <- c("Probit", "Logit", "SVM", "Naive Bayes", "GBM", "Hard-Voting")
theme_1 <- ttheme_default() # Plotting scheme
grid.arrange(tableGrob(tab_hard, theme = theme_1))
# The accuracy of the hard voting model is indeed the best
## Soft Voting (Probabilities)
# Soft voting averages class probabilities, so a few adjustments are needed: each model's predictions must be expressed as two probability columns ("no", "yes") of length nrow(HMDA_test)
probit.pred_0 <- 1 - probit.pred # Also need to encode the probabilities for those 0 outcomes
probit.pred.prob <- as.matrix(data.frame(no = probit.pred_0, yes = probit.pred))
logit.pred_0 <- 1 - logit.pred
logit.pred.prob <- as.matrix(data.frame(no = logit.pred_0, yes = logit.pred))
svm.pred.prob <- attr(predict(svm.vote, HMDA_test, probability = TRUE), "probabilities")
nb.pred.prob <- predict(nb.vote, HMDA_test, type = "raw") # Set type = "raw" to get raw score for each class
gbm.pred_0 <- 1 - gbm.pred
gbm.pred.prob <- as.matrix(data.frame(no = gbm.pred_0, yes = gbm.pred))
# Average the probabilities (make sure each model's matrix has its columns in the same "no"/"yes" order before averaging)
average_prob <- (probit.pred.prob + logit.pred.prob + svm.pred.prob + nb.pred.prob + gbm.pred.prob) / 5
# Convert to class prediction
soft_voting <- colnames(average_prob)[max.col(average_prob)] # Select the class/outcome with the highest probability using max.col()
soft_voting_accuracy <- sum(soft_voting == HMDA_test$deny) / length(HMDA_test$deny) * 100
cat("Soft Voting Accuracy:", soft_voting_accuracy, "\n")
## Soft Voting Accuracy: 89.39394
# Tabulate soft voting accuracy
tab_soft <- data.frame(Accuracy = c(probit_accuracy, logit_accuracy, svm_accuracy, nb_accuracy, gbm_accuracy, soft_voting_accuracy))
tab_soft$Accuracy <- format(round(tab_soft$Accuracy, 3), nsmall = 3)
rownames(tab_soft) <- c("Probit", "Logit", "SVM", "Naive Bayes", "GBM", "Soft-Voting")
grid.arrange(tableGrob(tab_soft, theme = theme_1))
# Alas, the accuracy of the soft voting model only ranks third.
There are several existing R ensemble functions that can implement voting methods; those in \(\textsf{h2o}\) and \(\textsf{caretEnsemble}\) are two examples. However, \(\textsf{caretList}\), \(\textsf{caretEnsemble}\), and \(\textsf{caretStack}\) tend to re-scale the data, which requires additional re-scaling and conversion along the way, so I, for one, do not recommend that package. For this week's exercise, we will stick to functions from the \(\textsf{h2o}\) package, a scalable open-source machine learning platform.
It should be noted that, at the moment, the functions supported by \(\textsf{h2o}\) are somewhat limited. For example, the \(\textsf{h2o.glm()}\) function does not support the "probit" link, and \(\textsf{h2o.psvm()}\) does not allow direct specification of the degree for polynomial kernels. The cost parameter (C) is also not directly tunable, though it can be controlled via lambda (i.e., the regularization strength). For pedagogical purposes, the ensemble examples below include only three base models: logit, naive Bayes, and GBM.
We begin with hard voting, followed by soft voting. We then assess their performance, in terms of AUC, against the three base models, which the ensemble models should outperform.
# Re-load the HMDA data (just in case we've altered it somehow in our earlier data wrangling ...)
data(HMDA) # From the AER package
str(HMDA)
## 'data.frame': 2380 obs. of 14 variables:
## $ deny : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ pirat : num 0.221 0.265 0.372 0.32 0.36 ...
## $ hirat : num 0.221 0.265 0.248 0.25 0.35 ...
## $ lvrat : num 0.8 0.922 0.92 0.86 0.6 ...
## $ chist : Factor w/ 6 levels "1","2","3","4",..: 5 2 1 1 1 1 1 2 2 2 ...
## $ mhist : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 1 1 2 2 2 1 ...
## $ phist : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ unemp : num 3.9 3.2 3.2 4.3 3.2 ...
## $ selfemp : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ insurance: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ condomin : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ afam : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ single : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 2 1 1 2 ...
## $ hschool : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Check if the outcome variable "deny" is of factor class (H2O requires categorical targets as factors)
class(HMDA$deny)
## [1] "factor"
set.seed(123) # For reproducibility
# Do a 75-25 train-test split
split <- createDataPartition(HMDA$deny, p = 0.75, list=FALSE)
HMDA_train <- HMDA[split,]
HMDA_test <- HMDA[-split,]
# Initialize H2O
library(h2o)
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a804f7b77a3/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a803b71f2a/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 seconds 856 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 9 days
## H2O cluster name: H2O_started_from_R_hktse_yom614
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 9 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert data to H2O format
HMDA_train_h2o <- as.h2o(HMDA_train)
HMDA_test_h2o <- as.h2o(HMDA_test)
# Define predictor variables and target variable
Y_HMDA <- "deny"
X_HMDA <- names(HMDA)[-1] # All columns except the outcome variable assigned to X_HMDA
# Specify base models (i.e., your "voters")
base_logit.vote <- h2o.glm(
x = X_HMDA, y = Y_HMDA, training_frame = HMDA_train_h2o,
family = "binomial", # binary outcomes
link = "logit",
nfolds = 5, fold_assignment = "Modulo", # "Modulo" evenly splits the dataset into the designated folds and does not depend on the seed
keep_cross_validation_predictions = TRUE,
stopping_metric = "AUC", # Performance metric, other metrics are available
auc_type = "AUTO",
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): Stopping metric is ignored for _stopping_rounds=0..
base_nb.vote <- h2o.naiveBayes(
x = X_HMDA, y = Y_HMDA, training_frame = HMDA_train_h2o,
laplace = 0, # No smoothing
nfolds = 5, fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
seed = 123
)
# For the GBM, we will set the hyperparameters to the optimized values we tuned earlier
base_gbm.vote <- h2o.gbm(
x = X_HMDA, y = Y_HMDA, training_frame = HMDA_train_h2o,
distribution = "bernoulli", # If your response variable is of 2-class categorical input
ntrees = optimal_hyperparameters$optimal_trees,
learn_rate = optimal_hyperparameters$shrinkage, # It's just another name for learning rate
max_depth = optimal_hyperparameters$interaction.depth,
min_rows = optimal_hyperparameters$n.minobsinnode,
sample_rate = optimal_hyperparameters$bag.fraction,
nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
stopping_rounds = 3,
stopping_metric = "AUC",
auc_type = "AUTO",
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
## Hard-Voting
# Make class predictions for each model
base_logit.pred <- as.vector(h2o.predict(base_logit.vote, HMDA_test_h2o)$predict)
base_nb.pred <- as.vector(h2o.predict(base_nb.vote, HMDA_test_h2o)$predict)
base_gbm.pred <- as.vector(h2o.predict(base_gbm.vote, HMDA_test_h2o)$predict)
# Combine predictions into a data frame
predictions <- data.frame(base_logit.pred, base_nb.pred, base_gbm.pred)
# Define function for majority voting (hard voting)
majority_vote <- function(preds) {
apply(preds, 1, function(row) {
names(sort(table(row), decreasing = TRUE))[1] # Get the most frequent class
})
}
# Get final ensemble prediction
hard_voting_pred <- majority_vote(predictions)
# Convert to factor with correct levels
hard_voting_pred <- factor(hard_voting_pred, levels = levels(HMDA$deny))
# Evaluate hard-voting performance
# (hard voting returns class labels only, so we compute accuracy here rather than a true AUC)
conf_mat <- table(hard_voting_pred, HMDA_test$deny)
hard_voting_auc <- sum(diag(conf_mat)) / sum(conf_mat) # accuracy, saved for the comparison table below
print(conf_mat)
##
## hard_voting_pred no yes
## no 495 39
## yes 28 32
print(paste("Hard-Voting Ensemble Accuracy:", round(hard_voting_auc, 3)))
## [1] "Hard-Voting Ensemble Accuracy: 0.887"
## Soft-Voting
# Combine models into a list (subset each base model's id from the earlier outputs with @model_id)
base_voter_models <- c(base_logit.vote@model_id, base_nb.vote@model_id, base_gbm.vote@model_id)
# Create a stacked ensemble as a stand-in for soft voting (the metalearner learns weights for the base models' predicted probabilities rather than simply averaging them)
soft_voting_ensemble <- h2o.stackedEnsemble(
x = X_HMDA, y = Y_HMDA,
training_frame = HMDA_train_h2o,
base_models = base_voter_models
)
# Predict on the testing data
soft_voting_pred <- h2o.predict(soft_voting_ensemble, newdata = HMDA_test_h2o)
# View predictions
head(soft_voting_pred)
## predict no yes
## 1 no 0.9452028 0.05479724
## 2 no 0.9512366 0.04876341
## 3 no 0.9464035 0.05359649
## 4 yes 0.1457385 0.85426154
## 5 no 0.9533266 0.04667338
## 6 no 0.9436444 0.05635562
# Evaluate soft-voting performance
soft_voting_auc <- h2o.auc(h2o.performance(soft_voting_ensemble, newdata = HMDA_test_h2o))
print(soft_voting_auc) # AUC Score, a bit low, not good :/
## [1] 0.8047289
# Get results from base learners
# Define the AUC function
HMDA_get_auc <- function(model) {
results <- h2o.performance(model, newdata = HMDA_test_h2o)
h2o.auc(results)
}
# Get AUC
base_voter_perf <- list(base_logit.vote, base_nb.vote, base_gbm.vote) %>%
purrr::map_dbl(HMDA_get_auc) # Map the pre-defined AUC function to each of the three base models
voting_auc <- c(base_voter_perf, hard_voting_auc, soft_voting_auc) # Note: the hard-voting entry is accuracy (class labels only); the others are AUC
voting_auc <- format(round(voting_auc, 3), nsmall = 3)
# Display model AUC in a table
tab_voting_auc <- data.frame(AUC = voting_auc)
rownames(tab_voting_auc) <- c("Logit", "Naive Bayes", "GBM", "Hard-Voting", "Soft-Voting")
theme_1 <- ttheme_default()
grid.arrange(tableGrob(tab_voting_auc, theme = theme_1)) # The higher the better. Hard-Voting rocks!
# The hard-voting model scores highest among all models compared, followed by naive Bayes and the soft-voting counterpart (keep in mind the hard-voting figure is accuracy rather than AUC).
h2o.shutdown(prompt = FALSE) # Be sure to shut down the cluster once you finish the current computing session
Stacking leverages the information learned from multiple base learners to improve prediction accuracy. We will (also) demonstrate the stacking procedure with functions from the \(\textsf{h2o}\) package.
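Before handing the work over to \(\textsf{h2o}\), here is a minimal base-R sketch of the stacking mechanism itself on simulated toy data (all object names below are hypothetical): each base learner is fit on the training folds, its out-of-fold predictions become the features of a level-two (meta) model, and the meta learner then learns how to weight the base learners.
# Minimal stacking sketch on simulated data (illustration only, not the h2o workflow used below)
set.seed(1)
n <- 200
toy <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
toy$y <- 1 + 2 * toy$x1 - toy$x2 + rnorm(n)
k <- 5
folds <- sample(rep(1:k, length.out = n)) # fold assignment
z1 <- z2 <- numeric(n) # out-of-fold (level-one) predictions
for (f in 1:k) {
  m1 <- lm(y ~ x1, data = toy[folds != f, ]) # base learner 1
  m2 <- lm(y ~ x2, data = toy[folds != f, ]) # base learner 2
  z1[folds == f] <- predict(m1, toy[folds == f, ])
  z2[folds == f] <- predict(m2, toy[folds == f, ])
}
meta <- lm(y ~ z1 + z2, data = data.frame(y = toy$y, z1 = z1, z2 = z2)) # level-two (meta) learner
coef(meta) # weights the meta learner assigns to each base learner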
library(tidyverse) # for rearranging data
library(ISLR2) # example data sets (not used below; we load the housing data from Dropbox instead)
library(rsample) # initial_split(), an alternative to caret::createDataPartition()
library(recipes) # for the step_other() function
housing <- read.csv(url("https://www.dropbox.com/scl/fi/jib74gutay1cj00tfg6vh/housing.csv?rlkey=d871strkjvdgrzlnkslsac5c7&st=qcywbsbx&dl=1"))
names(housing) <- c("income", "age", "nrooms", "nbedrooms", "pop", "price", "address")
housing <- housing[, -c(7)] # remove address column
set.seed(123) # for reproducibility
# Do a 75-25 train-test split
split <- createDataPartition(housing$price, p = 0.75, list=FALSE)
housing_train <- housing[split,]
housing_test <- housing[-split,]
# Initialize the h2o
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\Rtmpg1lrMi\file38a036cc16e1/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\Rtmpg1lrMi\file38a0163b6d10/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 6 seconds 332 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 8 days
## H2O cluster name: H2O_started_from_R_hktse_dwm061
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 8 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Collapse infrequent categorical levels
collapse_labels <- recipe(price ~ ., data = housing_train) %>%
step_other(all_nominal(), threshold = 0.005) # creates a specification of a recipe step that will pool infrequently occurring values into an "other" category.
# Need to convert the train and test dataframes to h2o format. Might take a while, be patient!
train_h2o <- prep(collapse_labels, training = housing_train, retain = TRUE) %>%
juice() %>%
as.h2o()
test_h2o <- prep(collapse_labels, training = housing_train) %>%
bake(new_data = housing_test) %>%
as.h2o()
# You can also use h2o.splitFrame(data = , ratios = ) to create train-test split and assign the first element of the output [[1]] to training data, the second element [[2]] to testing data.
# Check out data class
class(train_h2o)
## [1] "H2OFrame"
class(test_h2o)
## [1] "H2OFrame"
# Split predictors/features (x) and the outcome variable (y) names. Only "names" are needed
Y_house <- "price"
X_house <- names(train_h2o[, -c(6)])
### Define base learners ###
## Must apply the same nfolds to all base learners!
# Train & cross-validate an elastic net model via h2o.glm()
base_glm <- h2o.glm(
x = X_house, y = Y_house, training_frame = train_h2o,
family = "gaussian", # continuous outcomes
alpha = 0.5, # elastic net
lambda = 0.02, # penalty parameter
remove_collinear_columns = FALSE, # When lambda != 0, set this to FALSE
nfolds = 5, fold_assignment = "Modulo", # "Modulo" evenly splits the dataset into the designated folds and does not depend on the seed
keep_cross_validation_predictions = TRUE,
seed = 123
)
# Train & cross-validate a random forest model via h2o.randomForest()
base_rf <- h2o.randomForest(
x = X_house, y = Y_house, training_frame = train_h2o,
ntrees = 50, mtries = 5, # The number of trees and the number of random variables used in each tree
max_depth = 50, min_rows = 1, # Maximum tree depth (complexity of the trees) and minimum number of observations in a leaf in order to split
sample_rate = 0.5, nfolds = 5, # Row sampling rate (without replacement), between 0 and 1. In GBM and XGBoost, this value defaults to 1.
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
stopping_rounds = 50, stopping_metric = "RMSE", # Stop training early if the stopping_metric fails to improve over stopping_rounds consecutive scoring events
stopping_tolerance = 0, # The tolerance value by which a model must improve before training stops
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train & cross-validate a GBM via h2o.gbm()
base_gbm <- h2o.gbm(
x = X_house, y = Y_house, training_frame = train_h2o,
ntrees = 200, learn_rate = 0.05, # The rate at which GBM learns when building a model. Lower learning rates are preferable, but more trees will be required
max_depth = 10, min_rows = 10,
sample_rate = 0.8, nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
stopping_rounds = 50, stopping_metric = "RMSE",
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train a stacked ensemble using the above base models
ensemble_tree <- h2o.stackedEnsemble(
x = X_house, y = Y_house,
training_frame = train_h2o,
model_id = "ensemble_tree",
base_models = list(base_glm, base_rf, base_gbm), # Enter your base models here
metalearner_algorithm = "AUTO" # The algorithm learns the optimal combination of base learners. Here we use the default setting: GLM with non negative weights w/ standardization turned off
)
# Get results from base learners
# We first define the RMSE function
get_rmse <- function(model) {
results <- h2o.performance(model, newdata = test_h2o)
results@metrics$RMSE
}
# Get RMSE
base_rmse <- list(base_glm, base_rf, base_gbm) %>%
purrr::map_dbl(get_rmse) # Map the function you defined above to the three base models
# Get RMSE for the Stacked results
ensemble_rmse <- h2o.performance(ensemble_tree, newdata = test_h2o)@metrics$RMSE
# Combine the RMSE of the base learners with that of the meta learner
rmse <- c(base_rmse, ensemble_rmse)
rmse <- format(round(rmse, 3), nsmall = 3)
# Display model RMSE in a table
library(gridExtra)
library(grid)
tab_rmse <- data.frame(RMSE = rmse)
rownames(tab_rmse) <- c("GLM", "RandomForest", "GBM", "Ensemble")
theme_1 <- ttheme_default()
grid.arrange(tableGrob(tab_rmse, theme = theme_1))
# The ensemble model (a.k.a. the super learner) does have a relatively low RMSE; however, the improvement is marginal, and the base GLM's RMSE is still about 0.03% lower than the ensemble's.
(ensemble_rmse - base_rmse[1]) / ensemble_rmse
## [1] 0.0003274046
# Perhaps the correlation across the base learners, especially between the two tree-based learners, is to blame. Let's find out.
data.frame(
GLM_pred = as.vector(h2o.getFrame(base_glm@model$cross_validation_holdout_predictions_frame_id$name)),
RF_pred = as.vector(h2o.getFrame(base_rf@model$cross_validation_holdout_predictions_frame_id$name)),
GBM_pred = as.vector(h2o.getFrame(base_gbm@model$cross_validation_holdout_predictions_frame_id$name))
) %>% cor() # Get correlation between model predictions
## GLM_pred RF_pred GBM_pred
## GLM_pred 1.0000000 0.9794124 0.9883014
## RF_pred 0.9794124 1.0000000 0.9902745
## GBM_pred 0.9883014 0.9902745 1.0000000
# It turns out they are indeed highly correlated, which should be avoided.
# Let's take a further look at the ensemble model estimates. Since the meta learner (the ensemble model) is the optimal combination of its base learners, what about the "weight" of these level-one models? Does the ensemble model assign higher (or lower) weights to any of the base models? Unfortunately, h2o's Stacked Ensemble model does not directly supply such information in the traditional sense (like linear regression coefficients). However, you can estimate their relative importance using the metalearner coefficients if you use a linear metalearner. Let's do that.
# Extract the metalearner model, lots of subsetting to do ...
metalearner <- h2o.getModel(ensemble_tree@model$metalearner$name)
# Get the coefficients (i.e., weights of base models)
metalearner_coeffs <- h2o.coef(metalearner)
print(metalearner_coeffs)
## Intercept GLM_model_R_1745909452877_1
## -1.723607e+04 9.775170e-01
## DRF_model_R_1745909452877_2 GBM_model_R_1745909452877_3
## 2.293963e-02 1.355666e-02
# The meta learner gives a much higher weight to the GLM. In fact, if you tweak the code above, you will see that the performance of the meta learner is also almost entirely driven by the GLM base model.
# We can also get a glimpse of its relative importance using:
h2o.varimp(metalearner)
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 GLM_model_R_1745909452877_1 324096.312500 1.000000 0.965538
## 2 DRF_model_R_1745909452877_2 7115.145020 0.021954 0.021197
## 3 GBM_model_R_1745909452877_3 4452.344727 0.013738 0.013264
h2o.shutdown(prompt = FALSE) # Shut down h2o
Let's try using the ensemble approach to model data with a dichotomous outcome.
# Load cancer data
cancer <- read.csv(url("https://www.dropbox.com/scl/fi/ot23dm6h3immb9sv7l248/Cancer_Data.csv?rlkey=vv62t6hi34795v11epc72tpvo&st=7utqlcb3&dl=1"))
# remove id column
cancer <- cancer[, -c(1)]
# Examine the data structure and variable classes
names(cancer)
## [1] "diagnosis" "radius_mean"
## [3] "texture_mean" "perimeter_mean"
## [5] "area_mean" "smoothness_mean"
## [7] "compactness_mean" "concavity_mean"
## [9] "concave.points_mean" "symmetry_mean"
## [11] "fractal_dimension_mean" "radius_se"
## [13] "texture_se" "perimeter_se"
## [15] "area_se" "smoothness_se"
## [17] "compactness_se" "concavity_se"
## [19] "concave.points_se" "symmetry_se"
## [21] "fractal_dimension_se" "radius_worst"
## [23] "texture_worst" "perimeter_worst"
## [25] "area_worst" "smoothness_worst"
## [27] "compactness_worst" "concavity_worst"
## [29] "concave.points_worst" "symmetry_worst"
## [31] "fractal_dimension_worst"
str(cancer)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
head(cancer)
## diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 M 17.99 10.38 122.80 1001.0 0.11840
## 2 M 20.57 17.77 132.90 1326.0 0.08474
## 3 M 19.69 21.25 130.00 1203.0 0.10960
## 4 M 11.42 20.38 77.58 386.1 0.14250
## 5 M 20.29 14.34 135.10 1297.0 0.10030
## 6 M 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1 0.27760 0.3001 0.14710 0.2419
## 2 0.07864 0.0869 0.07017 0.1812
## 3 0.15990 0.1974 0.12790 0.2069
## 4 0.28390 0.2414 0.10520 0.2597
## 5 0.13280 0.1980 0.10430 0.1809
## 6 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.07871 1.0950 0.9053 8.589 153.40
## 2 0.05667 0.5435 0.7339 3.398 74.08
## 3 0.05999 0.7456 0.7869 4.585 94.03
## 4 0.09744 0.4956 1.1560 3.445 27.23
## 5 0.05883 0.7572 0.7813 5.438 94.44
## 6 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1 0.006399 0.04904 0.05373 0.01587 0.03003
## 2 0.005225 0.01308 0.01860 0.01340 0.01389
## 3 0.006150 0.04006 0.03832 0.02058 0.02250
## 4 0.009110 0.07458 0.05661 0.01867 0.05963
## 5 0.011490 0.02461 0.05688 0.01885 0.01756
## 6 0.007510 0.03345 0.03672 0.01137 0.02165
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1 0.006193 25.38 17.33 184.60 2019.0
## 2 0.003532 24.99 23.41 158.80 1956.0
## 3 0.004571 23.57 25.53 152.50 1709.0
## 4 0.009208 14.91 26.50 98.87 567.7
## 5 0.005115 22.54 16.67 152.20 1575.0
## 6 0.005082 15.47 23.75 103.40 741.6
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## 1 0.1622 0.6656 0.7119 0.2654
## 2 0.1238 0.1866 0.2416 0.1860
## 3 0.1444 0.4245 0.4504 0.2430
## 4 0.2098 0.8663 0.6869 0.2575
## 5 0.1374 0.2050 0.4000 0.1625
## 6 0.1791 0.5249 0.5355 0.1741
## symmetry_worst fractal_dimension_worst
## 1 0.4601 0.11890
## 2 0.2750 0.08902
## 3 0.3613 0.08758
## 4 0.6638 0.17300
## 5 0.2364 0.07678
## 6 0.3985 0.12440
# Check out the freq. dist of the outcome variable "diagnosis"
table(cancer$diagnosis)
##
## B M
## 357 212
# Do a 75-25 train-test split
set.seed(123) # for reproducibility
split_ind <- createDataPartition(cancer$diagnosis, p = 0.75, list=FALSE)
cancer_train <- cancer[split_ind,]
cancer_test <- cancer[-split_ind,]
# Initialize the h2o
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a80404039f9/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\RtmpigHSKx\file3a80538452ad/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 seconds 960 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 9 days
## H2O cluster name: H2O_started_from_R_hktse_uyg267
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 9 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Collapse infrequent categorical levels
cancer_collapse_labels <- recipe(diagnosis ~ ., data = cancer_train) %>%
step_other(all_nominal(), threshold = 0.005)
# Convert the outcome variable to a factor (do this on the train/test frames created above; converting the full "cancer" data at this point would not carry over to them)
cancer_train$diagnosis <- as.factor(cancer_train$diagnosis)
cancer_test$diagnosis <- as.factor(cancer_test$diagnosis)
# Need to convert the train and test dataframes to h2o format. Might take a while, be patient!
cancer_train_h2o <- prep(cancer_collapse_labels, training = cancer_train, retain = TRUE) %>%
juice() %>%
as.h2o()
cancer_test_h2o <- prep(cancer_collapse_labels, training = cancer_train) %>%
bake(new_data = cancer_test) %>%
as.h2o()
# Check out data class
class(cancer_train_h2o)
## [1] "H2OFrame"
class(cancer_test_h2o)
## [1] "H2OFrame"
# Split predictors/features (x) and the outcome variable (y) names. Only "names" are needed
Y_cancer <- "diagnosis"
X_cancer <- names(cancer_train_h2o[, -c(31)]) # "diagnosis" is in the 31st column
### Define the base learners ###
## Again, you must apply the same nfolds to all base learners
# Train & cross-validate an elastic net model via h2o.glm
cancer_base_glm <- h2o.glm(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o,
family = "binomial", # logit model on binary outcomes
alpha = 0.75, # elastic net
lambda = 0.01, # penalty parameter
remove_collinear_columns = FALSE, # when lambda != 0, set this to FALSE
nfolds = 5, fold_assignment = "Modulo", # "Modulo" evenly splits the dataset into the designated folds and does not depend on the seed
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
seed = 123
)
# Train & cross-validate a random forest model via h2o.randomForest()
# If your outcome variable is binary and coded as a factor, H2O's random forest will automatically recognize this as a classification problem.
cancer_base_rf <- h2o.randomForest(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o,
ntrees = 10, mtries = 5,
max_depth = 20, min_rows = 2,
sample_rate = 0.8, nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
stopping_rounds = 3, stopping_metric = "AUC", # stopping_rounds must be enabled for stopping_metric or stopping_tolerance to work
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train & cross-validate a GBM via h2o.gbm()
cancer_base_gbm <- h2o.gbm(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o,
distribution = "bernoulli", # If your response variable is of 2-class categorical input
ntrees = 50, learn_rate = 0.01,
max_depth = 5, min_rows = 5,
sample_rate = 0.5, nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
auc_type = "AUTO",
stopping_rounds = 3, stopping_metric = "AUC",
stopping_tolerance = 0,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.
# Train a stacked tree ensemble using the above base models
cancer_ensemble_tree <- h2o.stackedEnsemble(
x = X_cancer, y = Y_cancer, training_frame = cancer_train_h2o, model_id = "cancer_ensemble_tree",
base_models = list(cancer_base_glm, cancer_base_rf, cancer_base_gbm),
metalearner_algorithm = "AUTO"
)
# Get AUC from base learners
# Define the AUC function
cancer_get_auc <- function(model) {
results <- h2o.performance(model, newdata = cancer_test_h2o)
h2o.auc(results)
}
# Get AUC
cancer_base_auc <- list(cancer_base_glm, cancer_base_rf, cancer_base_gbm) %>%
purrr::map_dbl(cancer_get_auc)
# Get AUC for Stacked results
ensemble_auc <- h2o.auc(h2o.performance(cancer_ensemble_tree, newdata = cancer_test_h2o))
auc <- c(cancer_base_auc, ensemble_auc)
auc <- format(round(auc, 3), nsmall = 3)
# Display model AUC in a table
tab_auc <- data.frame(AUC = auc)
rownames(tab_auc) <- c("GLM", "RandomForest", "GBM", "Ensemble")
theme_1 <- ttheme_default()
grid.arrange(tableGrob(tab_auc, theme = theme_1)) # The higher the better
# The performance of the ensemble model is very close to the GLM but not higher
h2o.shutdown(prompt = FALSE) # Shut down
We can also apply the concept of stacking to grid search. By stacking the results generated from models with unique combinations of hyperparameters, we can find the best model according to the chosen metric (e.g., lowest RMSE for regression, highest AUC for classification).
In the two examples shown earlier, the GBM base models were not optimized. Perhaps we should do a bit of hyperparameter tuning. The example below demonstrates this stacking approach to grid search using GBM.
# h2o can very easily lose track of objects (i.e., your data), so let's restart h2o and reload everything!
cancer <- read.csv(url("https://www.dropbox.com/scl/fi/ot23dm6h3immb9sv7l248/Cancer_Data.csv?rlkey=vv62t6hi34795v11epc72tpvo&st=7utqlcb3&dl=1"))
# Remove id column
cancer <- cancer[, -c(1)]
# Convert categorical variables to factors
cancer$diagnosis <- as.factor(cancer$diagnosis)
# Train-test split
set.seed(123)
split_ind <- sample(1:nrow(cancer), size = 0.75 * nrow(cancer))
cancer_train <- cancer[split_ind, ]
cancer_test <- cancer[-split_ind, ]
# Re-initiate h2o
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\hktse\AppData\Local\Temp\RtmpsFijOh\file2c0c715a1fff/h2o_hktse_started_from_r.out
## C:\Users\hktse\AppData\Local\Temp\RtmpsFijOh\file2c0c2f382900/h2o_hktse_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 seconds 169 milliseconds
## H2O cluster timezone: Asia/Taipei
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 7 days
## H2O cluster name: H2O_started_from_R_hktse_hku862
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 7 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert to H2O format
cancer_train_h2o <- as.h2o(cancer_train)
cancer_test_h2o <- as.h2o(cancer_test)
# Define X and Y
Y_cancer <- "diagnosis"
X_cancer <- setdiff(names(cancer_train_h2o), Y_cancer)
# Confirm everything is (still) in h2o
h2o.ls()
## key
## 1 cancer_test_sid_b980_20
## 2 cancer_train_sid_b980_18
# As usual, and as a minimal example, we first define the hyperparameter grid
hyper_grid <- list(
max_depth = c(5, 10, 20, 50, 100, 200), # Maximum depth for each tree to be built
min_rows = c(1, 3, 5), # Minimum number of obs in a leaf for splitting
learn_rate = c(0.3, 0.1, 0.05, 0.01, 0.005),
learn_rate_annealing = c(0.98, 1), # Multiply the learning rate by this factor after each tree
sample_rate = c(0.5, 0.75, 1), # Row sampling
col_sample_rate = c(0.5, 0.75, 1) # Col sampling for each split
)
# Define random grid search criteria
search_criteria <- list(
strategy = "RandomDiscrete", # Tell the model when to stop the search by the following criteria: max number of models, max time, and/or metric-based early stopping
max_models = 50
)
# Perform random grid search via h2o.grid()
gbm_grid <- h2o.grid(
algorithm = "gbm", grid_id = "gbm_grid", x = X_cancer, y = Y_cancer,
training_frame = cancer_train_h2o,
hyper_params = hyper_grid,
search_criteria = search_criteria,
ntrees = 50,
stopping_metric = "AUC",
stopping_rounds = 5,
stopping_tolerance = 0,
nfolds = 5,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
seed = 123
)
# Sort results by AUC
random_grid_perf <- h2o.getGrid(
grid_id = "gbm_grid",
sort_by = "AUC",
decreasing = TRUE # Sort by AUC in decreasing order
)
random_grid_perf
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - learn_rate_annealing
## - max_depth
## - min_rows
## - sample_rate
## Number of models: 50
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing AUC
## col_sample_rate learn_rate learn_rate_annealing max_depth min_rows
## 1 0.75000 0.30000 0.98000 20.00000 5.00000
## 2 0.75000 0.30000 1.00000 10.00000 3.00000
## 3 0.50000 0.10000 0.98000 10.00000 5.00000
## 4 0.75000 0.05000 1.00000 10.00000 5.00000
## 5 0.50000 0.30000 0.98000 50.00000 1.00000
## sample_rate model_ids auc
## 1 1.00000 gbm_grid_model_28 0.98388
## 2 0.75000 gbm_grid_model_4 0.98230
## 3 0.75000 gbm_grid_model_11 0.97972
## 4 0.75000 gbm_grid_model_10 0.97835
## 5 0.50000 gbm_grid_model_5 0.97722
##
## ---
## col_sample_rate learn_rate learn_rate_annealing max_depth min_rows
## 45 0.50000 0.00500 0.98000 10.00000 1.00000
## 46 0.50000 0.00500 0.98000 200.00000 5.00000
## 47 0.50000 0.00500 0.98000 20.00000 5.00000
## 48 1.00000 0.01000 0.98000 10.00000 5.00000
## 49 0.75000 0.00500 0.98000 50.00000 3.00000
## 50 1.00000 0.10000 0.98000 50.00000 1.00000
## sample_rate model_ids auc
## 45 0.50000 gbm_grid_model_18 0.95141
## 46 0.50000 gbm_grid_model_14 0.95096
## 47 0.50000 gbm_grid_model_26 0.95096
## 48 1.00000 gbm_grid_model_32 0.94809
## 49 0.50000 gbm_grid_model_16 0.94378
## 50 1.00000 gbm_grid_model_49 0.92001
# Get the model_id of the top-performing model based on AUC (subset the best performer's id with @model_ids)
best_model_id <- random_grid_perf@model_ids[[1]]
best_model <- h2o.getModel(best_model_id) # Look for the model indexed by this id
h2o.performance(best_model, newdata = cancer_test_h2o)
## H2OBinomialMetrics: gbm
##
## MSE: 0.04225112
## RMSE: 0.2055508
## LogLoss: 0.1410415
## Mean Per-Class Error: 0.04880952
## AUC: 0.9902778
## AUCPR: 0.988111
## Gini: 0.9805556
## R^2: 0.8285728
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## B M Error Rate
## B 76 4 0.050000 =4/80
## M 3 60 0.047619 =3/63
## Totals 79 64 0.048951 =7/143
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.524929 0.944882 63
## 2 max f2 0.044910 0.966258 73
## 3 max f0point5 0.796560 0.966102 57
## 4 max accuracy 0.796560 0.951049 57
## 5 max precision 0.994307 1.000000 0
## 6 max recall 0.044910 1.000000 73
## 7 max specificity 0.994307 1.000000 0
## 8 max absolute_mcc 0.796560 0.902161 57
## 9 max min_per_class_accuracy 0.524929 0.950000 63
## 10 max mean_per_class_accuracy 0.524929 0.951190 63
## 11 max tns 0.994307 80.000000 0
## 12 max fns 0.994307 62.000000 0
## 13 max fps 0.002966 80.000000 142
## 14 max tps 0.044910 63.000000 73
## 15 max tnr 0.994307 1.000000 0
## 16 max fnr 0.994307 0.984127 0
## 17 max fpr 0.002966 1.000000 142
## 18 max tpr 0.044910 1.000000 73
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# The AUC is very high!
# We can also stack the top-performing models from grid search to train the meta learner.
# Train a stacked ensemble using the GBM grid
ensemble_gbm_grid <- h2o.stackedEnsemble(
x = X_cancer, y = Y_cancer,
training_frame = cancer_train_h2o,
model_id = "ensemble_gbm_grid",
base_models = gbm_grid@model_ids, # The GBM grid you trained earlier with h2o.grid(); @model_ids includes every model in the grid
metalearner_algorithm = "gbm"
)
# Evaluate ensemble performance on test data
h2o.performance(ensemble_gbm_grid, newdata = cancer_test_h2o)
## H2OBinomialMetrics: stackedensemble
##
## MSE: 0.0469671
## RMSE: 0.2167189
## LogLoss: 0.1691067
## Mean Per-Class Error: 0.05555556
## AUC: 0.9793651
## AUCPR: 0.9831875
## Gini: 0.9587302
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## B M Error Rate
## B 80 0 0.000000 =0/80
## M 7 56 0.111111 =7/63
## Totals 87 56 0.048951 =7/143
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.981867 0.941176 14
## 2 max f2 0.065958 0.953125 26
## 3 max f0point5 0.981867 0.975610 14
## 4 max accuracy 0.981867 0.951049 14
## 5 max precision 0.995783 1.000000 0
## 6 max recall 0.007972 1.000000 46
## 7 max specificity 0.995783 1.000000 0
## 8 max absolute_mcc 0.981867 0.904085 14
## 9 max min_per_class_accuracy 0.176520 0.937500 23
## 10 max mean_per_class_accuracy 0.176520 0.944940 23
## 11 max tns 0.995783 80.000000 0
## 12 max fns 0.995783 62.000000 0
## 13 max fps 0.003539 80.000000 57
## 14 max tps 0.007972 63.000000 46
## 15 max tnr 0.995783 1.000000 0
## 16 max fnr 0.995783 0.984127 0
## 17 max fpr 0.003539 1.000000 57
## 18 max tpr 0.007972 1.000000 46
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# However, the AUC is lower than the best-performing gbm_grid model (random_grid_perf@model_ids[[1]])
h2o.shutdown(prompt = FALSE)
In the previous example, we saw that prediction accuracy may actually be compromised by stacking when some base models already provide accurate predictions. Residual Stacking (RS) has been proposed to mitigate this problem. In RS, all information learned by the level-one base models is retained. In the stacking stage, each outcome's residual is modeled using the fitted values of all the other outcomes (excluding the outcome itself) as predictors. Finally, the level-one predictions are revised using the predicted residuals.
We will demonstrate this below using the \(\textsf{MTPS}\) package.
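To make the idea concrete before turning to \(\textsf{MTPS}\), here is a minimal base-R sketch of residual stacking for two correlated continuous outcomes on simulated toy data (purely illustrative; MTPS itself adds cross-validation and supports many other base and meta learners):
# Toy residual-stacking sketch (not the MTPS implementation)
set.seed(1)
n <- 300
x <- matrix(rnorm(n * 3), n, 3)
y1 <- x[, 1] + 0.5 * x[, 2] + rnorm(n)
y2 <- 0.8 * y1 + x[, 3] + rnorm(n) # outcome 2 is related to outcome 1
# Step 1: fit a separate level-one model for each outcome
fit1 <- lm(y1 ~ x); yhat1 <- fitted(fit1)
fit2 <- lm(y2 ~ x); yhat2 <- fitted(fit2)
# Step 2: model the residual of outcome 2 using the fitted values of the other outcome
step2 <- lm(I(y2 - yhat2) ~ yhat1)
# Revise the level-one prediction for outcome 2 with the predicted residual
yhat2_rs <- yhat2 + fitted(step2)
# In-sample comparison (illustrative only)
c(level_one = sqrt(mean((y2 - yhat2)^2)), residual_stacking = sqrt(mean((y2 - yhat2_rs)^2)))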
library(MTPS)
library(caret)
library(ggplot2) # for plotting
library(reshape2) # for data wrangling
# We'll be using the HIV Drug Resistance Database in the package. In this database, resistance to five Nucleoside RT Inhibitor (NRTI) drugs is used as the multivariate outcome: Lamivudine (3TC), Abacavir (ABC), Zidovudine (AZT), Stavudine (D4T), and Didanosine (DDI). The outcome data "YY" is a 1246 x 5 matrix, and the predictor data "XX" is a 1246 x 228 matrix.
data("HIV")
# Take a look at the data
head(YY)
## ABC 3TC AZT D4T DDI
## 1 0.63346846 2.3010300 0.59106461 0.14612804 0.07918125
## 3 0.04139269 0.1461280 1.44715803 0.00000000 -0.09691001
## 4 0.17609126 0.2552725 0.85125835 0.07918125 0.04139269
## 6 0.85125835 2.3010300 -0.09691001 0.11394335 0.27875360
## 7 0.71600334 0.4065402 0.82607480 0.73239376 0.72427587
## 8 0.77085201 2.3010300 0.27875360 0.11394335 0.23044892
XX[1:4, 5:10]
## X.8I X.8V X.11K X.11R X.20K X.20R
## 1 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 1 0 0
## 6 0 0 0 0 0 0
dim(YY)
## [1] 1246 5
dim(XX)
## [1] 1246 228
# This is a rather large database
# First, convert the data to matrix format
set.seed(123)
xmat <- as.matrix(XX)
ymat <- as.matrix(YY)
nobs <- nrow(xmat)
# Do a 75-25 train-test split
split_ind <- createDataPartition(ymat[,1], p = 0.75, list=FALSE) # This is doable since XX and YY contain the same number of rows
y_train <- ymat[split_ind, ]
y_test <- ymat[-split_ind, ]
x_train <- xmat[split_ind, ]
x_test <- xmat[-split_ind, ]
# The multiFit() function is a non-stacking algorithm that fits individual models for each outcome separately. The following code fits generalized linear models with ridge regression on each outcome.
multi.fit <- multiFit(xmat = x_train, ymat = y_train, method = glmnet.ridge, family = "gaussian")
# Use the base R's predict() function to return predicted values
multi.pred <- predict(multi.fit, x_test)
# To set up the stacking algorithm, we need to specify the base (step 1) and meta (step 2) learner algorithms. As we are dealing with continuous outcomes, we set family = "gaussian". The cv and residual arguments specify whether we want to use cross-validation stacking (CVS), residual stacking (RS), or their combination. The default value of cv is FALSE and of residual is TRUE, denoting residual stacking.
# The code below fits models using standard stacking (SS), cross-validation stacking (CVS), residual stacking (RS), and the combination of the latter two, cross-validation residual stacking (CVRS), via the MTPS() function. Note how the settings of the cv and residual arguments vary.
# The following models use ridge regression as the base models in Step 1 and a tree method as the meta model in Step 2.
# Standard stacking
ss.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = FALSE, residual = FALSE, # cv and residual all set to FALSE
method.step1 = glmnet.ridge, # Base learner
method.step2 = rpart1) # Meta learner
# Predict on testing data
ss.pred <- predict(ss.fit, x_test)
# Cross-validation Stacking
cv.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = TRUE, residual = FALSE, # cv set to TRUE
method.step1 = glmnet.ridge,
method.step2 = rpart1)
# Predict on testing data
cv.pred <- predict(cv.fit, x_test)
# Residual Stacking
rs.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = FALSE, residual = TRUE, # residual set to TRUE
method.step1 = glmnet.ridge,
method.step2 = rpart1)
# Predict on testing data
rs.pred <- predict(rs.fit, x_test)
# Cross-validation residual stacking
cvrs.fit <- MTPS(xmat = x_train, ymat = y_train, family = "gaussian",
cv = TRUE, residual = TRUE, # all set to TRUE
method.step1 = glmnet.ridge,
method.step2 = rpart1)
# Predict on testing data
cvrs.pred <- predict(cvrs.fit, x_test)
n_test <- nrow(x_test) # Calculate the dimension of the test data
ctn.plot.data_matrix <- cbind(rbind(multi.pred, ss.pred, cv.pred, rs.pred, cvrs.pred), y_test[rep(seq_len(n_test), 5), ]) # Stack the five methods' predictions row-wise and append the matching actual values (5 * n_test rows by 10 columns)
ctn.plot.data <- data.frame(rep(c("No-Stacking", "SS", "CV", "RS", "CVRS"), each = n_test), ctn.plot.data_matrix) # Prepend a method label column of length 5 * n_test
colnames(ctn.plot.data) <- c("method", paste0(rep("pred.", ncol(y_test)), colnames(y_test)), colnames(y_test)) # Rename the columns
# Reshape the actual values into long format (method, outcome, value)
dm1 <- melt(ctn.plot.data[,c("method","ABC","3TC","AZT","D4T", "DDI")], id=c("method"))
# Reshape the predicted values into long format
dm2 <- melt(ctn.plot.data[,c("method","pred.ABC","pred.3TC","pred.AZT","pred.D4T", "pred.DDI")], id=c("method"))
# Combine dm1 and dm2
ctn.plot.data <- cbind(dm1, dm2[, -1])
# Rename the columns
colnames(ctn.plot.data) <- c("method", "Y", "yVal", "predict", "predictVal")
# Convert the values within the dataframe: method name to factor class, actual and predicted values to numeric
ctn.plot.data$method <- factor(ctn.plot.data$method, unique(as.character(ctn.plot.data$method)))
ctn.plot.data$yVal <- as.numeric(ctn.plot.data$yVal)
ctn.plot.data$predictVal <- as.numeric(ctn.plot.data$predictVal)
# Render the actual and predicted values into a series of 5 x 5 bivariate plots
ggplot(ctn.plot.data) +
geom_point(aes(x=predictVal, y=yVal, color=method), size = 0.5, alpha = 0.6) +
geom_abline(slope=1, alpha = 0.2) + # 45 degree line
coord_fixed() +
ylab("Testing Data Outcome") + xlab("Predicted Outcome on Testing Data") +
scale_x_discrete(breaks = NULL) +
scale_y_discrete(breaks = NULL) +
theme_bw() +
theme(axis.text = element_blank(),
strip.placement = "outside",
strip.background = element_blank()) +
facet_grid(Y~method) # display subplots by method
# We see that the RS and CVRS models' prediction patterns are very similar to those of the non-stacking model, while the prediction patterns of SS and CV are nearly identical to each other.
The \(\textsf{MTPS}\) package also allows simultaneous modeling of mixed outcomes, such as a combination of continuous and binary outcomes. One only needs to specify each outcome's type in the family argument. The following example demonstrates this using ridge regression as the base model for the first two (continuous) outcomes and a logit model for the third (binary) outcome, followed by a tree method as the meta model.
Before we begin, a couple of conversion and pre-processing tasks need to be done.
# Dichotomize the last outcome variable to binary scale.
cutoff <- 1.5 # Define the cutoff value
ymat.bin <- ymat
xmat.bin <- xmat
ymat.bin[,5] <- (10^ymat[,5] < cutoff) * 1
# Combine the dichotomized last outcome column with the first two original outcome columns
ymat.mix <- cbind(ymat[,1:2], ymat.bin[,5])
xmat.mix <- xmat # X variables remain the same
# Use the previously-generated split indicator (split_ind) for train-test split
y.train.mix <- ymat.mix[split_ind, ]
y.test.mix <- ymat.mix[-split_ind, ]
x.train.mix <- xmat.mix[split_ind, ]
x.test.mix <- xmat.mix[-split_ind, ]
# Model fitting: ridge models as Step 1 learners, tree method as Step 2 learner
# Use residual stacking (RS)
rs.mix.fit <- MTPS(xmat = x.train.mix, ymat = y.train.mix,
family=c("gaussian","gaussian","binomial"), # The first two outcome variables are continuous, the last one is binomial
cv = FALSE, residual = TRUE,
method.step1 = glmnet.ridge,
method.step2 = rpart1)
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
## Warning in log(yhat): NaNs produced
## Warning in sqrt(-2 * log(yhat)): NaNs produced
## Warning in log(1 - yhat): NaNs produced
## Warning in sqrt(-2 * (log(1 - yhat))): NaNs produced
# Predict on testing data
rs.mix.pred <- predict(rs.mix.fit, x.test.mix)
## Plot the predicted values against the actual values for the first two outcomes
mix.plot.data <- cbind(rep(colnames(y.test.mix)[1:2], each = nrow(y.test.mix)), # Repeat each of the first two outcome names nrow(y.test.mix) times as a label column
rbind(cbind(rs.mix.pred[, 1], y.test.mix[, 1]), # Append predicted values and actual values of the test data
cbind(rs.mix.pred[, 2], y.test.mix[, 2])))
colnames(mix.plot.data) <- c("Y", "predict", "outcome") # Rename columns
mix.plot.data <- as.data.frame(mix.plot.data) # Convert to data frame
# Convert the predicted and actual values within the dataframe to numeric class
mix.plot.data$predict <- as.numeric(as.character(mix.plot.data$predict))
mix.plot.data$outcome <- as.numeric(as.character(mix.plot.data$outcome))
ggplot(mix.plot.data) +
geom_point(aes(x=predict, y=outcome, color=Y), size = 0.5, alpha = 0.6) +
ylab("Outcome of Testing Data") + xlab("Predicted Outcome of Testing Data") +
scale_x_discrete(breaks = NULL) +
scale_y_discrete(breaks = NULL) +
geom_abline(slope=1, alpha = 0.2) +
coord_fixed() +
theme_bw() +
theme(legend.title = element_blank(),
axis.text = element_blank(),
strip.placement = "outside",
strip.background = element_blank()) +
facet_grid(~Y) # Render by outcome variable
# Print and calculate the prediction accuracy of the third outcome variable
print(colnames(y.test.mix)[3])
## [1] ""
print(table((rs.mix.pred[,3] > 0.5) * 1, y.test.mix[,3]))
##
## 0 1
## 0 130 40
## 1 19 121
# Accuracy
mix.accuracy <- table((rs.mix.pred[,3] > 0.5) * 1, y.test.mix[,3])
(mix.accuracy[1, 1] + mix.accuracy[2, 2]) / sum(mix.accuracy)
## [1] 0.8096774