Set up
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.4 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.3.1 ✔ stringr 1.5.1
## ✔ readr 2.1.2 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.3
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.3.0
## ✔ recipes 1.0.9
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:h2o':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
##
## Attaching package: 'TTR'
##
## The following object is masked from 'package:dials':
##
## momentum
##
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
Import Data
# Load the cleaned data set and convert every character column to a factor,
# because H2O requires all variables to be either numeric or factors.
data <- mutate(
  read_csv("../00_data/data_wrangled/data_clean.csv"),
  across(where(is.character), factor)
)
## Rows: 1470 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (24): Age, DailyRate, DistanceFromHome, Education, EmployeeNumber, Envir...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Split Data
# Fix the RNG seed so the train/test partition is reproducible.
set.seed(1234)

# Stratify on Attrition so both partitions keep a similar class balance.
# No `prop` is supplied, so the default split proportion of initial_split()
# applies.
data_split <- data %>%
  initial_split(strata = "Attrition")
train_tbl <- data_split %>% training()
test_tbl <- data_split %>% testing()
Recipes
# Preprocessing recipe: model Attrition on every other column and drop
# predictors that have zero variance.
# NOTE(review): in the visible part of this file the recipe is never prepped
# or baked -- the H2O frames are built from train_tbl/test_tbl directly.
# Confirm whether the baked data should be used instead.
recipe_obj <- step_zv(
  recipe(Attrition ~ ., data = train_tbl),
  all_predictors()
)
Model
# Initialize H2O: connect this R session to an H2O cluster. The startup log
# printed below reports the cluster and connection details.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 6 minutes 56 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 4 months and 3 days
## H2O cluster name: H2O_started_from_R_jasonzink_qxv383
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.58 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.1 (2022-06-23)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (4 months and 3 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the training tibble to the H2O cluster and split it into two frames
# (ratio 0.85 for the first one); the seed makes the split repeatable.
split.h2o <- h2o.splitFrame(
  data   = as.h2o(train_tbl),
  ratios = 0.85,
  seed   = 2567
)
##
|
| | 0%
|
|======================================================================| 100%
# The split is a list of H2O frames: the first element is used for training,
# the second for validation.
train_h2o <- split.h2o[[1L]]
valid_h2o <- split.h2o[[2L]]

# Upload the held-out test tibble to the cluster as well.
test_h2o <- as.h2o(test_tbl)
##
|
| | 0%
|
|======================================================================| 100%
# Response column, and predictors = every other column of the training data.
y <- "Attrition"
x <- names(train_tbl) %>% setdiff(y)

# Run H2O AutoML for at most 60 seconds with 5-fold cross-validation.
# The run log below warns that, with cross-validation enabled, the validation
# frame only provides informative metrics.
# NOTE(review): leaderboard_frame = test_h2o ranks the models (and therefore
# picks the leader) on the same test set that is later used for evaluation,
# so the reported test performance is likely optimistic. Consider omitting
# leaderboard_frame so the leaderboard relies on cross-validation metrics.
auto_ml_models_h2o <- h2o.automl(
  training_frame    = train_h2o,
  validation_frame  = valid_h2o,
  leaderboard_frame = test_h2o,
  x                 = x,
  y                 = y,
  nfolds            = 5,
  max_runtime_secs  = 60,
  seed              = 3456
)
##
|
| | 0%
|
|= | 2%
## 20:08:43.167: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
|
|==== | 5%
|
|====== | 9%
|
|========= | 12%
|
|=========== | 16%
|
|============= | 19%
|
|================ | 23%
|
|=================== | 26%
|
|===================== | 30%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 41%
|
|=============================== | 44%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|============================================ | 62%
|
|============================================== | 66%
|
|================================================= | 69%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|=========================================================== | 84%
|
|============================================================= | 87%
|
|=============================================================== | 91%
|
|================================================================== | 94%
|
|==================================================================== | 98%
|
|======================================================================| 100%
# Print the AutoML leaderboard: one row per trained model with its metrics.
auto_ml_models_h2o@leaderboard
## model_id auc logloss
## 1 StackedEnsemble_BestOfFamily_4_AutoML_6_20240423_200843 0.8355987 0.3209535
## 2 StackedEnsemble_BestOfFamily_2_AutoML_6_20240423_200843 0.8313376 0.3260096
## 3 StackedEnsemble_BestOfFamily_3_AutoML_6_20240423_200843 0.8307983 0.3257974
## 4 StackedEnsemble_BestOfFamily_1_AutoML_6_20240423_200843 0.8290723 0.3271145
## 5 StackedEnsemble_AllModels_3_AutoML_6_20240423_200843 0.8278317 0.3275211
## 6 GLM_1_AutoML_6_20240423_200843 0.8269687 0.3310361
## aucpr mean_per_class_error rmse mse
## 1 0.9530080 0.3180421 0.3058609 0.09355091
## 2 0.9528207 0.2863269 0.3086916 0.09529052
## 3 0.9522887 0.2863269 0.3087046 0.09529853
## 4 0.9504737 0.2930421 0.3087986 0.09535655
## 5 0.9506850 0.3180421 0.3108135 0.09660503
## 6 0.9469916 0.2796117 0.3081993 0.09498682
##
## [54 rows x 7 columns]
# Print the leader, i.e. the model in the first row of the leaderboard.
auto_ml_models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_4_AutoML_6_20240423_200843
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 5/6
## 3 # GBM base models (used / total) 1/1
## 4 # XGBoost base models (used / total) 1/1
## 5 # GLM base models (used / total) 1/1
## 6 # DeepLearning base models (used / total) 1/1
## 7 # DRF base models (used / total) 1/2
## 8 Metalearner algorithm GLM
## 9 Metalearner fold assignment scheme Random
## 10 Metalearner nfolds 5
## 11 Metalearner fold_column NA
## 12 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.05291333
## RMSE: 0.230029
## LogLoss: 0.1951596
## Mean Per-Class Error: 0.1188421
## AUC: 0.9685078
## AUCPR: 0.9923538
## Gini: 0.9370155
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 120 34 0.220779 =34/154
## No 13 756 0.016905 =13/769
## Totals 133 790 0.050921 =47/923
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.643229 0.969852 284
## 2 max f2 0.505609 0.983312 309
## 3 max f0point5 0.751266 0.970310 240
## 4 max accuracy 0.643229 0.949079 284
## 5 max precision 0.999451 1.000000 0
## 6 max recall 0.287542 1.000000 354
## 7 max specificity 0.999451 1.000000 0
## 8 max absolute_mcc 0.643229 0.809318 284
## 9 max min_per_class_accuracy 0.779128 0.915584 226
## 10 max mean_per_class_accuracy 0.751266 0.918789 240
## 11 max tns 0.999451 154.000000 0
## 12 max fns 0.999451 767.000000 0
## 13 max fps 0.043157 154.000000 399
## 14 max tps 0.287542 769.000000 354
## 15 max tnr 0.999451 1.000000 0
## 16 max fnr 0.999451 0.997399 0
## 17 max fpr 0.043157 1.000000 399
## 18 max tpr 0.287542 1.000000 354
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 0.06777477
## RMSE: 0.2603359
## LogLoss: 0.2616012
## Mean Per-Class Error: 0.143338
## AUC: 0.8692847
## AUCPR: 0.9579188
## Gini: 0.7385694
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 17 6 0.260870 =6/23
## No 4 151 0.025806 =4/155
## Totals 21 157 0.056180 =10/178
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.631304 0.967949 156
## 2 max f2 0.461018 0.979772 170
## 3 max f0point5 0.631304 0.964240 156
## 4 max accuracy 0.631304 0.943820 156
## 5 max precision 0.998916 1.000000 0
## 6 max recall 0.461018 1.000000 170
## 7 max specificity 0.998916 1.000000 0
## 8 max absolute_mcc 0.631304 0.741749 156
## 9 max min_per_class_accuracy 0.821364 0.782609 126
## 10 max mean_per_class_accuracy 0.631304 0.856662 156
## 11 max tns 0.998916 23.000000 0
## 12 max fns 0.998916 154.000000 0
## 13 max fps 0.035945 23.000000 177
## 14 max tps 0.461018 155.000000 170
## 15 max tnr 0.998916 1.000000 0
## 16 max fnr 0.998916 0.993548 0
## 17 max fpr 0.035945 1.000000 177
## 18 max tpr 0.461018 1.000000 170
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.09823331
## RMSE: 0.3134219
## LogLoss: 0.3375866
## Mean Per-Class Error: 0.3181987
## AUC: 0.8283781
## AUCPR: 0.94296
## Gini: 0.6567561
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 60 94 0.610390 =94/154
## No 20 749 0.026008 =20/769
## Totals 80 843 0.123510 =114/923
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.526749 0.929280 331
## 2 max f2 0.279570 0.965552 379
## 3 max f0point5 0.701396 0.920189 273
## 4 max accuracy 0.526749 0.876490 331
## 5 max precision 0.999357 1.000000 0
## 6 max recall 0.185843 1.000000 392
## 7 max specificity 0.999357 1.000000 0
## 8 max absolute_mcc 0.701396 0.516693 273
## 9 max min_per_class_accuracy 0.842116 0.741222 186
## 10 max mean_per_class_accuracy 0.727951 0.772651 254
## 11 max tns 0.999357 154.000000 0
## 12 max fns 0.999357 767.000000 0
## 13 max fps 0.049075 154.000000 399
## 14 max tps 0.185843 769.000000 392
## 15 max tnr 0.999357 1.000000 0
## 16 max fnr 0.999357 0.997399 0
## 17 max fpr 0.049075 1.000000 399
## 18 max tpr 0.185843 1.000000 392
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.884252 0.028423 0.913044 0.848168 0.864407 0.884817
## auc 0.828147 0.045895 0.856943 0.847875 0.800699 0.873428
## err 0.115747 0.028423 0.086957 0.151832 0.135593 0.115183
## err_count 21.400000 5.727129 18.000000 29.000000 24.000000 22.000000
## f0point5 0.906857 0.028576 0.941241 0.872162 0.886792 0.905421
## cv_5_valid
## accuracy 0.910828
## auc 0.761791
## err 0.089172
## err_count 14.000000
## f0point5 0.928668
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.889983 0.035127 0.935135 0.848837 0.865031
## r2 0.273542 0.069636 0.331446 0.287203 0.252507
## recall 0.982518 0.010079 0.966480 0.979866 0.986014
## residual_deviance 123.393105 21.598175 115.414230 149.762400 139.635040
## rmse 0.313267 0.030630 0.279643 0.349677 0.340595
## specificity 0.391620 0.108602 0.571429 0.380952 0.352941
## cv_4_valid cv_5_valid
## precision 0.887006 0.913907
## r2 0.332356 0.164200
## recall 0.987421 0.992806
## residual_deviance 117.109146 95.044685
## rmse 0.305149 0.291270
## specificity 0.375000 0.277778
# Keep a handle on the leader model for the prediction and evaluation steps,
# then print its summary.
best_model <- slot(auto_ml_models_h2o, "leader")
best_model
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_4_AutoML_6_20240423_200843
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 5/6
## 3 # GBM base models (used / total) 1/1
## 4 # XGBoost base models (used / total) 1/1
## 5 # GLM base models (used / total) 1/1
## 6 # DeepLearning base models (used / total) 1/1
## 7 # DRF base models (used / total) 1/2
## 8 Metalearner algorithm GLM
## 9 Metalearner fold assignment scheme Random
## 10 Metalearner nfolds 5
## 11 Metalearner fold_column NA
## 12 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.05291333
## RMSE: 0.230029
## LogLoss: 0.1951596
## Mean Per-Class Error: 0.1188421
## AUC: 0.9685078
## AUCPR: 0.9923538
## Gini: 0.9370155
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 120 34 0.220779 =34/154
## No 13 756 0.016905 =13/769
## Totals 133 790 0.050921 =47/923
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.643229 0.969852 284
## 2 max f2 0.505609 0.983312 309
## 3 max f0point5 0.751266 0.970310 240
## 4 max accuracy 0.643229 0.949079 284
## 5 max precision 0.999451 1.000000 0
## 6 max recall 0.287542 1.000000 354
## 7 max specificity 0.999451 1.000000 0
## 8 max absolute_mcc 0.643229 0.809318 284
## 9 max min_per_class_accuracy 0.779128 0.915584 226
## 10 max mean_per_class_accuracy 0.751266 0.918789 240
## 11 max tns 0.999451 154.000000 0
## 12 max fns 0.999451 767.000000 0
## 13 max fps 0.043157 154.000000 399
## 14 max tps 0.287542 769.000000 354
## 15 max tnr 0.999451 1.000000 0
## 16 max fnr 0.999451 0.997399 0
## 17 max fpr 0.043157 1.000000 399
## 18 max tpr 0.287542 1.000000 354
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 0.06777477
## RMSE: 0.2603359
## LogLoss: 0.2616012
## Mean Per-Class Error: 0.143338
## AUC: 0.8692847
## AUCPR: 0.9579188
## Gini: 0.7385694
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 17 6 0.260870 =6/23
## No 4 151 0.025806 =4/155
## Totals 21 157 0.056180 =10/178
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.631304 0.967949 156
## 2 max f2 0.461018 0.979772 170
## 3 max f0point5 0.631304 0.964240 156
## 4 max accuracy 0.631304 0.943820 156
## 5 max precision 0.998916 1.000000 0
## 6 max recall 0.461018 1.000000 170
## 7 max specificity 0.998916 1.000000 0
## 8 max absolute_mcc 0.631304 0.741749 156
## 9 max min_per_class_accuracy 0.821364 0.782609 126
## 10 max mean_per_class_accuracy 0.631304 0.856662 156
## 11 max tns 0.998916 23.000000 0
## 12 max fns 0.998916 154.000000 0
## 13 max fps 0.035945 23.000000 177
## 14 max tps 0.461018 155.000000 170
## 15 max tnr 0.998916 1.000000 0
## 16 max fnr 0.998916 0.993548 0
## 17 max fpr 0.035945 1.000000 177
## 18 max tpr 0.461018 1.000000 170
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.09823331
## RMSE: 0.3134219
## LogLoss: 0.3375866
## Mean Per-Class Error: 0.3181987
## AUC: 0.8283781
## AUCPR: 0.94296
## Gini: 0.6567561
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 60 94 0.610390 =94/154
## No 20 749 0.026008 =20/769
## Totals 80 843 0.123510 =114/923
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.526749 0.929280 331
## 2 max f2 0.279570 0.965552 379
## 3 max f0point5 0.701396 0.920189 273
## 4 max accuracy 0.526749 0.876490 331
## 5 max precision 0.999357 1.000000 0
## 6 max recall 0.185843 1.000000 392
## 7 max specificity 0.999357 1.000000 0
## 8 max absolute_mcc 0.701396 0.516693 273
## 9 max min_per_class_accuracy 0.842116 0.741222 186
## 10 max mean_per_class_accuracy 0.727951 0.772651 254
## 11 max tns 0.999357 154.000000 0
## 12 max fns 0.999357 767.000000 0
## 13 max fps 0.049075 154.000000 399
## 14 max tps 0.185843 769.000000 392
## 15 max tnr 0.999357 1.000000 0
## 16 max fnr 0.999357 0.997399 0
## 17 max fpr 0.049075 1.000000 399
## 18 max tpr 0.185843 1.000000 392
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.884252 0.028423 0.913044 0.848168 0.864407 0.884817
## auc 0.828147 0.045895 0.856943 0.847875 0.800699 0.873428
## err 0.115747 0.028423 0.086957 0.151832 0.135593 0.115183
## err_count 21.400000 5.727129 18.000000 29.000000 24.000000 22.000000
## f0point5 0.906857 0.028576 0.941241 0.872162 0.886792 0.905421
## cv_5_valid
## accuracy 0.910828
## auc 0.761791
## err 0.089172
## err_count 14.000000
## f0point5 0.928668
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.889983 0.035127 0.935135 0.848837 0.865031
## r2 0.273542 0.069636 0.331446 0.287203 0.252507
## recall 0.982518 0.010079 0.966480 0.979866 0.986014
## residual_deviance 123.393105 21.598175 115.414230 149.762400 139.635040
## rmse 0.313267 0.030630 0.279643 0.349677 0.340595
## specificity 0.391620 0.108602 0.571429 0.380952 0.352941
## cv_4_valid cv_5_valid
## precision 0.887006 0.913907
## r2 0.332356 0.164200
## recall 0.987421 0.992806
## residual_deviance 117.109146 95.044685
## rmse 0.305149 0.291270
## specificity 0.375000 0.277778
Examine the Output of h2o.automl()
# The AutoML result is an S4 object.
typeof(auto_ml_models_h2o)
## [1] "S4"
# List the slots of the AutoML object that can be accessed with `@`.
slotNames(auto_ml_models_h2o)
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
# `leaderboard` slot: the table of trained models and their metrics.
auto_ml_models_h2o@leaderboard
## model_id auc logloss
## 1 StackedEnsemble_BestOfFamily_4_AutoML_6_20240423_200843 0.8355987 0.3209535
## 2 StackedEnsemble_BestOfFamily_2_AutoML_6_20240423_200843 0.8313376 0.3260096
## 3 StackedEnsemble_BestOfFamily_3_AutoML_6_20240423_200843 0.8307983 0.3257974
## 4 StackedEnsemble_BestOfFamily_1_AutoML_6_20240423_200843 0.8290723 0.3271145
## 5 StackedEnsemble_AllModels_3_AutoML_6_20240423_200843 0.8278317 0.3275211
## 6 GLM_1_AutoML_6_20240423_200843 0.8269687 0.3310361
## aucpr mean_per_class_error rmse mse
## 1 0.9530080 0.3180421 0.3058609 0.09355091
## 2 0.9528207 0.2863269 0.3086916 0.09529052
## 3 0.9522887 0.2863269 0.3087046 0.09529853
## 4 0.9504737 0.2930421 0.3087986 0.09535655
## 5 0.9506850 0.3180421 0.3108135 0.09660503
## 6 0.9469916 0.2796117 0.3081993 0.09498682
##
## [54 rows x 7 columns]
# `leader` slot: the model in the first row of the leaderboard.
auto_ml_models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_4_AutoML_6_20240423_200843
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 5/6
## 3 # GBM base models (used / total) 1/1
## 4 # XGBoost base models (used / total) 1/1
## 5 # GLM base models (used / total) 1/1
## 6 # DeepLearning base models (used / total) 1/1
## 7 # DRF base models (used / total) 1/2
## 8 Metalearner algorithm GLM
## 9 Metalearner fold assignment scheme Random
## 10 Metalearner nfolds 5
## 11 Metalearner fold_column NA
## 12 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.05291333
## RMSE: 0.230029
## LogLoss: 0.1951596
## Mean Per-Class Error: 0.1188421
## AUC: 0.9685078
## AUCPR: 0.9923538
## Gini: 0.9370155
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 120 34 0.220779 =34/154
## No 13 756 0.016905 =13/769
## Totals 133 790 0.050921 =47/923
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.643229 0.969852 284
## 2 max f2 0.505609 0.983312 309
## 3 max f0point5 0.751266 0.970310 240
## 4 max accuracy 0.643229 0.949079 284
## 5 max precision 0.999451 1.000000 0
## 6 max recall 0.287542 1.000000 354
## 7 max specificity 0.999451 1.000000 0
## 8 max absolute_mcc 0.643229 0.809318 284
## 9 max min_per_class_accuracy 0.779128 0.915584 226
## 10 max mean_per_class_accuracy 0.751266 0.918789 240
## 11 max tns 0.999451 154.000000 0
## 12 max fns 0.999451 767.000000 0
## 13 max fps 0.043157 154.000000 399
## 14 max tps 0.287542 769.000000 354
## 15 max tnr 0.999451 1.000000 0
## 16 max fnr 0.999451 0.997399 0
## 17 max fpr 0.043157 1.000000 399
## 18 max tpr 0.287542 1.000000 354
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 0.06777477
## RMSE: 0.2603359
## LogLoss: 0.2616012
## Mean Per-Class Error: 0.143338
## AUC: 0.8692847
## AUCPR: 0.9579188
## Gini: 0.7385694
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 17 6 0.260870 =6/23
## No 4 151 0.025806 =4/155
## Totals 21 157 0.056180 =10/178
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.631304 0.967949 156
## 2 max f2 0.461018 0.979772 170
## 3 max f0point5 0.631304 0.964240 156
## 4 max accuracy 0.631304 0.943820 156
## 5 max precision 0.998916 1.000000 0
## 6 max recall 0.461018 1.000000 170
## 7 max specificity 0.998916 1.000000 0
## 8 max absolute_mcc 0.631304 0.741749 156
## 9 max min_per_class_accuracy 0.821364 0.782609 126
## 10 max mean_per_class_accuracy 0.631304 0.856662 156
## 11 max tns 0.998916 23.000000 0
## 12 max fns 0.998916 154.000000 0
## 13 max fps 0.035945 23.000000 177
## 14 max tps 0.461018 155.000000 170
## 15 max tnr 0.998916 1.000000 0
## 16 max fnr 0.998916 0.993548 0
## 17 max fpr 0.035945 1.000000 177
## 18 max tpr 0.461018 1.000000 170
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.09823331
## RMSE: 0.3134219
## LogLoss: 0.3375866
## Mean Per-Class Error: 0.3181987
## AUC: 0.8283781
## AUCPR: 0.94296
## Gini: 0.6567561
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Left No Error Rate
## Left 60 94 0.610390 =94/154
## No 20 749 0.026008 =20/769
## Totals 80 843 0.123510 =114/923
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.526749 0.929280 331
## 2 max f2 0.279570 0.965552 379
## 3 max f0point5 0.701396 0.920189 273
## 4 max accuracy 0.526749 0.876490 331
## 5 max precision 0.999357 1.000000 0
## 6 max recall 0.185843 1.000000 392
## 7 max specificity 0.999357 1.000000 0
## 8 max absolute_mcc 0.701396 0.516693 273
## 9 max min_per_class_accuracy 0.842116 0.741222 186
## 10 max mean_per_class_accuracy 0.727951 0.772651 254
## 11 max tns 0.999357 154.000000 0
## 12 max fns 0.999357 767.000000 0
## 13 max fps 0.049075 154.000000 399
## 14 max tps 0.185843 769.000000 392
## 15 max tnr 0.999357 1.000000 0
## 16 max fnr 0.999357 0.997399 0
## 17 max fpr 0.049075 1.000000 399
## 18 max tpr 0.185843 1.000000 392
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.884252 0.028423 0.913044 0.848168 0.864407 0.884817
## auc 0.828147 0.045895 0.856943 0.847875 0.800699 0.873428
## err 0.115747 0.028423 0.086957 0.151832 0.135593 0.115183
## err_count 21.400000 5.727129 18.000000 29.000000 24.000000 22.000000
## f0point5 0.906857 0.028576 0.941241 0.872162 0.886792 0.905421
## cv_5_valid
## accuracy 0.910828
## auc 0.761791
## err 0.089172
## err_count 14.000000
## f0point5 0.928668
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.889983 0.035127 0.935135 0.848837 0.865031
## r2 0.273542 0.069636 0.331446 0.287203 0.252507
## recall 0.982518 0.010079 0.966480 0.979866 0.986014
## residual_deviance 123.393105 21.598175 115.414230 149.762400 139.635040
## rmse 0.313267 0.030630 0.279643 0.349677 0.340595
## specificity 0.391620 0.108602 0.571429 0.380952 0.352941
## cv_4_valid cv_5_valid
## precision 0.887006 0.913907
## r2 0.332356 0.164200
## recall 0.987421 0.992806
## residual_deviance 117.109146 95.044685
## rmse 0.305149 0.291270
## specificity 0.375000 0.277778
Save and Load
# ?h2o.getModel
# ?h2o.saveModel
# ?h2o.loadModel
# h2o.getModel("GLM_1_AutoML_2_20240423_111019") %>%
# h2o.saveModel("h2o_models/")
# best_model <- h2o.loadModel("h2o_models/StackedEnsemble_BestOfFamily_3_AutoML_2_20240423_111019")
Make Predictions
# Score the test frame with the leader model; the result lives on the H2O
# cluster as an H2O frame.
predictions <- h2o.predict(object = best_model, newdata = test_h2o)
##
|
| | 0%
|
|======================================================================| 100%
# Bring the predictions back into R as a tibble.
predictions_tbl <- as_tibble(predictions)

# Show the predictions side by side with the original test rows.
bind_cols(predictions_tbl, test_tbl)
## # A tibble: 369 × 35
## predict Left No Age Attrition BusinessTravel DailyRate Department
## <fct> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 Left 0.498 0.502 41 Left Travel_Rarely 1102 Sales
## 2 No 0.0276 0.972 49 No Travel_Frequently 279 Research & …
## 3 No 0.317 0.683 33 No Travel_Frequently 1392 Research & …
## 4 No 0.241 0.759 59 No Travel_Rarely 1324 Research & …
## 5 No 0.0635 0.936 38 No Travel_Frequently 216 Research & …
## 6 No 0.307 0.693 29 No Travel_Rarely 153 Research & …
## 7 No 0.0828 0.917 34 No Travel_Rarely 1346 Research & …
## 8 Left 0.861 0.139 28 Left Travel_Rarely 103 Research & …
## 9 No 0.309 0.691 22 No Non-Travel 1123 Research & …
## 10 No 0.0484 0.952 53 No Travel_Rarely 1219 Sales
## # ℹ 359 more rows
## # ℹ 27 more variables: DistanceFromHome <dbl>, Education <dbl>,
## # EducationField <fct>, EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>,
## # Gender <fct>, HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>,
## # JobRole <fct>, JobSatisfaction <dbl>, MaritalStatus <fct>,
## # MonthlyIncome <dbl>, MonthlyRate <dbl>, NumCompaniesWorked <dbl>,
## # OverTime <fct>, PercentSalaryHike <dbl>, PerformanceRating <dbl>, …
Evaluate Model
# Help lookup kept for reference only; commented out like the other lookups in
# this file so that running the script non-interactively does not open the
# help viewer.
# ?h2o.performance

# Compute performance metrics for the leader model on the test frame.
# NOTE(review): test_h2o was also passed to h2o.automl() as leaderboard_frame,
# so these metrics come from data that influenced model selection.
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)

# The performance result is an S4 object.
typeof(performance_h2o)
## [1] "S4"
# List the slots of the performance object that can be accessed with `@`.
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
# `metrics` slot: a nested list with the scalar metrics (MSE, AUC, logloss,
# ...), the confusion matrix and the per-threshold metric tables.
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "StackedEnsemble_BestOfFamily_4_AutoML_6_20240423_200843"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/StackedEnsemble_BestOfFamily_4_AutoML_6_20240423_200843"
##
##
## $model_checksum
## [1] "-3651500479593787536"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_9250_3"
##
##
## $frame_checksum
## [1] "-54192601206779456"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.713917e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.09355091
##
## $RMSE
## [1] 0.3058609
##
## $nobs
## [1] 369
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.3129458
##
## $logloss
## [1] 0.3209535
##
## $AUC
## [1] 0.8355987
##
## $pr_auc
## [1] 0.953008
##
## $Gini
## [1] 0.6711974
##
## $mean_per_class_error
## [1] 0.3180421
##
## $domain
## [1] "Left" "No"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Left No Error Rate
## Left 23 37 0.6167 = 37 / 60
## No 6 303 0.0194 = 6 / 309
## Totals 29 340 0.1165 = 43 / 369
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.999104 0.006452 0.004042 0.015974 0.165312 1.000000 0.003236 1.000000
## 2 0.998245 0.012862 0.008078 0.031546 0.168022 1.000000 0.006472 1.000000
## 3 0.997857 0.019231 0.012107 0.046729 0.170732 1.000000 0.009709 1.000000
## 4 0.997334 0.025559 0.016129 0.061538 0.173442 1.000000 0.012945 1.000000
## 5 0.996853 0.031847 0.020145 0.075988 0.176152 1.000000 0.016181 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.022971 0.003236 0.501618 60 308 0 1
## 2 0.032530 0.006472 0.503236 60 307 0 2
## 3 0.039895 0.009709 0.504854 60 306 0 3
## 4 0.046130 0.012945 0.506472 60 305 0 4
## 5 0.051645 0.016181 0.508091 60 304 0 5
## tnr fnr fpr tpr idx
## 1 1.000000 0.996764 0.000000 0.003236 0
## 2 1.000000 0.993528 0.000000 0.006472 1
## 3 1.000000 0.990291 0.000000 0.009709 2
## 4 1.000000 0.987055 0.000000 0.012945 3
## 5 1.000000 0.983819 0.000000 0.016181 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 364 0.209753 0.918276 0.965625 0.875354 0.850949 0.848901 1.000000
## 365 0.156808 0.916914 0.965022 0.873375 0.848238 0.846575 1.000000
## 366 0.149751 0.915556 0.964419 0.871404 0.845528 0.844262 1.000000
## 367 0.139496 0.914201 0.963818 0.869443 0.842818 0.841962 1.000000
## 368 0.052146 0.912851 0.963217 0.867490 0.840108 0.839674 1.000000
## 369 0.048939 0.911504 0.962617 0.865546 0.837398 0.837398 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 364 0.083333 0.265973 0.083333 0.541667 5
## 365 0.066667 0.237568 0.066667 0.533333 4
## 366 0.050000 0.205458 0.050000 0.525000 3
## 367 0.033333 0.167527 0.033333 0.516667 2
## 368 0.016667 0.118299 0.016667 0.508333 1
## 369 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 364 0 55 309 0.083333 0.000000 0.916667 1.000000 363
## 365 0 56 309 0.066667 0.000000 0.933333 1.000000 364
## 366 0 57 309 0.050000 0.000000 0.950000 1.000000 365
## 367 0 58 309 0.033333 0.000000 0.966667 1.000000 366
## 368 0 59 309 0.016667 0.000000 0.983333 1.000000 367
## 369 0 60 309 0.000000 0.000000 1.000000 1.000000 368
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.458483 0.933744 339
## 2 max f2 0.320588 0.968553 353
## 3 max f0point5 0.642544 0.924855 311
## 4 max accuracy 0.568764 0.883469 325
## 5 max precision 0.999104 1.000000 0
## 6 max recall 0.252523 1.000000 360
## 7 max specificity 0.999104 1.000000 0
## 8 max absolute_mcc 0.634859 0.544809 314
## 9 max min_per_class_accuracy 0.819205 0.766667 250
## 10 max mean_per_class_accuracy 0.870612 0.774515 224
## 11 max tns 0.999104 60.000000 0
## 12 max fns 0.999104 308.000000 0
## 13 max fps 0.048939 60.000000 368
## 14 max tps 0.252523 309.000000 360
## 15 max tnr 0.999104 1.000000 0
## 16 max fnr 0.999104 0.996764 0
## 17 max fpr 0.048939 1.000000 368
## 18 max tpr 0.252523 1.000000 360
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 83.74 %, avg score: 82.81 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01084011 0.997007 1.194175 1.194175
## 2 2 0.02168022 0.996063 1.194175 1.194175
## 3 3 0.03252033 0.994235 1.194175 1.194175
## 4 4 0.04065041 0.992953 1.194175 1.194175
## 5 5 0.05149051 0.992751 1.194175 1.194175
## 6 6 0.10027100 0.985612 1.127832 1.161900
## 7 7 0.15176152 0.980895 1.194175 1.172850
## 8 8 0.20054201 0.974156 1.061489 1.145762
## 9 9 0.30081301 0.955855 1.097350 1.129625
## 10 10 0.40108401 0.938754 1.194175 1.145762
## 11 11 0.50135501 0.914432 1.129625 1.142535
## 12 12 0.59891599 0.873800 1.161003 1.145543
## 13 13 0.69918699 0.810245 0.968250 1.120117
## 14 14 0.79945799 0.715668 1.000525 1.105118
## 15 15 0.89972900 0.497330 0.806875 1.071880
## 16 16 1.00000000 0.048939 0.355025 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.998135 1.000000 0.998135
## 2 1.000000 0.996407 1.000000 0.997271
## 3 1.000000 0.995221 1.000000 0.996588
## 4 1.000000 0.993612 1.000000 0.995992
## 5 1.000000 0.992845 1.000000 0.995330
## 6 0.944444 0.989084 0.972973 0.992291
## 7 1.000000 0.983056 0.982143 0.989158
## 8 0.888889 0.977207 0.959459 0.986251
## 9 0.918919 0.964631 0.945946 0.979044
## 10 1.000000 0.947351 0.959459 0.971121
## 11 0.945946 0.926870 0.956757 0.962271
## 12 0.972222 0.895568 0.959276 0.951405
## 13 0.810811 0.842447 0.937984 0.935779
## 14 0.837838 0.768712 0.925424 0.914825
## 15 0.675676 0.627098 0.897590 0.882759
## 16 0.297297 0.337795 0.837398 0.828115
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.012945 0.012945 19.417476 19.417476
## 2 0.012945 0.025890 19.417476 19.417476
## 3 0.012945 0.038835 19.417476 19.417476
## 4 0.009709 0.048544 19.417476 19.417476
## 5 0.012945 0.061489 19.417476 19.417476
## 6 0.055016 0.116505 12.783172 16.189976
## 7 0.061489 0.177994 19.417476 17.285021
## 8 0.051780 0.229773 6.148867 14.576227
## 9 0.110032 0.339806 9.734978 12.962477
## 10 0.119741 0.459547 19.417476 14.576227
## 11 0.113269 0.572816 12.962477 14.253477
## 12 0.113269 0.686084 16.100324 14.554321
## 13 0.097087 0.783172 -3.175020 12.011741
## 14 0.100324 0.883495 0.052480 10.511766
## 15 0.080906 0.964401 -19.312516 7.187975
## 16 0.035599 1.000000 -64.497507 0.000000
## kolmogorov_smirnov
## 1 0.012945
## 2 0.025890
## 3 0.038835
## 4 0.048544
## 5 0.061489
## 6 0.099838
## 7 0.161327
## 8 0.179773
## 9 0.239806
## 10 0.359547
## 11 0.439482
## 12 0.536084
## 13 0.516505
## 14 0.516828
## 15 0.397735
## 16 0.000000
##
## $residual_deviance
## [1] 236.8637
##
## $null_deviance
## [1] 327.6898
##
## $AIC
## [1] 248.8637
##
## $loglikelihood
## [1] 0
##
## $null_degrees_of_freedom
## [1] 368
##
## $residual_degrees_of_freedom
## [1] 363
# Area under the ROC curve for the metrics held in `performance_h2o`
# (object created earlier in the file; presumably from h2o.performance()
# -- confirm against the defining chunk).
h2o.auc(object = performance_h2o)
## [1] 0.8355987
# Confusion matrix (actual vs. predicted) for `performance_h2o`; with no
# threshold supplied, the printed matrix is the one at the max-F1 threshold.
h2o.confusionMatrix(object = performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.458483084743864:
## Left No Error Rate
## Left 23 37 0.616667 =37/60
## No 6 303 0.019417 =6/309
## Totals 29 340 0.116531 =43/369
# Full per-threshold metrics table (f1, precision, recall, tpr/fpr, ...)
# for `performance_h2o`; one row per classification threshold.
h2o.metric(object = performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.999104 0.006452 0.004042 0.015974 0.165312 1.000000 0.003236 1.000000
## 2 0.998245 0.012862 0.008078 0.031546 0.168022 1.000000 0.006472 1.000000
## 3 0.997857 0.019231 0.012107 0.046729 0.170732 1.000000 0.009709 1.000000
## 4 0.997334 0.025559 0.016129 0.061538 0.173442 1.000000 0.012945 1.000000
## 5 0.996853 0.031847 0.020145 0.075988 0.176152 1.000000 0.016181 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.022971 0.003236 0.501618 60 308 0 1
## 2 0.032530 0.006472 0.503236 60 307 0 2
## 3 0.039895 0.009709 0.504854 60 306 0 3
## 4 0.046130 0.012945 0.506472 60 305 0 4
## 5 0.051645 0.016181 0.508091 60 304 0 5
## tnr fnr fpr tpr idx
## 1 1.000000 0.996764 0.000000 0.003236 0
## 2 1.000000 0.993528 0.000000 0.006472 1
## 3 1.000000 0.990291 0.000000 0.009709 2
## 4 1.000000 0.987055 0.000000 0.012945 3
## 5 1.000000 0.983819 0.000000 0.016181 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 364 0.209753 0.918276 0.965625 0.875354 0.850949 0.848901 1.000000
## 365 0.156808 0.916914 0.965022 0.873375 0.848238 0.846575 1.000000
## 366 0.149751 0.915556 0.964419 0.871404 0.845528 0.844262 1.000000
## 367 0.139496 0.914201 0.963818 0.869443 0.842818 0.841962 1.000000
## 368 0.052146 0.912851 0.963217 0.867490 0.840108 0.839674 1.000000
## 369 0.048939 0.911504 0.962617 0.865546 0.837398 0.837398 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 364 0.083333 0.265973 0.083333 0.541667 5
## 365 0.066667 0.237568 0.066667 0.533333 4
## 366 0.050000 0.205458 0.050000 0.525000 3
## 367 0.033333 0.167527 0.033333 0.516667 2
## 368 0.016667 0.118299 0.016667 0.508333 1
## 369 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 364 0 55 309 0.083333 0.000000 0.916667 1.000000 363
## 365 0 56 309 0.066667 0.000000 0.933333 1.000000 364
## 366 0 57 309 0.050000 0.000000 0.950000 1.000000 365
## 367 0 58 309 0.033333 0.000000 0.966667 1.000000 366
## 368 0 59 309 0.016667 0.000000 0.983333 1.000000 367
## 369 0 60 309 0.000000 0.000000 1.000000 1.000000 368