The goal is to automate building and tuning a classification model to predict employee attrition, using the h2o::h2o.automl.

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.4.4     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.1     ✔ stringr 1.5.1
## ✔ readr   2.1.2     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.6     ✔ workflows    1.1.3
## ✔ modeldata    1.3.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.3.0
## ✔ recipes      1.0.9     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(tidyquant)

## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:h2o':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## 
## Attaching package: 'TTR'
## 
## The following object is masked from 'package:dials':
## 
##     momentum
## 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

skimr::skim(members)

Data summary
Name	members
Number of rows	76519
Number of columns	21
_______________________
Column type frequency:
character	10
logical	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
expedition_id	0	1.00	9	9	10350
member_id	0	1.00	12	12	76518
peak_id	0	1.00	4	4	391
peak_name	15	1.00	4	25	390
season	0	1.00	6	7	5
sex	2	1.00	1	1	2
citizenship	10	1.00	2	23	212
expedition_role	21	1.00	4	25	524
death_cause	75413	0.01	3	27	12
injury_type	74807	0.02	3	27	11

Variable type: logical

skim_variable	complete_rate	mean	count
hired	1	0.21	FAL: 60788, TRU: 15731
success	1	0.38	FAL: 47320, TRU: 29199
solo	1	0.00	FAL: 76398, TRU: 121
oxygen_used	1	0.24	FAL: 58286, TRU: 18233
died	1	0.01	FAL: 75413, TRU: 1106
injured	1	0.02	FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2000.36	14.78	1905	1991	2004	2012	2019	▁▁▁▃▇
age	3497	0.95	37.33	10.40	7	29	36	44	85	▁▇▅▁▁
highpoint_metres	21833	0.71	7470.68	1040.06	3800	6700	7400	8400	8850	▁▁▆▃▇
death_height_metres	75451	0.01	6592.85	1308.19	400	5800	6600	7550	8830	▁▁▂▇▆
injury_height_metres	75510	0.01	7049.91	1214.24	400	6200	7100	8000	8880	▁▁▂▇▇

Split Data

set.seed(1234)

data_split <- initial_split(members, strata = "died")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)

Recipes

recipe_obj <- recipe(died ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors())

Model

# Initialize H2O
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         11 days 17 hours 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    4 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_jasonzink_qxv383 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.98 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.1 (2022-06-23)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2567)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o  <- as.h2o(test_tbl)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

y <- "died"
x <- setdiff(names(train_tbl), y)

auto_ml_models_h2o <- h2o.automl(
    x = x, 
    y = y, 
    training_frame    = train_h2o, 
    validation_frame  = valid_h2o, 
    leaderboard_frame = test_h2o, 
    max_runtime_secs  = 150, 
    nfolds            = 5, 
    seed              = 6789)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
## 13:43:06.36: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 13:43:06.109: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   6%
## 13:43:15.475: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
## 13:43:31.748: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  29%
## 13:43:49.466: _train param, Dropping unused columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
## 13:44:39.679: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |============================================                          |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  68%
## 13:44:47.87: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |===================================================                   |  73%
## 13:44:56.111: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
## 13:45:02.843: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |==========================================================            |  83%
## 13:45:09.935: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
## 13:45:17.782: _train param, Dropping unused columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |=================================================================     |  93%
## 13:45:25.766: _train param, Dropping unused columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================| 100%

auto_ml_models_h2o@leaderboard

##                                                   model_id       auc
## 1                          GBM_3_AutoML_14_20240505_134306 0.9981166
## 2                          GBM_1_AutoML_14_20240505_134306 0.9952748
## 3 StackedEnsemble_BestOfFamily_1_AutoML_14_20240505_134306 0.9951716
## 4    StackedEnsemble_AllModels_1_AutoML_14_20240505_134306 0.9947865
## 5                      XGBoost_2_AutoML_14_20240505_134306 0.9945520
## 6                      XGBoost_1_AutoML_14_20240505_134306 0.9944446
##       logloss     aucpr mean_per_class_error       rmse          mse
## 1 0.003951496 0.9879342          0.009285770 0.02247598 0.0005051699
## 2 0.002896306 0.9848636          0.009338793 0.02081201 0.0004331397
## 3 0.002945840 0.9854408          0.009338793 0.02103056 0.0004422843
## 4 0.019468813 0.9860488          0.009312282 0.06580897 0.0043308201
## 5 0.003376228 0.9860617          0.016666667 0.02157198 0.0004653503
## 6 0.003840098 0.9851185          0.016666667 0.02182597 0.0004763731
## 
## [11 rows x 7 columns]

auto_ml_models_h2o@leader

## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_3_AutoML_14_20240505_134306 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              28                       28               28750         8
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         8    8.00000         23        127    77.57143
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.0004845609
## RMSE:  0.02201275
## LogLoss:  0.003707398
## Mean Per-Class Error:  0.01513915
## AUC:  0.9997291
## AUCPR:  0.9923869
## Gini:  0.9994583
## R^2:  0.9655459
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error       Rate
## FALSE  48014    3 0.000062   =3/48017
## TRUE      21  674 0.030216    =21/695
## Totals 48035  677 0.000493  =24/48712
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.165603     0.982507 194
## 2                       max f2  0.076166     0.977325 215
## 3                 max f0point5  0.260806     0.992897 189
## 4                 max accuracy  0.260806     0.999507 189
## 5                max precision  0.998540     1.000000   0
## 6                   max recall  0.002166     1.000000 344
## 7              max specificity  0.998540     1.000000   0
## 8             max absolute_mcc  0.165603     0.982344 194
## 9   max min_per_class_accuracy  0.015633     0.991367 278
## 10 max mean_per_class_accuracy  0.015633     0.993226 278
## 11                     max tns  0.998540 48017.000000   0
## 12                     max fns  0.998540   694.000000   0
## 13                     max fps  0.000990 48017.000000 399
## 14                     max tps  0.002166   695.000000 344
## 15                     max tnr  0.998540     1.000000   0
## 16                     max fnr  0.998540     0.998561   0
## 17                     max fpr  0.000990     1.000000 399
## 18                     max tpr  0.002166     1.000000 344
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  0.00051924
## RMSE:  0.02278684
## LogLoss:  0.004263676
## Mean Per-Class Error:  0.01069687
## AUC:  0.9963584
## AUCPR:  0.9863581
## Gini:  0.9927167
## R^2:  0.9675187
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error     Rate
## FALSE   8535    1 0.000117  =1/8536
## TRUE       3  138 0.021277   =3/141
## Totals  8538  139 0.000461  =4/8677
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.179921    0.985714 110
## 2                       max f2  0.179921    0.981508 110
## 3                 max f0point5  0.503643    0.994194 108
## 4                 max accuracy  0.503643    0.999539 108
## 5                max precision  0.999805    1.000000   0
## 6                   max recall  0.001290    1.000000 342
## 7              max specificity  0.999805    1.000000   0
## 8             max absolute_mcc  0.179921    0.985506 110
## 9   max min_per_class_accuracy  0.051895    0.985816 116
## 10 max mean_per_class_accuracy  0.051895    0.992439 116
## 11                     max tns  0.999805 8536.000000   0
## 12                     max fns  0.999805  140.000000   0
## 13                     max fps  0.000992 8536.000000 399
## 14                     max tps  0.001290  141.000000 342
## 15                     max tnr  0.999805    1.000000   0
## 16                     max fnr  0.999805    0.992908   0
## 17                     max fpr  0.000992    1.000000 399
## 18                     max tpr  0.001290    1.000000 342
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.0006399879
## RMSE:  0.02529798
## LogLoss:  0.005190406
## Mean Per-Class Error:  0.01800644
## AUC:  0.9923808
## AUCPR:  0.9750668
## Gini:  0.9847616
## R^2:  0.9544945
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error       Rate
## FALSE  48015    2 0.000042   =2/48017
## TRUE      25  670 0.035971    =25/695
## Totals 48040  672 0.000554  =27/48712
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.383374     0.980249 198
## 2                       max f2  0.383374     0.970452 198
## 3                 max f0point5  0.432512     0.991111 196
## 4                 max accuracy  0.432512     0.999446 196
## 5                max precision  0.999655     1.000000   0
## 6                   max recall  0.000939     1.000000 393
## 7              max specificity  0.999655     1.000000   0
## 8             max absolute_mcc  0.383374     0.980110 198
## 9   max min_per_class_accuracy  0.003917     0.978417 325
## 10 max mean_per_class_accuracy  0.072631     0.982432 221
## 11                     max tns  0.999655 48017.000000   0
## 12                     max fns  0.999655   688.000000   0
## 13                     max fps  0.000785 48017.000000 399
## 14                     max tps  0.000939   695.000000 393
## 15                     max tnr  0.999655     1.000000   0
## 16                     max fnr  0.999655     0.989928   0
## 17                     max fpr  0.000785     1.000000 399
## 18                     max tpr  0.000939     1.000000 393
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                              mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                 0.999466 0.000134   0.999589   0.999384   0.999281
## auc                      0.993332 0.001697   0.995541   0.994577   0.992754
## err                      0.000534 0.000134   0.000411   0.000616   0.000719
## err_count                5.200000 1.303840   4.000000   6.000000   7.000000
## f0point5                 0.992289 0.001814   0.994437   0.990854   0.990237
## f1                       0.980951 0.004432   0.986207   0.977444   0.975945
## f2                       0.969881 0.006934   0.978112   0.964392   0.962060
## lift_top_group          70.294120 4.211651  66.278910  71.639710  65.382550
## logloss                  0.005190 0.000580   0.004865   0.005738   0.005819
## max_per_class_error      0.037356 0.008539   0.027211   0.044118   0.046980
## mcc                      0.980871 0.004414   0.986095   0.977387   0.975872
## mean_per_class_accuracy  0.981322 0.004270   0.986395   0.977941   0.976510
## mean_per_class_error     0.018678 0.004270   0.013605   0.022059   0.023490
## mse                      0.000640 0.000121   0.000583   0.000703   0.000823
## pr_auc                   0.975311 0.004149   0.979787   0.970322   0.973553
## precision                1.000000 0.000000   1.000000   1.000000   1.000000
## r2                       0.954624 0.007002   0.960751   0.948888   0.945353
## recall                   0.962644 0.008539   0.972789   0.955882   0.953020
## rmse                     0.025211 0.002340   0.024150   0.026524   0.028688
## specificity              1.000000 0.000000   1.000000   1.000000   1.000000
##                         cv_4_valid cv_5_valid
## accuracy                  0.999487   0.999589
## auc                       0.992452   0.991338
## err                       0.000513   0.000411
## err_count                 5.000000   4.000000
## f0point5                  0.992126   0.993789
## f1                        0.980545   0.984615
## f2                        0.969231   0.975610
## lift_top_group           74.366410  73.803030
## logloss                   0.004463   0.005067
## max_per_class_error       0.038168   0.030303
## mcc                       0.980475   0.984527
## mean_per_class_accuracy   0.980916   0.984849
## mean_per_class_error      0.019084   0.015152
## mse                       0.000548   0.000542
## pr_auc                    0.979469   0.973423
## precision                 1.000000   1.000000
## r2                        0.958684   0.959444
## recall                    0.961832   0.969697
## rmse                      0.023412   0.023282
## specificity               1.000000   1.000000

best_model <- auto_ml_models_h2o@leader

best_model

## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_3_AutoML_14_20240505_134306 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              28                       28               28750         8
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         8    8.00000         23        127    77.57143
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.0004845609
## RMSE:  0.02201275
## LogLoss:  0.003707398
## Mean Per-Class Error:  0.01513915
## AUC:  0.9997291
## AUCPR:  0.9923869
## Gini:  0.9994583
## R^2:  0.9655459
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error       Rate
## FALSE  48014    3 0.000062   =3/48017
## TRUE      21  674 0.030216    =21/695
## Totals 48035  677 0.000493  =24/48712
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.165603     0.982507 194
## 2                       max f2  0.076166     0.977325 215
## 3                 max f0point5  0.260806     0.992897 189
## 4                 max accuracy  0.260806     0.999507 189
## 5                max precision  0.998540     1.000000   0
## 6                   max recall  0.002166     1.000000 344
## 7              max specificity  0.998540     1.000000   0
## 8             max absolute_mcc  0.165603     0.982344 194
## 9   max min_per_class_accuracy  0.015633     0.991367 278
## 10 max mean_per_class_accuracy  0.015633     0.993226 278
## 11                     max tns  0.998540 48017.000000   0
## 12                     max fns  0.998540   694.000000   0
## 13                     max fps  0.000990 48017.000000 399
## 14                     max tps  0.002166   695.000000 344
## 15                     max tnr  0.998540     1.000000   0
## 16                     max fnr  0.998540     0.998561   0
## 17                     max fpr  0.000990     1.000000 399
## 18                     max tpr  0.002166     1.000000 344
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  0.00051924
## RMSE:  0.02278684
## LogLoss:  0.004263676
## Mean Per-Class Error:  0.01069687
## AUC:  0.9963584
## AUCPR:  0.9863581
## Gini:  0.9927167
## R^2:  0.9675187
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error     Rate
## FALSE   8535    1 0.000117  =1/8536
## TRUE       3  138 0.021277   =3/141
## Totals  8538  139 0.000461  =4/8677
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.179921    0.985714 110
## 2                       max f2  0.179921    0.981508 110
## 3                 max f0point5  0.503643    0.994194 108
## 4                 max accuracy  0.503643    0.999539 108
## 5                max precision  0.999805    1.000000   0
## 6                   max recall  0.001290    1.000000 342
## 7              max specificity  0.999805    1.000000   0
## 8             max absolute_mcc  0.179921    0.985506 110
## 9   max min_per_class_accuracy  0.051895    0.985816 116
## 10 max mean_per_class_accuracy  0.051895    0.992439 116
## 11                     max tns  0.999805 8536.000000   0
## 12                     max fns  0.999805  140.000000   0
## 13                     max fps  0.000992 8536.000000 399
## 14                     max tps  0.001290  141.000000 342
## 15                     max tnr  0.999805    1.000000   0
## 16                     max fnr  0.999805    0.992908   0
## 17                     max fpr  0.000992    1.000000 399
## 18                     max tpr  0.001290    1.000000 342
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.0006399879
## RMSE:  0.02529798
## LogLoss:  0.005190406
## Mean Per-Class Error:  0.01800644
## AUC:  0.9923808
## AUCPR:  0.9750668
## Gini:  0.9847616
## R^2:  0.9544945
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error       Rate
## FALSE  48015    2 0.000042   =2/48017
## TRUE      25  670 0.035971    =25/695
## Totals 48040  672 0.000554  =27/48712
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.383374     0.980249 198
## 2                       max f2  0.383374     0.970452 198
## 3                 max f0point5  0.432512     0.991111 196
## 4                 max accuracy  0.432512     0.999446 196
## 5                max precision  0.999655     1.000000   0
## 6                   max recall  0.000939     1.000000 393
## 7              max specificity  0.999655     1.000000   0
## 8             max absolute_mcc  0.383374     0.980110 198
## 9   max min_per_class_accuracy  0.003917     0.978417 325
## 10 max mean_per_class_accuracy  0.072631     0.982432 221
## 11                     max tns  0.999655 48017.000000   0
## 12                     max fns  0.999655   688.000000   0
## 13                     max fps  0.000785 48017.000000 399
## 14                     max tps  0.000939   695.000000 393
## 15                     max tnr  0.999655     1.000000   0
## 16                     max fnr  0.999655     0.989928   0
## 17                     max fpr  0.000785     1.000000 399
## 18                     max tpr  0.000939     1.000000 393
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                              mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                 0.999466 0.000134   0.999589   0.999384   0.999281
## auc                      0.993332 0.001697   0.995541   0.994577   0.992754
## err                      0.000534 0.000134   0.000411   0.000616   0.000719
## err_count                5.200000 1.303840   4.000000   6.000000   7.000000
## f0point5                 0.992289 0.001814   0.994437   0.990854   0.990237
## f1                       0.980951 0.004432   0.986207   0.977444   0.975945
## f2                       0.969881 0.006934   0.978112   0.964392   0.962060
## lift_top_group          70.294120 4.211651  66.278910  71.639710  65.382550
## logloss                  0.005190 0.000580   0.004865   0.005738   0.005819
## max_per_class_error      0.037356 0.008539   0.027211   0.044118   0.046980
## mcc                      0.980871 0.004414   0.986095   0.977387   0.975872
## mean_per_class_accuracy  0.981322 0.004270   0.986395   0.977941   0.976510
## mean_per_class_error     0.018678 0.004270   0.013605   0.022059   0.023490
## mse                      0.000640 0.000121   0.000583   0.000703   0.000823
## pr_auc                   0.975311 0.004149   0.979787   0.970322   0.973553
## precision                1.000000 0.000000   1.000000   1.000000   1.000000
## r2                       0.954624 0.007002   0.960751   0.948888   0.945353
## recall                   0.962644 0.008539   0.972789   0.955882   0.953020
## rmse                     0.025211 0.002340   0.024150   0.026524   0.028688
## specificity              1.000000 0.000000   1.000000   1.000000   1.000000
##                         cv_4_valid cv_5_valid
## accuracy                  0.999487   0.999589
## auc                       0.992452   0.991338
## err                       0.000513   0.000411
## err_count                 5.000000   4.000000
## f0point5                  0.992126   0.993789
## f1                        0.980545   0.984615
## f2                        0.969231   0.975610
## lift_top_group           74.366410  73.803030
## logloss                   0.004463   0.005067
## max_per_class_error       0.038168   0.030303
## mcc                       0.980475   0.984527
## mean_per_class_accuracy   0.980916   0.984849
## mean_per_class_error      0.019084   0.015152
## mse                       0.000548   0.000542
## pr_auc                    0.979469   0.973423
## precision                 1.000000   1.000000
## r2                        0.958684   0.959444
## recall                    0.961832   0.969697
## rmse                      0.023412   0.023282
## specificity               1.000000   1.000000

Examine The Output of H2O.Automl

auto_ml_models_h2o %>% typeof()

## [1] "S4"

auto_ml_models_h2o %>% slotNames()

## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"

auto_ml_models_h2o@leaderboard

##                                                   model_id       auc
## 1                          GBM_3_AutoML_14_20240505_134306 0.9981166
## 2                          GBM_1_AutoML_14_20240505_134306 0.9952748
## 3 StackedEnsemble_BestOfFamily_1_AutoML_14_20240505_134306 0.9951716
## 4    StackedEnsemble_AllModels_1_AutoML_14_20240505_134306 0.9947865
## 5                      XGBoost_2_AutoML_14_20240505_134306 0.9945520
## 6                      XGBoost_1_AutoML_14_20240505_134306 0.9944446
##       logloss     aucpr mean_per_class_error       rmse          mse
## 1 0.003951496 0.9879342          0.009285770 0.02247598 0.0005051699
## 2 0.002896306 0.9848636          0.009338793 0.02081201 0.0004331397
## 3 0.002945840 0.9854408          0.009338793 0.02103056 0.0004422843
## 4 0.019468813 0.9860488          0.009312282 0.06580897 0.0043308201
## 5 0.003376228 0.9860617          0.016666667 0.02157198 0.0004653503
## 6 0.003840098 0.9851185          0.016666667 0.02182597 0.0004763731
## 
## [11 rows x 7 columns]

auto_ml_models_h2o@leader

## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_3_AutoML_14_20240505_134306 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              28                       28               28750         8
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         8    8.00000         23        127    77.57143
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.0004845609
## RMSE:  0.02201275
## LogLoss:  0.003707398
## Mean Per-Class Error:  0.01513915
## AUC:  0.9997291
## AUCPR:  0.9923869
## Gini:  0.9994583
## R^2:  0.9655459
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error       Rate
## FALSE  48014    3 0.000062   =3/48017
## TRUE      21  674 0.030216    =21/695
## Totals 48035  677 0.000493  =24/48712
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.165603     0.982507 194
## 2                       max f2  0.076166     0.977325 215
## 3                 max f0point5  0.260806     0.992897 189
## 4                 max accuracy  0.260806     0.999507 189
## 5                max precision  0.998540     1.000000   0
## 6                   max recall  0.002166     1.000000 344
## 7              max specificity  0.998540     1.000000   0
## 8             max absolute_mcc  0.165603     0.982344 194
## 9   max min_per_class_accuracy  0.015633     0.991367 278
## 10 max mean_per_class_accuracy  0.015633     0.993226 278
## 11                     max tns  0.998540 48017.000000   0
## 12                     max fns  0.998540   694.000000   0
## 13                     max fps  0.000990 48017.000000 399
## 14                     max tps  0.002166   695.000000 344
## 15                     max tnr  0.998540     1.000000   0
## 16                     max fnr  0.998540     0.998561   0
## 17                     max fpr  0.000990     1.000000 399
## 18                     max tpr  0.002166     1.000000 344
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  0.00051924
## RMSE:  0.02278684
## LogLoss:  0.004263676
## Mean Per-Class Error:  0.01069687
## AUC:  0.9963584
## AUCPR:  0.9863581
## Gini:  0.9927167
## R^2:  0.9675187
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error     Rate
## FALSE   8535    1 0.000117  =1/8536
## TRUE       3  138 0.021277   =3/141
## Totals  8538  139 0.000461  =4/8677
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.179921    0.985714 110
## 2                       max f2  0.179921    0.981508 110
## 3                 max f0point5  0.503643    0.994194 108
## 4                 max accuracy  0.503643    0.999539 108
## 5                max precision  0.999805    1.000000   0
## 6                   max recall  0.001290    1.000000 342
## 7              max specificity  0.999805    1.000000   0
## 8             max absolute_mcc  0.179921    0.985506 110
## 9   max min_per_class_accuracy  0.051895    0.985816 116
## 10 max mean_per_class_accuracy  0.051895    0.992439 116
## 11                     max tns  0.999805 8536.000000   0
## 12                     max fns  0.999805  140.000000   0
## 13                     max fps  0.000992 8536.000000 399
## 14                     max tps  0.001290  141.000000 342
## 15                     max tnr  0.999805    1.000000   0
## 16                     max fnr  0.999805    0.992908   0
## 17                     max fpr  0.000992    1.000000 399
## 18                     max tpr  0.001290    1.000000 342
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.0006399879
## RMSE:  0.02529798
## LogLoss:  0.005190406
## Mean Per-Class Error:  0.01800644
## AUC:  0.9923808
## AUCPR:  0.9750668
## Gini:  0.9847616
## R^2:  0.9544945
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error       Rate
## FALSE  48015    2 0.000042   =2/48017
## TRUE      25  670 0.035971    =25/695
## Totals 48040  672 0.000554  =27/48712
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.383374     0.980249 198
## 2                       max f2  0.383374     0.970452 198
## 3                 max f0point5  0.432512     0.991111 196
## 4                 max accuracy  0.432512     0.999446 196
## 5                max precision  0.999655     1.000000   0
## 6                   max recall  0.000939     1.000000 393
## 7              max specificity  0.999655     1.000000   0
## 8             max absolute_mcc  0.383374     0.980110 198
## 9   max min_per_class_accuracy  0.003917     0.978417 325
## 10 max mean_per_class_accuracy  0.072631     0.982432 221
## 11                     max tns  0.999655 48017.000000   0
## 12                     max fns  0.999655   688.000000   0
## 13                     max fps  0.000785 48017.000000 399
## 14                     max tps  0.000939   695.000000 393
## 15                     max tnr  0.999655     1.000000   0
## 16                     max fnr  0.999655     0.989928   0
## 17                     max fpr  0.000785     1.000000 399
## 18                     max tpr  0.000939     1.000000 393
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                              mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                 0.999466 0.000134   0.999589   0.999384   0.999281
## auc                      0.993332 0.001697   0.995541   0.994577   0.992754
## err                      0.000534 0.000134   0.000411   0.000616   0.000719
## err_count                5.200000 1.303840   4.000000   6.000000   7.000000
## f0point5                 0.992289 0.001814   0.994437   0.990854   0.990237
## f1                       0.980951 0.004432   0.986207   0.977444   0.975945
## f2                       0.969881 0.006934   0.978112   0.964392   0.962060
## lift_top_group          70.294120 4.211651  66.278910  71.639710  65.382550
## logloss                  0.005190 0.000580   0.004865   0.005738   0.005819
## max_per_class_error      0.037356 0.008539   0.027211   0.044118   0.046980
## mcc                      0.980871 0.004414   0.986095   0.977387   0.975872
## mean_per_class_accuracy  0.981322 0.004270   0.986395   0.977941   0.976510
## mean_per_class_error     0.018678 0.004270   0.013605   0.022059   0.023490
## mse                      0.000640 0.000121   0.000583   0.000703   0.000823
## pr_auc                   0.975311 0.004149   0.979787   0.970322   0.973553
## precision                1.000000 0.000000   1.000000   1.000000   1.000000
## r2                       0.954624 0.007002   0.960751   0.948888   0.945353
## recall                   0.962644 0.008539   0.972789   0.955882   0.953020
## rmse                     0.025211 0.002340   0.024150   0.026524   0.028688
## specificity              1.000000 0.000000   1.000000   1.000000   1.000000
##                         cv_4_valid cv_5_valid
## accuracy                  0.999487   0.999589
## auc                       0.992452   0.991338
## err                       0.000513   0.000411
## err_count                 5.000000   4.000000
## f0point5                  0.992126   0.993789
## f1                        0.980545   0.984615
## f2                        0.969231   0.975610
## lift_top_group           74.366410  73.803030
## logloss                   0.004463   0.005067
## max_per_class_error       0.038168   0.030303
## mcc                       0.980475   0.984527
## mean_per_class_accuracy   0.980916   0.984849
## mean_per_class_error      0.019084   0.015152
## mse                       0.000548   0.000542
## pr_auc                    0.979469   0.973423
## precision                 1.000000   1.000000
## r2                        0.958684   0.959444
## recall                    0.961832   0.969697
## rmse                      0.023412   0.023282
## specificity               1.000000   1.000000

Make Predictions

predictions <- h2o.predict(best_model, newdata = test_h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

predictions_tbl <- predictions %>% 
    as_tibble()

predictions_tbl %>% 
    bind_cols(test_tbl)

## # A tibble: 19,130 × 24
##    predict FALSE.   TRUE. expedition_id member_id peak_id peak_name  year season
##    <fct>    <dbl>   <dbl> <chr>         <chr>     <chr>   <chr>     <dbl> <chr> 
##  1 FALSE    0.998 0.00174 AMAD78301     AMAD7830… AMAD    Ama Dabl…  1978 Autumn
##  2 FALSE    0.999 0.00122 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
##  3 FALSE    0.997 0.00261 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
##  4 FALSE    0.996 0.00442 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
##  5 FALSE    0.997 0.00267 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
##  6 FALSE    0.999 0.00113 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
##  7 FALSE    0.999 0.00118 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
##  8 FALSE    0.999 0.00126 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
##  9 FALSE    0.997 0.00269 AMAD79101     AMAD7910… AMAD    Ama Dabl…  1979 Spring
## 10 FALSE    0.999 0.00113 AMAD79301     AMAD7930… AMAD    Ama Dabl…  1979 Autumn
## # ℹ 19,120 more rows
## # ℹ 15 more variables: sex <chr>, age <dbl>, citizenship <chr>,
## #   expedition_role <chr>, hired <lgl>, highpoint_metres <dbl>, success <lgl>,
## #   solo <lgl>, oxygen_used <lgl>, died <lgl>, death_cause <chr>,
## #   death_height_metres <dbl>, injured <lgl>, injury_type <chr>,
## #   injury_height_metres <dbl>

Evaluations

performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)

## [1] "S4"

slotNames(performance_h2o)

## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"

performance_h2o@metrics

## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GBM_3_AutoML_14_20240505_134306"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GBM_3_AutoML_14_20240505_134306"
## 
## 
## $model_checksum
## [1] "4288655587182429368"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_b77b_3"
## 
## 
## $frame_checksum
## [1] "678340420273909232"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.714931e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.0005051699
## 
## $RMSE
## [1] 0.02247598
## 
## $nobs
## [1] 19130
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.9636954
## 
## $logloss
## [1] 0.003951496
## 
## $AUC
## [1] 0.9981166
## 
## $pr_auc
## [1] 0.9879342
## 
## $Gini
## [1] 0.9962333
## 
## $mean_per_class_error
## [1] 0.00928577
## 
## $domain
## [1] "FALSE" "TRUE" 
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        FALSE TRUE  Error         Rate
## FALSE  18859    1 0.0001 = 1 / 18,860
## TRUE       5  265 0.0185 =    5 / 270
## Totals 18864  266 0.0003 = 6 / 19,130
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.998222 0.014706 0.009242 0.035971 0.985991  1.000000 0.007407    1.000000
## 2  0.998009 0.021978 0.013850 0.053191 0.986043  1.000000 0.011111    1.000000
## 3  0.997652 0.029197 0.018450 0.069930 0.986095  1.000000 0.014815    1.000000
## 4  0.997258 0.043478 0.027624 0.102041 0.986200  1.000000 0.022222    1.000000
## 5  0.997069 0.050542 0.032199 0.117450 0.986252  1.000000 0.025926    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy   tns fns fps tps
## 1     0.085461               0.007407                0.503704 18860 268   0   2
## 2     0.104671               0.011111                0.505556 18860 267   0   3
## 3     0.120867               0.014815                0.507407 18860 266   0   4
## 4     0.148039               0.022222                0.511111 18860 264   0   6
## 5     0.159904               0.025926                0.512963 18860 263   0   7
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.992593 0.000000 0.007407   0
## 2 1.000000 0.988889 0.000000 0.011111   1
## 3 1.000000 0.985185 0.000000 0.014815   2
## 4 1.000000 0.977778 0.000000 0.022222   3
## 5 1.000000 0.974074 0.000000 0.025926   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.001071 0.029197 0.069930 0.018450 0.061422  0.014815 1.000000
## 396  0.001062 0.028435 0.068178 0.017963 0.035494  0.014422 1.000000
## 397  0.001057 0.028016 0.067214 0.017696 0.020648  0.014207 1.000000
## 398  0.001038 0.027926 0.067007 0.017638 0.017407  0.014161 1.000000
## 399  0.001022 0.027880 0.066901 0.017609 0.015734  0.014137 1.000000
## 400  0.000991 0.027835 0.066799 0.017580 0.014114  0.014114 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.047985     0.026663               0.047985                0.523993 905
## 396    0.021686     0.017685               0.021686                0.510843 409
## 397    0.006628     0.009704               0.006628                0.503314 125
## 398    0.003340     0.006878               0.003340                0.501670  63
## 399    0.001644     0.004820               0.001644                0.500822  31
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns   fps tps      tnr      fnr      fpr      tpr idx
## 395   0 17955 270 0.047985 0.000000 0.952015 1.000000 394
## 396   0 18451 270 0.021686 0.000000 0.978314 1.000000 395
## 397   0 18735 270 0.006628 0.000000 0.993372 1.000000 396
## 398   0 18797 270 0.003340 0.000000 0.996660 1.000000 397
## 399   0 18829 270 0.001644 0.000000 0.998356 1.000000 398
## 400   0 18860 270 0.000000 0.000000 1.000000 1.000000 399
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.260806     0.988806 162
## 2                       max f2  0.260806     0.984398 162
## 3                 max f0point5  0.260806     0.993253 162
## 4                 max accuracy  0.260806     0.999686 162
## 5                max precision  0.998222     1.000000   0
## 6                   max recall  0.001327     1.000000 355
## 7              max specificity  0.998222     1.000000   0
## 8             max absolute_mcc  0.260806     0.988675 162
## 9   max min_per_class_accuracy  0.005286     0.988889 250
## 10 max mean_per_class_accuracy  0.260806     0.990714 162
## 11                     max tns  0.998222 18860.000000   0
## 12                     max fns  0.998222   268.000000   0
## 13                     max fps  0.000991 18860.000000 399
## 14                     max tps  0.001327   270.000000 355
## 15                     max tnr  0.998222     1.000000   0
## 16                     max fnr  0.998222     0.992593   0
## 17                     max fpr  0.000991     1.000000 399
## 18                     max tpr  0.001327     1.000000 355
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate:  1.41 %, avg score:  1.48 %
##    group cumulative_data_fraction lower_threshold      lift cumulative_lift
## 1      1               0.01003659        0.972142 70.851852       70.851852
## 2      2               0.02002091        0.007671 27.450456       49.207814
## 3      3               0.03005750        0.003420  0.369020       32.899903
## 4      4               0.04004182        0.002628  0.000000       24.696403
## 5      5               0.05007841        0.002198  0.000000       19.746810
## 6      6               0.10020910        0.001816  0.073881        9.905215
## 7      7               0.15007841        0.001624  0.000000        6.613827
## 8      8               0.20000000        0.001501  0.074190        4.981481
## 9      9               0.30000000        0.001317  0.037037        3.333333
## 10    10               0.40120230        0.001252  0.000000        2.492508
## 11    11               0.50000000        0.001209  0.000000        2.000000
## 12    12               0.60000000        0.001179  0.000000        1.666667
## 13    13               0.70104548        0.001162  0.000000        1.426441
## 14    14               0.80219550        0.001147  0.000000        1.246579
## 15    15               0.90026137        0.001088  0.000000        1.110789
## 16    16               1.00000000        0.000944  0.000000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.988205                 1.000000         0.988205
## 2       0.387435 0.351151                 0.694517         0.670510
## 3       0.005208 0.004836                 0.464348         0.448233
## 4       0.000000 0.002974                 0.348564         0.337209
## 5       0.000000 0.002460                 0.278706         0.270119
## 6       0.001043 0.001951                 0.139802         0.135965
## 7       0.000000 0.001736                 0.093347         0.091362
## 8       0.001047 0.001554                 0.070308         0.068945
## 9       0.000523 0.001402                 0.047047         0.046431
## 10      0.000000 0.001279                 0.035179         0.035042
## 11      0.000000 0.001231                 0.028228         0.028361
## 12      0.000000 0.001192                 0.023523         0.023833
## 13      0.000000 0.001171                 0.020133         0.020566
## 14      0.000000 0.001156                 0.017594         0.018119
## 15      0.000000 0.001124                 0.015678         0.016268
## 16      0.000000 0.001068                 0.014114         0.014752
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      0.711111                0.711111 6985.185185     6985.185185
## 2      0.274074                0.985185 2645.045569     4820.781356
## 3      0.003704                0.988889  -63.097994     3189.990338
## 4      0.000000                0.988889 -100.000000     2369.640267
## 5      0.000000                0.988889 -100.000000     1874.681048
## 6      0.003704                0.992593  -92.611903      890.521455
## 7      0.000000                0.992593 -100.000000      561.382664
## 8      0.003704                0.996296  -92.580958      398.148148
## 9      0.003704                1.000000  -96.296296      233.333333
## 10     0.000000                1.000000 -100.000000      149.250814
## 11     0.000000                1.000000 -100.000000      100.000000
## 12     0.000000                1.000000 -100.000000       66.666667
## 13     0.000000                1.000000 -100.000000       42.644098
## 14     0.000000                1.000000 -100.000000       24.657891
## 15     0.000000                1.000000 -100.000000       11.078853
## 16     0.000000                1.000000 -100.000000        0.000000
##    kolmogorov_smirnov
## 1            0.711111
## 2            0.978982
## 3            0.972558
## 4            0.962431
## 5            0.952251
## 6            0.905159
## 7            0.854576
## 8            0.807696
## 9            0.710021
## 10           0.607370
## 11           0.507158
## 12           0.405726
## 13           0.303234
## 14           0.200636
## 15           0.101166
## 16           0.000000

h2o.auc(performance_h2o)

## [1] 0.9981166

h2o.confusionMatrix(performance_h2o)

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.260806196847446:
##        FALSE TRUE    Error      Rate
## FALSE  18859    1 0.000053  =1/18860
## TRUE       5  265 0.018519    =5/270
## Totals 18864  266 0.000314  =6/19130

h2o.metric(performance_h2o) %>% as_tibble() %>% filter(threshold %>% between (0.43, 0.44))

## # A tibble: 0 × 20
## # ℹ 20 variables: threshold <dbl>, f1 <dbl>, f2 <dbl>, f0point5 <dbl>,
## #   accuracy <dbl>, precision <dbl>, recall <dbl>, specificity <dbl>,
## #   absolute_mcc <dbl>, min_per_class_accuracy <dbl>,
## #   mean_per_class_accuracy <dbl>, tns <dbl>, fns <dbl>, fps <dbl>, tps <dbl>,
## #   tnr <dbl>, fnr <dbl>, fpr <dbl>, tpr <dbl>, idx <int>

Apply it to your data 11

Jason Zink

2024-04-25