The goal is to automate building and tuning a classification model to predict employee attrition, using the h2o::h2o.automl.
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.4 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.3.1 ✔ stringr 1.5.1
## ✔ readr 2.1.2 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.3
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.3.0
## ✔ recipes 1.0.9
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:h2o':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
##
## Attaching package: 'TTR'
##
## The following object is masked from 'package:dials':
##
## momentum
##
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Clean Data
skimr::skim(members)
| Name | members |
| Number of rows | 76519 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| logical | 6 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| expedition_id | 0 | 1.00 | 9 | 9 | 0 | 10350 | 0 |
| member_id | 0 | 1.00 | 12 | 12 | 0 | 76518 | 0 |
| peak_id | 0 | 1.00 | 4 | 4 | 0 | 391 | 0 |
| peak_name | 15 | 1.00 | 4 | 25 | 0 | 390 | 0 |
| season | 0 | 1.00 | 6 | 7 | 0 | 5 | 0 |
| sex | 2 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| citizenship | 10 | 1.00 | 2 | 23 | 0 | 212 | 0 |
| expedition_role | 21 | 1.00 | 4 | 25 | 0 | 524 | 0 |
| death_cause | 75413 | 0.01 | 3 | 27 | 0 | 12 | 0 |
| injury_type | 74807 | 0.02 | 3 | 27 | 0 | 11 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| hired | 0 | 1 | 0.21 | FAL: 60788, TRU: 15731 |
| success | 0 | 1 | 0.38 | FAL: 47320, TRU: 29199 |
| solo | 0 | 1 | 0.00 | FAL: 76398, TRU: 121 |
| oxygen_used | 0 | 1 | 0.24 | FAL: 58286, TRU: 18233 |
| died | 0 | 1 | 0.01 | FAL: 75413, TRU: 1106 |
| injured | 0 | 1 | 0.02 | FAL: 74806, TRU: 1713 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2000.36 | 14.78 | 1905 | 1991 | 2004 | 2012 | 2019 | ▁▁▁▃▇ |
| age | 3497 | 0.95 | 37.33 | 10.40 | 7 | 29 | 36 | 44 | 85 | ▁▇▅▁▁ |
| highpoint_metres | 21833 | 0.71 | 7470.68 | 1040.06 | 3800 | 6700 | 7400 | 8400 | 8850 | ▁▁▆▃▇ |
| death_height_metres | 75451 | 0.01 | 6592.85 | 1308.19 | 400 | 5800 | 6600 | 7550 | 8830 | ▁▁▂▇▆ |
| injury_height_metres | 75510 | 0.01 | 7049.91 | 1214.24 | 400 | 6200 | 7100 | 8000 | 8880 | ▁▁▂▇▇ |
set.seed(1234)
data_split <- initial_split(members, strata = "died")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
recipe_obj <- recipe(died ~ ., data = train_tbl) %>%
# Remove zero variance variables
step_zv(all_predictors())
# Initialize H2O
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 11 days 17 hours
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 4 months and 14 days
## H2O cluster name: H2O_started_from_R_jasonzink_qxv383
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.98 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.1 (2022-06-23)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2567)
##
|
| | 0%
|
|======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
##
|
| | 0%
|
|======================================================================| 100%
y <- "died"
x <- setdiff(names(train_tbl), y)
auto_ml_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 150,
nfolds = 5,
seed = 6789)
##
|
| | 0%
|
| | 1%
## 13:43:06.36: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 13:43:06.109: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|= | 2%
|
|== | 4%
|
|=== | 5%
|
|==== | 6%
## 13:43:15.475: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|===== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 12%
|
|========= | 13%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
## 13:43:31.748: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|============= | 19%
|
|============== | 20%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 29%
## 13:43:49.466: _train param, Dropping unused columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|===================== | 30%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 41%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================= | 47%
|
|================================== | 48%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 52%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================== | 59%
|
|=========================================== | 61%
|
|=========================================== | 62%
## 13:44:39.679: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|============================================ | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|=============================================== | 68%
## 13:44:47.87: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|================================================ | 69%
|
|================================================== | 71%
|
|=================================================== | 73%
## 13:44:56.111: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
## 13:45:02.843: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================== | 83%
## 13:45:09.935: _train param, Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|=========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
## 13:45:17.782: _train param, Dropping unused columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================= | 93%
## 13:45:25.766: _train param, Dropping unused columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type]
|
|================================================================== | 94%
|
|=================================================================== | 95%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|======================================================================| 100%
auto_ml_models_h2o@leaderboard
## model_id auc
## 1 GBM_3_AutoML_14_20240505_134306 0.9981166
## 2 GBM_1_AutoML_14_20240505_134306 0.9952748
## 3 StackedEnsemble_BestOfFamily_1_AutoML_14_20240505_134306 0.9951716
## 4 StackedEnsemble_AllModels_1_AutoML_14_20240505_134306 0.9947865
## 5 XGBoost_2_AutoML_14_20240505_134306 0.9945520
## 6 XGBoost_1_AutoML_14_20240505_134306 0.9944446
## logloss aucpr mean_per_class_error rmse mse
## 1 0.003951496 0.9879342 0.009285770 0.02247598 0.0005051699
## 2 0.002896306 0.9848636 0.009338793 0.02081201 0.0004331397
## 3 0.002945840 0.9854408 0.009338793 0.02103056 0.0004422843
## 4 0.019468813 0.9860488 0.009312282 0.06580897 0.0043308201
## 5 0.003376228 0.9860617 0.016666667 0.02157198 0.0004653503
## 6 0.003840098 0.9851185 0.016666667 0.02182597 0.0004763731
##
## [11 rows x 7 columns]
auto_ml_models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_3_AutoML_14_20240505_134306
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 28 28 28750 8
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 8 8.00000 23 127 77.57143
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.0004845609
## RMSE: 0.02201275
## LogLoss: 0.003707398
## Mean Per-Class Error: 0.01513915
## AUC: 0.9997291
## AUCPR: 0.9923869
## Gini: 0.9994583
## R^2: 0.9655459
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 48014 3 0.000062 =3/48017
## TRUE 21 674 0.030216 =21/695
## Totals 48035 677 0.000493 =24/48712
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.165603 0.982507 194
## 2 max f2 0.076166 0.977325 215
## 3 max f0point5 0.260806 0.992897 189
## 4 max accuracy 0.260806 0.999507 189
## 5 max precision 0.998540 1.000000 0
## 6 max recall 0.002166 1.000000 344
## 7 max specificity 0.998540 1.000000 0
## 8 max absolute_mcc 0.165603 0.982344 194
## 9 max min_per_class_accuracy 0.015633 0.991367 278
## 10 max mean_per_class_accuracy 0.015633 0.993226 278
## 11 max tns 0.998540 48017.000000 0
## 12 max fns 0.998540 694.000000 0
## 13 max fps 0.000990 48017.000000 399
## 14 max tps 0.002166 695.000000 344
## 15 max tnr 0.998540 1.000000 0
## 16 max fnr 0.998540 0.998561 0
## 17 max fpr 0.000990 1.000000 399
## 18 max tpr 0.002166 1.000000 344
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.00051924
## RMSE: 0.02278684
## LogLoss: 0.004263676
## Mean Per-Class Error: 0.01069687
## AUC: 0.9963584
## AUCPR: 0.9863581
## Gini: 0.9927167
## R^2: 0.9675187
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 8535 1 0.000117 =1/8536
## TRUE 3 138 0.021277 =3/141
## Totals 8538 139 0.000461 =4/8677
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.179921 0.985714 110
## 2 max f2 0.179921 0.981508 110
## 3 max f0point5 0.503643 0.994194 108
## 4 max accuracy 0.503643 0.999539 108
## 5 max precision 0.999805 1.000000 0
## 6 max recall 0.001290 1.000000 342
## 7 max specificity 0.999805 1.000000 0
## 8 max absolute_mcc 0.179921 0.985506 110
## 9 max min_per_class_accuracy 0.051895 0.985816 116
## 10 max mean_per_class_accuracy 0.051895 0.992439 116
## 11 max tns 0.999805 8536.000000 0
## 12 max fns 0.999805 140.000000 0
## 13 max fps 0.000992 8536.000000 399
## 14 max tps 0.001290 141.000000 342
## 15 max tnr 0.999805 1.000000 0
## 16 max fnr 0.999805 0.992908 0
## 17 max fpr 0.000992 1.000000 399
## 18 max tpr 0.001290 1.000000 342
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.0006399879
## RMSE: 0.02529798
## LogLoss: 0.005190406
## Mean Per-Class Error: 0.01800644
## AUC: 0.9923808
## AUCPR: 0.9750668
## Gini: 0.9847616
## R^2: 0.9544945
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 48015 2 0.000042 =2/48017
## TRUE 25 670 0.035971 =25/695
## Totals 48040 672 0.000554 =27/48712
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.383374 0.980249 198
## 2 max f2 0.383374 0.970452 198
## 3 max f0point5 0.432512 0.991111 196
## 4 max accuracy 0.432512 0.999446 196
## 5 max precision 0.999655 1.000000 0
## 6 max recall 0.000939 1.000000 393
## 7 max specificity 0.999655 1.000000 0
## 8 max absolute_mcc 0.383374 0.980110 198
## 9 max min_per_class_accuracy 0.003917 0.978417 325
## 10 max mean_per_class_accuracy 0.072631 0.982432 221
## 11 max tns 0.999655 48017.000000 0
## 12 max fns 0.999655 688.000000 0
## 13 max fps 0.000785 48017.000000 399
## 14 max tps 0.000939 695.000000 393
## 15 max tnr 0.999655 1.000000 0
## 16 max fnr 0.999655 0.989928 0
## 17 max fpr 0.000785 1.000000 399
## 18 max tpr 0.000939 1.000000 393
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.999466 0.000134 0.999589 0.999384 0.999281
## auc 0.993332 0.001697 0.995541 0.994577 0.992754
## err 0.000534 0.000134 0.000411 0.000616 0.000719
## err_count 5.200000 1.303840 4.000000 6.000000 7.000000
## f0point5 0.992289 0.001814 0.994437 0.990854 0.990237
## f1 0.980951 0.004432 0.986207 0.977444 0.975945
## f2 0.969881 0.006934 0.978112 0.964392 0.962060
## lift_top_group 70.294120 4.211651 66.278910 71.639710 65.382550
## logloss 0.005190 0.000580 0.004865 0.005738 0.005819
## max_per_class_error 0.037356 0.008539 0.027211 0.044118 0.046980
## mcc 0.980871 0.004414 0.986095 0.977387 0.975872
## mean_per_class_accuracy 0.981322 0.004270 0.986395 0.977941 0.976510
## mean_per_class_error 0.018678 0.004270 0.013605 0.022059 0.023490
## mse 0.000640 0.000121 0.000583 0.000703 0.000823
## pr_auc 0.975311 0.004149 0.979787 0.970322 0.973553
## precision 1.000000 0.000000 1.000000 1.000000 1.000000
## r2 0.954624 0.007002 0.960751 0.948888 0.945353
## recall 0.962644 0.008539 0.972789 0.955882 0.953020
## rmse 0.025211 0.002340 0.024150 0.026524 0.028688
## specificity 1.000000 0.000000 1.000000 1.000000 1.000000
## cv_4_valid cv_5_valid
## accuracy 0.999487 0.999589
## auc 0.992452 0.991338
## err 0.000513 0.000411
## err_count 5.000000 4.000000
## f0point5 0.992126 0.993789
## f1 0.980545 0.984615
## f2 0.969231 0.975610
## lift_top_group 74.366410 73.803030
## logloss 0.004463 0.005067
## max_per_class_error 0.038168 0.030303
## mcc 0.980475 0.984527
## mean_per_class_accuracy 0.980916 0.984849
## mean_per_class_error 0.019084 0.015152
## mse 0.000548 0.000542
## pr_auc 0.979469 0.973423
## precision 1.000000 1.000000
## r2 0.958684 0.959444
## recall 0.961832 0.969697
## rmse 0.023412 0.023282
## specificity 1.000000 1.000000
best_model <- auto_ml_models_h2o@leader
best_model
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_3_AutoML_14_20240505_134306
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 28 28 28750 8
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 8 8.00000 23 127 77.57143
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.0004845609
## RMSE: 0.02201275
## LogLoss: 0.003707398
## Mean Per-Class Error: 0.01513915
## AUC: 0.9997291
## AUCPR: 0.9923869
## Gini: 0.9994583
## R^2: 0.9655459
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 48014 3 0.000062 =3/48017
## TRUE 21 674 0.030216 =21/695
## Totals 48035 677 0.000493 =24/48712
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.165603 0.982507 194
## 2 max f2 0.076166 0.977325 215
## 3 max f0point5 0.260806 0.992897 189
## 4 max accuracy 0.260806 0.999507 189
## 5 max precision 0.998540 1.000000 0
## 6 max recall 0.002166 1.000000 344
## 7 max specificity 0.998540 1.000000 0
## 8 max absolute_mcc 0.165603 0.982344 194
## 9 max min_per_class_accuracy 0.015633 0.991367 278
## 10 max mean_per_class_accuracy 0.015633 0.993226 278
## 11 max tns 0.998540 48017.000000 0
## 12 max fns 0.998540 694.000000 0
## 13 max fps 0.000990 48017.000000 399
## 14 max tps 0.002166 695.000000 344
## 15 max tnr 0.998540 1.000000 0
## 16 max fnr 0.998540 0.998561 0
## 17 max fpr 0.000990 1.000000 399
## 18 max tpr 0.002166 1.000000 344
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.00051924
## RMSE: 0.02278684
## LogLoss: 0.004263676
## Mean Per-Class Error: 0.01069687
## AUC: 0.9963584
## AUCPR: 0.9863581
## Gini: 0.9927167
## R^2: 0.9675187
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 8535 1 0.000117 =1/8536
## TRUE 3 138 0.021277 =3/141
## Totals 8538 139 0.000461 =4/8677
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.179921 0.985714 110
## 2 max f2 0.179921 0.981508 110
## 3 max f0point5 0.503643 0.994194 108
## 4 max accuracy 0.503643 0.999539 108
## 5 max precision 0.999805 1.000000 0
## 6 max recall 0.001290 1.000000 342
## 7 max specificity 0.999805 1.000000 0
## 8 max absolute_mcc 0.179921 0.985506 110
## 9 max min_per_class_accuracy 0.051895 0.985816 116
## 10 max mean_per_class_accuracy 0.051895 0.992439 116
## 11 max tns 0.999805 8536.000000 0
## 12 max fns 0.999805 140.000000 0
## 13 max fps 0.000992 8536.000000 399
## 14 max tps 0.001290 141.000000 342
## 15 max tnr 0.999805 1.000000 0
## 16 max fnr 0.999805 0.992908 0
## 17 max fpr 0.000992 1.000000 399
## 18 max tpr 0.001290 1.000000 342
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.0006399879
## RMSE: 0.02529798
## LogLoss: 0.005190406
## Mean Per-Class Error: 0.01800644
## AUC: 0.9923808
## AUCPR: 0.9750668
## Gini: 0.9847616
## R^2: 0.9544945
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 48015 2 0.000042 =2/48017
## TRUE 25 670 0.035971 =25/695
## Totals 48040 672 0.000554 =27/48712
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.383374 0.980249 198
## 2 max f2 0.383374 0.970452 198
## 3 max f0point5 0.432512 0.991111 196
## 4 max accuracy 0.432512 0.999446 196
## 5 max precision 0.999655 1.000000 0
## 6 max recall 0.000939 1.000000 393
## 7 max specificity 0.999655 1.000000 0
## 8 max absolute_mcc 0.383374 0.980110 198
## 9 max min_per_class_accuracy 0.003917 0.978417 325
## 10 max mean_per_class_accuracy 0.072631 0.982432 221
## 11 max tns 0.999655 48017.000000 0
## 12 max fns 0.999655 688.000000 0
## 13 max fps 0.000785 48017.000000 399
## 14 max tps 0.000939 695.000000 393
## 15 max tnr 0.999655 1.000000 0
## 16 max fnr 0.999655 0.989928 0
## 17 max fpr 0.000785 1.000000 399
## 18 max tpr 0.000939 1.000000 393
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.999466 0.000134 0.999589 0.999384 0.999281
## auc 0.993332 0.001697 0.995541 0.994577 0.992754
## err 0.000534 0.000134 0.000411 0.000616 0.000719
## err_count 5.200000 1.303840 4.000000 6.000000 7.000000
## f0point5 0.992289 0.001814 0.994437 0.990854 0.990237
## f1 0.980951 0.004432 0.986207 0.977444 0.975945
## f2 0.969881 0.006934 0.978112 0.964392 0.962060
## lift_top_group 70.294120 4.211651 66.278910 71.639710 65.382550
## logloss 0.005190 0.000580 0.004865 0.005738 0.005819
## max_per_class_error 0.037356 0.008539 0.027211 0.044118 0.046980
## mcc 0.980871 0.004414 0.986095 0.977387 0.975872
## mean_per_class_accuracy 0.981322 0.004270 0.986395 0.977941 0.976510
## mean_per_class_error 0.018678 0.004270 0.013605 0.022059 0.023490
## mse 0.000640 0.000121 0.000583 0.000703 0.000823
## pr_auc 0.975311 0.004149 0.979787 0.970322 0.973553
## precision 1.000000 0.000000 1.000000 1.000000 1.000000
## r2 0.954624 0.007002 0.960751 0.948888 0.945353
## recall 0.962644 0.008539 0.972789 0.955882 0.953020
## rmse 0.025211 0.002340 0.024150 0.026524 0.028688
## specificity 1.000000 0.000000 1.000000 1.000000 1.000000
## cv_4_valid cv_5_valid
## accuracy 0.999487 0.999589
## auc 0.992452 0.991338
## err 0.000513 0.000411
## err_count 5.000000 4.000000
## f0point5 0.992126 0.993789
## f1 0.980545 0.984615
## f2 0.969231 0.975610
## lift_top_group 74.366410 73.803030
## logloss 0.004463 0.005067
## max_per_class_error 0.038168 0.030303
## mcc 0.980475 0.984527
## mean_per_class_accuracy 0.980916 0.984849
## mean_per_class_error 0.019084 0.015152
## mse 0.000548 0.000542
## pr_auc 0.979469 0.973423
## precision 1.000000 1.000000
## r2 0.958684 0.959444
## recall 0.961832 0.969697
## rmse 0.023412 0.023282
## specificity 1.000000 1.000000
auto_ml_models_h2o %>% typeof()
## [1] "S4"
auto_ml_models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
auto_ml_models_h2o@leaderboard
## model_id auc
## 1 GBM_3_AutoML_14_20240505_134306 0.9981166
## 2 GBM_1_AutoML_14_20240505_134306 0.9952748
## 3 StackedEnsemble_BestOfFamily_1_AutoML_14_20240505_134306 0.9951716
## 4 StackedEnsemble_AllModels_1_AutoML_14_20240505_134306 0.9947865
## 5 XGBoost_2_AutoML_14_20240505_134306 0.9945520
## 6 XGBoost_1_AutoML_14_20240505_134306 0.9944446
## logloss aucpr mean_per_class_error rmse mse
## 1 0.003951496 0.9879342 0.009285770 0.02247598 0.0005051699
## 2 0.002896306 0.9848636 0.009338793 0.02081201 0.0004331397
## 3 0.002945840 0.9854408 0.009338793 0.02103056 0.0004422843
## 4 0.019468813 0.9860488 0.009312282 0.06580897 0.0043308201
## 5 0.003376228 0.9860617 0.016666667 0.02157198 0.0004653503
## 6 0.003840098 0.9851185 0.016666667 0.02182597 0.0004763731
##
## [11 rows x 7 columns]
auto_ml_models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_3_AutoML_14_20240505_134306
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 28 28 28750 8
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 8 8.00000 23 127 77.57143
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.0004845609
## RMSE: 0.02201275
## LogLoss: 0.003707398
## Mean Per-Class Error: 0.01513915
## AUC: 0.9997291
## AUCPR: 0.9923869
## Gini: 0.9994583
## R^2: 0.9655459
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 48014 3 0.000062 =3/48017
## TRUE 21 674 0.030216 =21/695
## Totals 48035 677 0.000493 =24/48712
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.165603 0.982507 194
## 2 max f2 0.076166 0.977325 215
## 3 max f0point5 0.260806 0.992897 189
## 4 max accuracy 0.260806 0.999507 189
## 5 max precision 0.998540 1.000000 0
## 6 max recall 0.002166 1.000000 344
## 7 max specificity 0.998540 1.000000 0
## 8 max absolute_mcc 0.165603 0.982344 194
## 9 max min_per_class_accuracy 0.015633 0.991367 278
## 10 max mean_per_class_accuracy 0.015633 0.993226 278
## 11 max tns 0.998540 48017.000000 0
## 12 max fns 0.998540 694.000000 0
## 13 max fps 0.000990 48017.000000 399
## 14 max tps 0.002166 695.000000 344
## 15 max tnr 0.998540 1.000000 0
## 16 max fnr 0.998540 0.998561 0
## 17 max fpr 0.000990 1.000000 399
## 18 max tpr 0.002166 1.000000 344
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.00051924
## RMSE: 0.02278684
## LogLoss: 0.004263676
## Mean Per-Class Error: 0.01069687
## AUC: 0.9963584
## AUCPR: 0.9863581
## Gini: 0.9927167
## R^2: 0.9675187
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 8535 1 0.000117 =1/8536
## TRUE 3 138 0.021277 =3/141
## Totals 8538 139 0.000461 =4/8677
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.179921 0.985714 110
## 2 max f2 0.179921 0.981508 110
## 3 max f0point5 0.503643 0.994194 108
## 4 max accuracy 0.503643 0.999539 108
## 5 max precision 0.999805 1.000000 0
## 6 max recall 0.001290 1.000000 342
## 7 max specificity 0.999805 1.000000 0
## 8 max absolute_mcc 0.179921 0.985506 110
## 9 max min_per_class_accuracy 0.051895 0.985816 116
## 10 max mean_per_class_accuracy 0.051895 0.992439 116
## 11 max tns 0.999805 8536.000000 0
## 12 max fns 0.999805 140.000000 0
## 13 max fps 0.000992 8536.000000 399
## 14 max tps 0.001290 141.000000 342
## 15 max tnr 0.999805 1.000000 0
## 16 max fnr 0.999805 0.992908 0
## 17 max fpr 0.000992 1.000000 399
## 18 max tpr 0.001290 1.000000 342
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.0006399879
## RMSE: 0.02529798
## LogLoss: 0.005190406
## Mean Per-Class Error: 0.01800644
## AUC: 0.9923808
## AUCPR: 0.9750668
## Gini: 0.9847616
## R^2: 0.9544945
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 48015 2 0.000042 =2/48017
## TRUE 25 670 0.035971 =25/695
## Totals 48040 672 0.000554 =27/48712
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.383374 0.980249 198
## 2 max f2 0.383374 0.970452 198
## 3 max f0point5 0.432512 0.991111 196
## 4 max accuracy 0.432512 0.999446 196
## 5 max precision 0.999655 1.000000 0
## 6 max recall 0.000939 1.000000 393
## 7 max specificity 0.999655 1.000000 0
## 8 max absolute_mcc 0.383374 0.980110 198
## 9 max min_per_class_accuracy 0.003917 0.978417 325
## 10 max mean_per_class_accuracy 0.072631 0.982432 221
## 11 max tns 0.999655 48017.000000 0
## 12 max fns 0.999655 688.000000 0
## 13 max fps 0.000785 48017.000000 399
## 14 max tps 0.000939 695.000000 393
## 15 max tnr 0.999655 1.000000 0
## 16 max fnr 0.999655 0.989928 0
## 17 max fpr 0.000785 1.000000 399
## 18 max tpr 0.000939 1.000000 393
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.999466 0.000134 0.999589 0.999384 0.999281
## auc 0.993332 0.001697 0.995541 0.994577 0.992754
## err 0.000534 0.000134 0.000411 0.000616 0.000719
## err_count 5.200000 1.303840 4.000000 6.000000 7.000000
## f0point5 0.992289 0.001814 0.994437 0.990854 0.990237
## f1 0.980951 0.004432 0.986207 0.977444 0.975945
## f2 0.969881 0.006934 0.978112 0.964392 0.962060
## lift_top_group 70.294120 4.211651 66.278910 71.639710 65.382550
## logloss 0.005190 0.000580 0.004865 0.005738 0.005819
## max_per_class_error 0.037356 0.008539 0.027211 0.044118 0.046980
## mcc 0.980871 0.004414 0.986095 0.977387 0.975872
## mean_per_class_accuracy 0.981322 0.004270 0.986395 0.977941 0.976510
## mean_per_class_error 0.018678 0.004270 0.013605 0.022059 0.023490
## mse 0.000640 0.000121 0.000583 0.000703 0.000823
## pr_auc 0.975311 0.004149 0.979787 0.970322 0.973553
## precision 1.000000 0.000000 1.000000 1.000000 1.000000
## r2 0.954624 0.007002 0.960751 0.948888 0.945353
## recall 0.962644 0.008539 0.972789 0.955882 0.953020
## rmse 0.025211 0.002340 0.024150 0.026524 0.028688
## specificity 1.000000 0.000000 1.000000 1.000000 1.000000
## cv_4_valid cv_5_valid
## accuracy 0.999487 0.999589
## auc 0.992452 0.991338
## err 0.000513 0.000411
## err_count 5.000000 4.000000
## f0point5 0.992126 0.993789
## f1 0.980545 0.984615
## f2 0.969231 0.975610
## lift_top_group 74.366410 73.803030
## logloss 0.004463 0.005067
## max_per_class_error 0.038168 0.030303
## mcc 0.980475 0.984527
## mean_per_class_accuracy 0.980916 0.984849
## mean_per_class_error 0.019084 0.015152
## mse 0.000548 0.000542
## pr_auc 0.979469 0.973423
## precision 1.000000 1.000000
## r2 0.958684 0.959444
## recall 0.961832 0.969697
## rmse 0.023412 0.023282
## specificity 1.000000 1.000000
predictions <- h2o.predict(best_model, newdata = test_h2o)
##
|
| | 0%
|
|======================================================================| 100%
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 19,130 × 24
## predict FALSE. TRUE. expedition_id member_id peak_id peak_name year season
## <fct> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 FALSE 0.998 0.00174 AMAD78301 AMAD7830… AMAD Ama Dabl… 1978 Autumn
## 2 FALSE 0.999 0.00122 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 3 FALSE 0.997 0.00261 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 4 FALSE 0.996 0.00442 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 5 FALSE 0.997 0.00267 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 6 FALSE 0.999 0.00113 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 7 FALSE 0.999 0.00118 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 8 FALSE 0.999 0.00126 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 9 FALSE 0.997 0.00269 AMAD79101 AMAD7910… AMAD Ama Dabl… 1979 Spring
## 10 FALSE 0.999 0.00113 AMAD79301 AMAD7930… AMAD Ama Dabl… 1979 Autumn
## # ℹ 19,120 more rows
## # ℹ 15 more variables: sex <chr>, age <dbl>, citizenship <chr>,
## # expedition_role <chr>, hired <lgl>, highpoint_metres <dbl>, success <lgl>,
## # solo <lgl>, oxygen_used <lgl>, died <lgl>, death_cause <chr>,
## # death_height_metres <dbl>, injured <lgl>, injury_type <chr>,
## # injury_height_metres <dbl>
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_3_AutoML_14_20240505_134306"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_3_AutoML_14_20240505_134306"
##
##
## $model_checksum
## [1] "4288655587182429368"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_b77b_3"
##
##
## $frame_checksum
## [1] "678340420273909232"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.714931e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.0005051699
##
## $RMSE
## [1] 0.02247598
##
## $nobs
## [1] 19130
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.9636954
##
## $logloss
## [1] 0.003951496
##
## $AUC
## [1] 0.9981166
##
## $pr_auc
## [1] 0.9879342
##
## $Gini
## [1] 0.9962333
##
## $mean_per_class_error
## [1] 0.00928577
##
## $domain
## [1] "FALSE" "TRUE"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## FALSE TRUE Error Rate
## FALSE 18859 1 0.0001 = 1 / 18,860
## TRUE 5 265 0.0185 = 5 / 270
## Totals 18864 266 0.0003 = 6 / 19,130
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.998222 0.014706 0.009242 0.035971 0.985991 1.000000 0.007407 1.000000
## 2 0.998009 0.021978 0.013850 0.053191 0.986043 1.000000 0.011111 1.000000
## 3 0.997652 0.029197 0.018450 0.069930 0.986095 1.000000 0.014815 1.000000
## 4 0.997258 0.043478 0.027624 0.102041 0.986200 1.000000 0.022222 1.000000
## 5 0.997069 0.050542 0.032199 0.117450 0.986252 1.000000 0.025926 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.085461 0.007407 0.503704 18860 268 0 2
## 2 0.104671 0.011111 0.505556 18860 267 0 3
## 3 0.120867 0.014815 0.507407 18860 266 0 4
## 4 0.148039 0.022222 0.511111 18860 264 0 6
## 5 0.159904 0.025926 0.512963 18860 263 0 7
## tnr fnr fpr tpr idx
## 1 1.000000 0.992593 0.000000 0.007407 0
## 2 1.000000 0.988889 0.000000 0.011111 1
## 3 1.000000 0.985185 0.000000 0.014815 2
## 4 1.000000 0.977778 0.000000 0.022222 3
## 5 1.000000 0.974074 0.000000 0.025926 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.001071 0.029197 0.069930 0.018450 0.061422 0.014815 1.000000
## 396 0.001062 0.028435 0.068178 0.017963 0.035494 0.014422 1.000000
## 397 0.001057 0.028016 0.067214 0.017696 0.020648 0.014207 1.000000
## 398 0.001038 0.027926 0.067007 0.017638 0.017407 0.014161 1.000000
## 399 0.001022 0.027880 0.066901 0.017609 0.015734 0.014137 1.000000
## 400 0.000991 0.027835 0.066799 0.017580 0.014114 0.014114 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.047985 0.026663 0.047985 0.523993 905
## 396 0.021686 0.017685 0.021686 0.510843 409
## 397 0.006628 0.009704 0.006628 0.503314 125
## 398 0.003340 0.006878 0.003340 0.501670 63
## 399 0.001644 0.004820 0.001644 0.500822 31
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 17955 270 0.047985 0.000000 0.952015 1.000000 394
## 396 0 18451 270 0.021686 0.000000 0.978314 1.000000 395
## 397 0 18735 270 0.006628 0.000000 0.993372 1.000000 396
## 398 0 18797 270 0.003340 0.000000 0.996660 1.000000 397
## 399 0 18829 270 0.001644 0.000000 0.998356 1.000000 398
## 400 0 18860 270 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.260806 0.988806 162
## 2 max f2 0.260806 0.984398 162
## 3 max f0point5 0.260806 0.993253 162
## 4 max accuracy 0.260806 0.999686 162
## 5 max precision 0.998222 1.000000 0
## 6 max recall 0.001327 1.000000 355
## 7 max specificity 0.998222 1.000000 0
## 8 max absolute_mcc 0.260806 0.988675 162
## 9 max min_per_class_accuracy 0.005286 0.988889 250
## 10 max mean_per_class_accuracy 0.260806 0.990714 162
## 11 max tns 0.998222 18860.000000 0
## 12 max fns 0.998222 268.000000 0
## 13 max fps 0.000991 18860.000000 399
## 14 max tps 0.001327 270.000000 355
## 15 max tnr 0.998222 1.000000 0
## 16 max fnr 0.998222 0.992593 0
## 17 max fpr 0.000991 1.000000 399
## 18 max tpr 0.001327 1.000000 355
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 1.41 %, avg score: 1.48 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01003659 0.972142 70.851852 70.851852
## 2 2 0.02002091 0.007671 27.450456 49.207814
## 3 3 0.03005750 0.003420 0.369020 32.899903
## 4 4 0.04004182 0.002628 0.000000 24.696403
## 5 5 0.05007841 0.002198 0.000000 19.746810
## 6 6 0.10020910 0.001816 0.073881 9.905215
## 7 7 0.15007841 0.001624 0.000000 6.613827
## 8 8 0.20000000 0.001501 0.074190 4.981481
## 9 9 0.30000000 0.001317 0.037037 3.333333
## 10 10 0.40120230 0.001252 0.000000 2.492508
## 11 11 0.50000000 0.001209 0.000000 2.000000
## 12 12 0.60000000 0.001179 0.000000 1.666667
## 13 13 0.70104548 0.001162 0.000000 1.426441
## 14 14 0.80219550 0.001147 0.000000 1.246579
## 15 15 0.90026137 0.001088 0.000000 1.110789
## 16 16 1.00000000 0.000944 0.000000 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.988205 1.000000 0.988205
## 2 0.387435 0.351151 0.694517 0.670510
## 3 0.005208 0.004836 0.464348 0.448233
## 4 0.000000 0.002974 0.348564 0.337209
## 5 0.000000 0.002460 0.278706 0.270119
## 6 0.001043 0.001951 0.139802 0.135965
## 7 0.000000 0.001736 0.093347 0.091362
## 8 0.001047 0.001554 0.070308 0.068945
## 9 0.000523 0.001402 0.047047 0.046431
## 10 0.000000 0.001279 0.035179 0.035042
## 11 0.000000 0.001231 0.028228 0.028361
## 12 0.000000 0.001192 0.023523 0.023833
## 13 0.000000 0.001171 0.020133 0.020566
## 14 0.000000 0.001156 0.017594 0.018119
## 15 0.000000 0.001124 0.015678 0.016268
## 16 0.000000 0.001068 0.014114 0.014752
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.711111 0.711111 6985.185185 6985.185185
## 2 0.274074 0.985185 2645.045569 4820.781356
## 3 0.003704 0.988889 -63.097994 3189.990338
## 4 0.000000 0.988889 -100.000000 2369.640267
## 5 0.000000 0.988889 -100.000000 1874.681048
## 6 0.003704 0.992593 -92.611903 890.521455
## 7 0.000000 0.992593 -100.000000 561.382664
## 8 0.003704 0.996296 -92.580958 398.148148
## 9 0.003704 1.000000 -96.296296 233.333333
## 10 0.000000 1.000000 -100.000000 149.250814
## 11 0.000000 1.000000 -100.000000 100.000000
## 12 0.000000 1.000000 -100.000000 66.666667
## 13 0.000000 1.000000 -100.000000 42.644098
## 14 0.000000 1.000000 -100.000000 24.657891
## 15 0.000000 1.000000 -100.000000 11.078853
## 16 0.000000 1.000000 -100.000000 0.000000
## kolmogorov_smirnov
## 1 0.711111
## 2 0.978982
## 3 0.972558
## 4 0.962431
## 5 0.952251
## 6 0.905159
## 7 0.854576
## 8 0.807696
## 9 0.710021
## 10 0.607370
## 11 0.507158
## 12 0.405726
## 13 0.303234
## 14 0.200636
## 15 0.101166
## 16 0.000000
h2o.auc(performance_h2o)
## [1] 0.9981166
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.260806196847446:
## FALSE TRUE Error Rate
## FALSE 18859 1 0.000053 =1/18860
## TRUE 5 265 0.018519 =5/270
## Totals 18864 266 0.000314 =6/19130
h2o.metric(performance_h2o) %>% as_tibble() %>% filter(threshold %>% between (0.43, 0.44))
## # A tibble: 0 × 20
## # ℹ 20 variables: threshold <dbl>, f1 <dbl>, f2 <dbl>, f0point5 <dbl>,
## # accuracy <dbl>, precision <dbl>, recall <dbl>, specificity <dbl>,
## # absolute_mcc <dbl>, min_per_class_accuracy <dbl>,
## # mean_per_class_accuracy <dbl>, tns <dbl>, fns <dbl>, fps <dbl>, tps <dbl>,
## # tnr <dbl>, fnr <dbl>, fpr <dbl>, tpr <dbl>, idx <int>