library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ rsample 1.2.1
## ✔ dials 1.4.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.0 ✔ yardstick 1.3.2
## ✔ recipes 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-08-15/spam.csv')
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>% skimr::skim()
Data summary

|                        |           |
|:-----------------------|:----------|
|Name                    |Piped data |
|Number of rows          |4601       |
|Number of columns       |7          |
|Column type frequency:  |           |
|  character             |1          |
|  numeric               |6          |
|Group variables         |None       |

Variable type: character

Variable type: numeric

|skim_variable | n_missing| complete_rate|   mean|     sd| p0| p25| p50|    p75|     p100|hist  |
|:-------------|---------:|-------------:|------:|------:|--:|---:|---:|------:|--------:|:-----|
|crl.tot       |         0|             1| 283.29| 606.35|  1|  35|  95| 266.00| 15841.00|▇▁▁▁▁ |
|dollar        |         0|             1|   0.08|   0.25|  0|   0|   0|   0.05|     6.00|▇▁▁▁▁ |
|bang          |         0|             1|   0.27|   0.82|  0|   0|   0|   0.32|    32.48|▇▁▁▁▁ |
|money         |         0|             1|   0.09|   0.44|  0|   0|   0|   0.00|    12.50|▇▁▁▁▁ |
|n000          |         0|             1|   0.10|   0.35|  0|   0|   0|   0.00|     5.45|▇▁▁▁▁ |
|make          |         0|             1|   0.10|   0.31|  0|   0|   0|   0.00|     4.54|▇▁▁▁▁ |
Clean data
data_clean <- data %>%
  # Convert the character outcome to a factor, with "y" (spam) as the first level
# mutate(across(where(is.character), as.factor)) %>%
mutate(yesno = factor(yesno, levels = c("y", "n")))
data_clean
## # A tibble: 4,601 × 7
## crl.tot dollar bang money n000 make yesno
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 278 0 0.778 0 0 0 y
## 2 1028 0.18 0.372 0.43 0.43 0.21 y
## 3 2259 0.184 0.276 0.06 1.16 0.06 y
## 4 191 0 0.137 0 0 0 y
## 5 191 0 0.135 0 0 0 y
## 6 54 0 0 0 0 0 y
## 7 112 0.054 0.164 0 0 0 y
## 8 49 0 0 0 0 0 y
## 9 1257 0.203 0.181 0.15 0 0.15 y
## 10 749 0.081 0.244 0 0.19 0.06 y
## # ℹ 4,591 more rows
skimr::skim(data_clean)
Data summary

|                        |           |
|:-----------------------|:----------|
|Name                    |data_clean |
|Number of rows          |4601       |
|Number of columns       |7          |
|Column type frequency:  |           |
|  factor                |1          |
|  numeric               |6          |
|Group variables         |None       |

Variable type: factor

|skim_variable | n_missing| complete_rate|ordered | n_unique|top_counts       |
|:-------------|---------:|-------------:|:-------|--------:|:----------------|
|yesno         |         0|             1|FALSE   |        2|n: 2788, y: 1813 |

Variable type: numeric

|skim_variable | n_missing| complete_rate|   mean|     sd| p0| p25| p50|    p75|     p100|hist  |
|:-------------|---------:|-------------:|------:|------:|--:|---:|---:|------:|--------:|:-----|
|crl.tot       |         0|             1| 283.29| 606.35|  1|  35|  95| 266.00| 15841.00|▇▁▁▁▁ |
|dollar        |         0|             1|   0.08|   0.25|  0|   0|   0|   0.05|     6.00|▇▁▁▁▁ |
|bang          |         0|             1|   0.27|   0.82|  0|   0|   0|   0.32|    32.48|▇▁▁▁▁ |
|money         |         0|             1|   0.09|   0.44|  0|   0|   0|   0.00|    12.50|▇▁▁▁▁ |
|n000          |         0|             1|   0.10|   0.35|  0|   0|   0|   0.00|     5.45|▇▁▁▁▁ |
|make          |         0|             1|   0.10|   0.31|  0|   0|   0|   0.00|     4.54|▇▁▁▁▁ |
Split data
set.seed(1234)
data_split <- initial_split(data_clean, strata = "yesno")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
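A quick check, not in the original post, that stratifying on yesno kept the spam/non-spam mix similar in both sets:

# Sanity check: class balance should be roughly equal in train and test
bind_rows(
    train_tbl %>% count(yesno) %>% mutate(set = "train"),
    test_tbl  %>% count(yesno) %>% mutate(set = "test")
) %>%
    group_by(set) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup()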
Recipes
recipe_obj <- recipe(yesno ~ ., data = train_tbl) %>%
step_zv(all_predictors())
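Note that the recipe is only defined here; it is never prepped or baked, so the zero-variance filter is not actually applied before the data is handed to H2O. A minimal sketch of how that step could look (hypothetical, not part of the original workflow):

# Prep the recipe on the training data and bake both sets (sketch)
recipe_prepped  <- prep(recipe_obj, training = train_tbl)
train_processed <- bake(recipe_prepped, new_data = NULL)     # processed training set
test_processed  <- bake(recipe_prepped, new_data = test_tbl)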
Model
# Initialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 2 hours
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 15 days
## H2O cluster name: H2O_started_from_R_aldendimick_ggl822
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.35 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.1 (2024-06-14)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 15 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
y <- "yesno"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
# max_runtime_secs = 30,
max_models = 10,
exclude_algos = "DeepLearning",
nfolds = 5,
seed = 3456
)
## 12:08:18.169: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 12:08:18.171: AutoML: XGBoost is not available; skipping it.
Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 GBM_grid_1_AutoML_12_20250506_120818_model_1 0.9195293 0.3175906
## 2 GBM_5_AutoML_12_20250506_120818 0.9188403 0.3159186
## 3 GBM_grid_1_AutoML_12_20250506_120818_model_2 0.9187534 0.3224471
## 4 GBM_4_AutoML_12_20250506_120818 0.9187139 0.3211553
## 5 StackedEnsemble_BestOfFamily_1_AutoML_12_20250506_120818 0.9185417 0.3190287
## 6 StackedEnsemble_AllModels_1_AutoML_12_20250506_120818 0.9185069 0.3143932
## aucpr mean_per_class_error rmse mse
## 1 0.9146886 0.1244588 0.3048919 0.09295905
## 2 0.9156096 0.1277122 0.3053640 0.09324718
## 3 0.9131192 0.1328048 0.3088534 0.09539039
## 4 0.9117697 0.1245394 0.3084129 0.09511850
## 5 0.9146343 0.1262269 0.3031829 0.09191989
## 6 0.9134479 0.1268132 0.3030065 0.09181291
##
## [12 rows x 7 columns]
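The printed leaderboard is truncated. To pull the full leaderboard, with any extra columns available, into a tibble for easier sorting and filtering (a sketch using h2o's leaderboard helper):

# Full leaderboard as a tibble (sketch)
h2o.get_leaderboard(models_h2o, extra_columns = "ALL") %>%
    as_tibble()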
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_grid_1_AutoML_12_20250506_120818_model_1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 56 56 11862 4
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 9 7.01786 8 16 12.12500
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.08633413
## RMSE: 0.2938267
## LogLoss: 0.2918996
## Mean Per-Class Error: 0.1244266
## AUC: 0.9359978
## AUCPR: 0.9301509
## Gini: 0.8719957
## R^2: 0.6389747
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 1657 114 0.064370 =114/1771
## y 214 946 0.184483 =214/1160
## Totals 1871 1060 0.111907 =328/2931
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.460105 0.852252 196
## 2 max f2 0.172882 0.864482 298
## 3 max f0point5 0.744784 0.899737 118
## 4 max accuracy 0.481078 0.888434 189
## 5 max precision 0.993124 1.000000 0
## 6 max recall 0.020360 1.000000 397
## 7 max specificity 0.993124 1.000000 0
## 8 max absolute_mcc 0.588570 0.765409 161
## 9 max min_per_class_accuracy 0.299660 0.866379 244
## 10 max mean_per_class_accuracy 0.398256 0.876259 209
## 11 max tns 0.993124 1771.000000 0
## 12 max fns 0.993124 1154.000000 0
## 13 max fps 0.018317 1771.000000 399
## 14 max tps 0.020360 1160.000000 397
## 15 max tnr 0.993124 1.000000 0
## 16 max fnr 0.993124 0.994828 0
## 17 max fpr 0.018317 1.000000 399
## 18 max tpr 0.020360 1.000000 397
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.1050428
## RMSE: 0.3241031
## LogLoss: 0.3368799
## Mean Per-Class Error: 0.1566033
## AUC: 0.9227701
## AUCPR: 0.9040581
## Gini: 0.8455402
## R^2: 0.5556779
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 305 15 0.046875 =15/320
## y 53 146 0.266332 =53/199
## Totals 358 161 0.131021 =68/519
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.553173 0.811111 148
## 2 max f2 0.133945 0.865116 251
## 3 max f0point5 0.721787 0.870968 131
## 4 max accuracy 0.553173 0.868979 148
## 5 max precision 0.993194 1.000000 0
## 6 max recall 0.052004 1.000000 346
## 7 max specificity 0.993194 1.000000 0
## 8 max absolute_mcc 0.553173 0.721894 148
## 9 max min_per_class_accuracy 0.228198 0.825000 207
## 10 max mean_per_class_accuracy 0.553173 0.843397 148
## 11 max tns 0.993194 320.000000 0
## 12 max fns 0.993194 198.000000 0
## 13 max fps 0.018317 320.000000 356
## 14 max tps 0.052004 199.000000 346
## 15 max tnr 0.993194 1.000000 0
## 16 max fnr 0.993194 0.994975 0
## 17 max fpr 0.018317 1.000000 356
## 18 max tpr 0.052004 1.000000 346
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.09373318
## RMSE: 0.3061587
## LogLoss: 0.3147025
## Mean Per-Class Error: 0.1332391
## AUC: 0.9249299
## AUCPR: 0.9168586
## Gini: 0.8498598
## R^2: 0.608034
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 1638 133 0.075099 =133/1771
## y 222 938 0.191379 =222/1160
## Totals 1860 1071 0.121119 =355/2931
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.452768 0.840879 198
## 2 max f2 0.175893 0.855567 294
## 3 max f0point5 0.685544 0.888795 133
## 4 max accuracy 0.553867 0.881269 170
## 5 max precision 0.994669 1.000000 0
## 6 max recall 0.015839 1.000000 399
## 7 max specificity 0.994669 1.000000 0
## 8 max absolute_mcc 0.553867 0.751023 170
## 9 max min_per_class_accuracy 0.291899 0.853448 251
## 10 max mean_per_class_accuracy 0.452768 0.866761 198
## 11 max tns 0.994669 1771.000000 0
## 12 max fns 0.994669 1141.000000 0
## 13 max fps 0.015839 1771.000000 399
## 14 max tps 0.015839 1160.000000 399
## 15 max tnr 0.994669 1.000000 0
## 16 max fnr 0.994669 0.983621 0
## 17 max fpr 0.015839 1.000000 399
## 18 max tpr 0.015839 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.882637 0.013798 0.873935 0.895904 0.899317
## auc 0.924739 0.010715 0.932145 0.923461 0.938504
## err 0.117363 0.013798 0.126065 0.104096 0.100683
## err_count 68.800000 8.105554 74.000000 61.000000 59.000000
## f0point5 0.869741 0.028128 0.834731 0.901961 0.892193
## f1 0.844977 0.018076 0.843220 0.857809 0.866817
## f2 0.822492 0.029111 0.851884 0.817778 0.842845
## lift_top_group 2.526724 0.001928 2.530172 2.525862 2.525862
## logloss 0.315103 0.025279 0.311988 0.306174 0.280152
## max_per_class_error 0.191379 0.040273 0.142241 0.206897 0.172414
## mcc 0.754487 0.028806 0.738154 0.783029 0.788463
## mean_per_class_accuracy 0.869880 0.014271 0.871133 0.878190 0.886957
## mean_per_class_error 0.130120 0.014271 0.128867 0.121810 0.113043
## mse 0.093758 0.009047 0.093147 0.088820 0.081773
## pr_auc 0.916488 0.014438 0.919953 0.919491 0.936460
## precision 0.887694 0.042510 0.829167 0.934010 0.909953
## r2 0.607929 0.037824 0.610301 0.628622 0.658086
## recall 0.808621 0.040273 0.857759 0.793103 0.827586
## rmse 0.305912 0.014842 0.305200 0.298027 0.285961
## specificity 0.931139 0.032065 0.884507 0.963277 0.946328
## cv_4_valid cv_5_valid
## accuracy 0.873720 0.870307
## auc 0.912021 0.917562
## err 0.126280 0.129693
## err_count 74.000000 76.000000
## f0point5 0.849820 0.870000
## f1 0.836283 0.820755
## f2 0.823171 0.776786
## lift_top_group 2.525862 2.525862
## logloss 0.346989 0.330213
## max_per_class_error 0.185345 0.250000
## mcc 0.734309 0.728482
## mean_per_class_accuracy 0.863542 0.849576
## mean_per_class_error 0.136458 0.150424
## mse 0.104150 0.100901
## pr_auc 0.897600 0.908932
## precision 0.859091 0.906250
## r2 0.564524 0.578110
## recall 0.814655 0.750000
## rmse 0.322723 0.317649
## specificity 0.912429 0.949152
Save and Load
?h2o.getModel
?h2o.saveModel
?h2o.loadModel
# h2o.getModel("GBM_grid_1_AutoML_5_20250506_112018_model_9") %>%
# h2o.saveModel("h2o_models2/")
best_model <- h2o.loadModel("h2o_models2/GBM_grid_1_AutoML_5_20250506_112018_model_9")
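The path above points to a model saved from an earlier AutoML run. To persist the leader of the current run to the same directory, the save step would look roughly like this (a sketch; force = TRUE overwrites an existing file of the same name):

# Save the current AutoML leader to disk (sketch)
models_h2o@leader %>%
    h2o.saveModel(path = "h2o_models2/", force = TRUE)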
Make predictions
predictions <- h2o.predict(best_model, newdata = test_h2o)
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 1,151 × 10
## predict n y crl.tot dollar bang money n000 make yesno
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 y 0.0103 0.990 1028 0.18 0.372 0.43 0.43 0.21 y
## 2 n 0.901 0.0988 54 0 0 0 0 0 y
## 3 y 0.237 0.763 2259 0.046 0.25 0 0.05 0.05 y
## 4 y 0.352 0.648 34 0 0.809 0 0 0 y
## 5 y 0.0783 0.922 82 0.196 0.392 0 0 0 y
## 6 y 0.134 0.866 47 0 0.368 3.33 0 0 y
## 7 y 0.308 0.692 129 0 0.091 0.65 0 0 y
## 8 y 0.267 0.733 59 0 0.886 0 0 1.17 y
## 9 y 0.375 0.625 89 0.091 0 0.27 0 0 y
## 10 y 0.0671 0.933 239 0.244 0.488 0 0.48 0 y
## # ℹ 1,141 more rows
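Because the predictions are already bound to the test set, the same tibble can be scored with yardstick as a cross-check on the H2O metrics below. A sketch (the predict column is re-leveled to match yesno so that "y" is treated as the event):

eval_tbl <- predictions_tbl %>%
    bind_cols(test_tbl) %>%
    mutate(predict = factor(predict, levels = levels(yesno)))

eval_tbl %>% accuracy(truth = yesno, estimate = predict)
eval_tbl %>% roc_auc(truth = yesno, y)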
Evaluate model
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_grid_1_AutoML_5_20250506_112018_model_9"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_grid_1_AutoML_5_20250506_112018_model_9"
##
##
## $model_checksum
## [1] "-7639920851749533421"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_9edd_3"
##
##
## $frame_checksum
## [1] "5361656593914358099"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.746548e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.09336355
##
## $RMSE
## [1] 0.3055545
##
## $nobs
## [1] 1151
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.6091237
##
## $logloss
## [1] 0.3140552
##
## $AUC
## [1] 0.9216324
##
## $pr_auc
## [1] 0.9185834
##
## $Gini
## [1] 0.8432647
##
## $mean_per_class_error
## [1] 0.1260452
##
## $domain
## [1] "n" "y"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## n y Error Rate
## n 661 36 0.0516 = 36 / 697
## y 91 363 0.2004 = 91 / 454
## Totals 752 399 0.1103 = 127 / 1,151
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.998490 0.059829 0.038251 0.137255 0.617724 1.000000 0.030837 1.000000
## 2 0.998061 0.080338 0.051771 0.179245 0.622068 1.000000 0.041850 1.000000
## 3 0.997566 0.096436 0.062534 0.210623 0.625543 1.000000 0.050661 1.000000
## 4 0.997085 0.116183 0.075922 0.247350 0.629887 1.000000 0.061674 1.000000
## 5 0.996452 0.135524 0.089237 0.281570 0.634231 1.000000 0.072687 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.137490 0.030837 0.515419 697 440 0 14
## 2 0.160525 0.041850 0.520925 697 435 0 19
## 3 0.176929 0.050661 0.525330 697 431 0 23
## 4 0.195649 0.061674 0.530837 697 426 0 28
## 5 0.212875 0.072687 0.536344 697 421 0 33
## tnr fnr fpr tpr idx
## 1 1.000000 0.969163 0.000000 0.030837 0
## 2 1.000000 0.958150 0.000000 0.041850 1
## 3 1.000000 0.949339 0.000000 0.050661 2
## 4 1.000000 0.938326 0.000000 0.061674 3
## 5 1.000000 0.927313 0.000000 0.072687 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.034039 0.567500 0.766374 0.450576 0.398784 0.396161 1.000000
## 396 0.032176 0.567146 0.766115 0.450218 0.397915 0.395815 1.000000
## 397 0.031255 0.566792 0.765857 0.449861 0.397046 0.395470 1.000000
## 398 0.028624 0.566438 0.765599 0.449505 0.396177 0.395126 1.000000
## 399 0.026161 0.566085 0.765341 0.449149 0.395308 0.394783 1.000000
## 400 0.018994 0.565732 0.765083 0.448794 0.394440 0.394440 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.007174 0.053309 0.007174 0.503587 5
## 396 0.005739 0.047661 0.005739 0.502869 4
## 397 0.004304 0.041257 0.004304 0.502152 3
## 398 0.002869 0.033672 0.002869 0.501435 2
## 399 0.001435 0.023799 0.001435 0.500717 1
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 692 454 0.007174 0.000000 0.992826 1.000000 394
## 396 0 693 454 0.005739 0.000000 0.994261 1.000000 395
## 397 0 694 454 0.004304 0.000000 0.995696 1.000000 396
## 398 0 695 454 0.002869 0.000000 0.997131 1.000000 397
## 399 0 696 454 0.001435 0.000000 0.998565 1.000000 398
## 400 0 697 454 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.536846 0.851114 181
## 2 max f2 0.254687 0.846843 266
## 3 max f0point5 0.583559 0.893756 168
## 4 max accuracy 0.570686 0.890530 171
## 5 max precision 0.998490 1.000000 0
## 6 max recall 0.039450 1.000000 393
## 7 max specificity 0.998490 1.000000 0
## 8 max absolute_mcc 0.570686 0.770944 171
## 9 max min_per_class_accuracy 0.318139 0.852423 239
## 10 max mean_per_class_accuracy 0.536846 0.873955 181
## 11 max tns 0.998490 697.000000 0
## 12 max fns 0.998490 440.000000 0
## 13 max fps 0.018994 697.000000 399
## 14 max tps 0.039450 454.000000 393
## 15 max tnr 0.998490 1.000000 0
## 16 max fnr 0.998490 0.969163 0
## 17 max fpr 0.018994 1.000000 399
## 18 max tpr 0.039450 1.000000 393
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 39.44 %, avg score: 40.43 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01042572 0.998308 2.535242 2.535242
## 2 2 0.02085143 0.997181 2.535242 2.535242
## 3 3 0.03040834 0.996217 2.535242 2.535242
## 4 4 0.04083406 0.995525 2.535242 2.535242
## 5 5 0.05039096 0.993871 2.535242 2.535242
## 6 6 0.10078193 0.984868 2.491531 2.513387
## 7 7 0.15030408 0.968155 2.535242 2.520588
## 8 8 0.20069505 0.930470 2.491531 2.513292
## 9 9 0.30060817 0.709669 2.094331 2.374042
## 10 10 0.40139010 0.358650 1.202055 2.079777
## 11 11 0.50043440 0.182334 0.511496 1.769388
## 12 12 0.60642919 0.136069 0.394833 1.529136
## 13 13 0.70634231 0.103549 0.242501 1.347140
## 14 14 0.80017376 0.068769 0.187796 1.211191
## 15 15 0.90790617 0.039689 0.184010 1.089305
## 16 16 1.00000000 0.018994 0.119587 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.998524 1.000000 0.998524
## 2 1.000000 0.997861 1.000000 0.998192
## 3 1.000000 0.996634 1.000000 0.997703
## 4 1.000000 0.995912 1.000000 0.997245
## 5 1.000000 0.994906 1.000000 0.996802
## 6 0.982759 0.989673 0.991379 0.993237
## 7 1.000000 0.977650 0.994220 0.988102
## 8 0.982759 0.950777 0.991342 0.978730
## 9 0.826087 0.841582 0.936416 0.933146
## 10 0.474138 0.521028 0.820346 0.829671
## 11 0.201754 0.267275 0.697917 0.718363
## 12 0.155738 0.145150 0.603152 0.618174
## 13 0.095652 0.116912 0.531365 0.547270
## 14 0.074074 0.088899 0.477742 0.493520
## 15 0.072581 0.054050 0.429665 0.441372
## 16 0.047170 0.038899 0.394440 0.404307
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.026432 0.026432 153.524229 153.524229
## 2 0.026432 0.052863 153.524229 153.524229
## 3 0.024229 0.077093 153.524229 153.524229
## 4 0.026432 0.103524 153.524229 153.524229
## 5 0.024229 0.127753 153.524229 153.524229
## 6 0.125551 0.253304 149.153122 151.338675
## 7 0.125551 0.378855 153.524229 152.058771
## 8 0.125551 0.504405 149.153122 151.329214
## 9 0.209251 0.713656 109.433059 137.404191
## 10 0.121145 0.834802 20.205453 107.977668
## 11 0.050661 0.885463 -48.850375 76.938785
## 12 0.041850 0.927313 -60.516718 52.913611
## 13 0.024229 0.951542 -75.749856 34.713982
## 14 0.017621 0.969163 -81.220427 21.119067
## 15 0.019824 0.988987 -81.599048 8.930506
## 16 0.011013 1.000000 -88.041310 0.000000
## kolmogorov_smirnov
## 1 0.026432
## 2 0.052863
## 3 0.077093
## 4 0.103524
## 5 0.127753
## 6 0.251869
## 7 0.377420
## 8 0.501536
## 9 0.682093
## 10 0.715720
## 11 0.635821
## 12 0.529895
## 13 0.404913
## 14 0.279063
## 15 0.133894
## 16 0.000000
h2o.auc(performance_h2o)
## [1] 0.9216324
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.536845881880662:
## n y Error Rate
## n 661 36 0.051650 =36/697
## y 91 363 0.200441 =91/454
## Totals 752 399 0.110339 =127/1151
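The confusion matrix above is computed at the F1-optimal threshold (about 0.54). If a different cutoff is needed, say a fixed 0.5, the matrix can be recomputed at that threshold (a sketch):

# Confusion matrix at a fixed 0.5 cutoff instead of the F1-optimal threshold (sketch)
h2o.confusionMatrix(performance_h2o, thresholds = 0.5)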
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.998490 0.059829 0.038251 0.137255 0.617724 1.000000 0.030837 1.000000
## 2 0.998061 0.080338 0.051771 0.179245 0.622068 1.000000 0.041850 1.000000
## 3 0.997566 0.096436 0.062534 0.210623 0.625543 1.000000 0.050661 1.000000
## 4 0.997085 0.116183 0.075922 0.247350 0.629887 1.000000 0.061674 1.000000
## 5 0.996452 0.135524 0.089237 0.281570 0.634231 1.000000 0.072687 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.137490 0.030837 0.515419 697 440 0 14
## 2 0.160525 0.041850 0.520925 697 435 0 19
## 3 0.176929 0.050661 0.525330 697 431 0 23
## 4 0.195649 0.061674 0.530837 697 426 0 28
## 5 0.212875 0.072687 0.536344 697 421 0 33
## tnr fnr fpr tpr idx
## 1 1.000000 0.969163 0.000000 0.030837 0
## 2 1.000000 0.958150 0.000000 0.041850 1
## 3 1.000000 0.949339 0.000000 0.050661 2
## 4 1.000000 0.938326 0.000000 0.061674 3
## 5 1.000000 0.927313 0.000000 0.072687 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.034039 0.567500 0.766374 0.450576 0.398784 0.396161 1.000000
## 396 0.032176 0.567146 0.766115 0.450218 0.397915 0.395815 1.000000
## 397 0.031255 0.566792 0.765857 0.449861 0.397046 0.395470 1.000000
## 398 0.028624 0.566438 0.765599 0.449505 0.396177 0.395126 1.000000
## 399 0.026161 0.566085 0.765341 0.449149 0.395308 0.394783 1.000000
## 400 0.018994 0.565732 0.765083 0.448794 0.394440 0.394440 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.007174 0.053309 0.007174 0.503587 5
## 396 0.005739 0.047661 0.005739 0.502869 4
## 397 0.004304 0.041257 0.004304 0.502152 3
## 398 0.002869 0.033672 0.002869 0.501435 2
## 399 0.001435 0.023799 0.001435 0.500717 1
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 692 454 0.007174 0.000000 0.992826 1.000000 394
## 396 0 693 454 0.005739 0.000000 0.994261 1.000000 395
## 397 0 694 454 0.004304 0.000000 0.995696 1.000000 396
## 398 0 695 454 0.002869 0.000000 0.997131 1.000000 397
## 399 0 696 454 0.001435 0.000000 0.998565 1.000000 398
## 400 0 697 454 0.000000 0.000000 1.000000 1.000000 399
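The thresholds table above contains everything needed for an ROC curve, so one can be drawn directly from the metrics slot with ggplot2 (a sketch, not in the original post):

# ROC curve on the test set, built from the thresholds table (sketch)
performance_h2o@metrics$thresholds_and_metric_scores %>%
    as_tibble() %>%
    ggplot(aes(x = fpr, y = tpr)) +
    geom_line() +
    geom_abline(linetype = "dashed") +
    labs(
        title = "ROC curve (test set)",
        x     = "False positive rate",
        y     = "True positive rate"
    )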