library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ rsample 1.2.1
## ✔ dials 1.4.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.0 ✔ yardstick 1.3.2
## ✔ recipes 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-08-15/spam.csv')
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>% skimr::skim()
Data summary

|                        |           |
|:-----------------------|:----------|
|Name                    |Piped data |
|Number of rows          |4601       |
|Number of columns       |7          |
|Column type frequency:  |           |
|  character             |1          |
|  numeric               |6          |
|Group variables         |None       |

Variable type: character

Variable type: numeric

|skim_variable | n_missing| complete_rate|   mean|     sd| p0| p25| p50|    p75|     p100|hist  |
|:-------------|---------:|-------------:|------:|------:|--:|---:|---:|------:|--------:|:-----|
|crl.tot       |         0|             1| 283.29| 606.35|  1|  35|  95| 266.00| 15841.00|▇▁▁▁▁ |
|dollar        |         0|             1|   0.08|   0.25|  0|   0|   0|   0.05|     6.00|▇▁▁▁▁ |
|bang          |         0|             1|   0.27|   0.82|  0|   0|   0|   0.32|    32.48|▇▁▁▁▁ |
|money         |         0|             1|   0.09|   0.44|  0|   0|   0|   0.00|    12.50|▇▁▁▁▁ |
|n000          |         0|             1|   0.10|   0.35|  0|   0|   0|   0.00|     5.45|▇▁▁▁▁ |
|make          |         0|             1|   0.10|   0.31|  0|   0|   0|   0.00|     4.54|▇▁▁▁▁ |
Clean data
data_clean <- data %>%
  # Convert the character outcome to a factor, with "y" (spam) as the first level
# mutate(across(where(is.character), as.factor)) %>%
mutate(yesno = factor(yesno, levels = c("y", "n")))
data_clean
## # A tibble: 4,601 × 7
## crl.tot dollar bang money n000 make yesno
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 278 0 0.778 0 0 0 y
## 2 1028 0.18 0.372 0.43 0.43 0.21 y
## 3 2259 0.184 0.276 0.06 1.16 0.06 y
## 4 191 0 0.137 0 0 0 y
## 5 191 0 0.135 0 0 0 y
## 6 54 0 0 0 0 0 y
## 7 112 0.054 0.164 0 0 0 y
## 8 49 0 0 0 0 0 y
## 9 1257 0.203 0.181 0.15 0 0.15 y
## 10 749 0.081 0.244 0 0.19 0.06 y
## # ℹ 4,591 more rows
skimr::skim(data_clean)
Data summary

|                        |           |
|:-----------------------|:----------|
|Name                    |data_clean |
|Number of rows          |4601       |
|Number of columns       |7          |
|Column type frequency:  |           |
|  factor                |1          |
|  numeric               |6          |
|Group variables         |None       |

Variable type: factor

|skim_variable | n_missing| complete_rate|ordered | n_unique|top_counts       |
|:-------------|---------:|-------------:|:-------|--------:|:----------------|
|yesno         |         0|             1|FALSE   |        2|n: 2788, y: 1813 |

Variable type: numeric

|skim_variable | n_missing| complete_rate|   mean|     sd| p0| p25| p50|    p75|     p100|hist  |
|:-------------|---------:|-------------:|------:|------:|--:|---:|---:|------:|--------:|:-----|
|crl.tot       |         0|             1| 283.29| 606.35|  1|  35|  95| 266.00| 15841.00|▇▁▁▁▁ |
|dollar        |         0|             1|   0.08|   0.25|  0|   0|   0|   0.05|     6.00|▇▁▁▁▁ |
|bang          |         0|             1|   0.27|   0.82|  0|   0|   0|   0.32|    32.48|▇▁▁▁▁ |
|money         |         0|             1|   0.09|   0.44|  0|   0|   0|   0.00|    12.50|▇▁▁▁▁ |
|n000          |         0|             1|   0.10|   0.35|  0|   0|   0|   0.00|     5.45|▇▁▁▁▁ |
|make          |         0|             1|   0.10|   0.31|  0|   0|   0|   0.00|     4.54|▇▁▁▁▁ |
Split data
set.seed(1234)
data_split <- initial_split(data_clean, strata = "yesno")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
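A quick check, not in the original post, that stratifying on yesno kept the spam/non-spam mix similar in both sets:

# Sanity check: class balance should be roughly equal in train and test
bind_rows(
    train_tbl %>% count(yesno) %>% mutate(set = "train"),
    test_tbl  %>% count(yesno) %>% mutate(set = "test")
) %>%
    group_by(set) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup()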
Recipes
recipe_obj <- recipe(yesno ~ ., data = train_tbl) %>%
step_zv(all_predictors())
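Note that the recipe is only defined here; it is never prepped or baked, so the zero-variance filter is not actually applied before the data is handed to H2O. A minimal sketch of how that step could look (hypothetical, not part of the original workflow):

# Prep the recipe on the training data and bake both sets (sketch)
recipe_prepped  <- prep(recipe_obj, training = train_tbl)
train_processed <- bake(recipe_prepped, new_data = NULL)     # processed training set
test_processed  <- bake(recipe_prepped, new_data = test_tbl)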
Model
# Initialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 2 hours
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 15 days
## H2O cluster name: H2O_started_from_R_aldendimick_ggl822
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.35 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.1 (2024-06-14)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 15 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
y <- "yesno"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
# max_runtime_secs = 30,
max_models = 10,
exclude_algos = "DeepLearning",
nfolds = 5,
seed = 3456
)
## 12:08:18.169: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 12:08:18.171: AutoML: XGBoost is not available; skipping it.
Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 GBM_grid_1_AutoML_12_20250506_120818_model_1 0.9195293 0.3175906
## 2 GBM_5_AutoML_12_20250506_120818 0.9188403 0.3159186
## 3 GBM_grid_1_AutoML_12_20250506_120818_model_2 0.9187534 0.3224471
## 4 GBM_4_AutoML_12_20250506_120818 0.9187139 0.3211553
## 5 StackedEnsemble_BestOfFamily_1_AutoML_12_20250506_120818 0.9185417 0.3190287
## 6 StackedEnsemble_AllModels_1_AutoML_12_20250506_120818 0.9185069 0.3143932
## aucpr mean_per_class_error rmse mse
## 1 0.9146886 0.1244588 0.3048919 0.09295905
## 2 0.9156096 0.1277122 0.3053640 0.09324718
## 3 0.9131192 0.1328048 0.3088534 0.09539039
## 4 0.9117697 0.1245394 0.3084129 0.09511850
## 5 0.9146343 0.1262269 0.3031829 0.09191989
## 6 0.9134479 0.1268132 0.3030065 0.09181291
##
## [12 rows x 7 columns]
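The printed leaderboard is truncated. To pull the full leaderboard, with any extra columns available, into a tibble for easier sorting and filtering (a sketch using h2o's leaderboard helper):

# Full leaderboard as a tibble (sketch)
h2o.get_leaderboard(models_h2o, extra_columns = "ALL") %>%
    as_tibble()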
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_grid_1_AutoML_12_20250506_120818_model_1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 56 56 11862 4
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 9 7.01786 8 16 12.12500
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.08633413
## RMSE: 0.2938267
## LogLoss: 0.2918996
## Mean Per-Class Error: 0.1244266
## AUC: 0.9359978
## AUCPR: 0.9301509
## Gini: 0.8719957
## R^2: 0.6389747
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 1657 114 0.064370 =114/1771
## y 214 946 0.184483 =214/1160
## Totals 1871 1060 0.111907 =328/2931
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.460105 0.852252 196
## 2 max f2 0.172882 0.864482 298
## 3 max f0point5 0.744784 0.899737 118
## 4 max accuracy 0.481078 0.888434 189
## 5 max precision 0.993124 1.000000 0
## 6 max recall 0.020360 1.000000 397
## 7 max specificity 0.993124 1.000000 0
## 8 max absolute_mcc 0.588570 0.765409 161
## 9 max min_per_class_accuracy 0.299660 0.866379 244
## 10 max mean_per_class_accuracy 0.398256 0.876259 209
## 11 max tns 0.993124 1771.000000 0
## 12 max fns 0.993124 1154.000000 0
## 13 max fps 0.018317 1771.000000 399
## 14 max tps 0.020360 1160.000000 397
## 15 max tnr 0.993124 1.000000 0
## 16 max fnr 0.993124 0.994828 0
## 17 max fpr 0.018317 1.000000 399
## 18 max tpr 0.020360 1.000000 397
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.1050428
## RMSE: 0.3241031
## LogLoss: 0.3368799
## Mean Per-Class Error: 0.1566033
## AUC: 0.9227701
## AUCPR: 0.9040581
## Gini: 0.8455402
## R^2: 0.5556779
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 305 15 0.046875 =15/320
## y 53 146 0.266332 =53/199
## Totals 358 161 0.131021 =68/519
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.553173 0.811111 148
## 2 max f2 0.133945 0.865116 251
## 3 max f0point5 0.721787 0.870968 131
## 4 max accuracy 0.553173 0.868979 148
## 5 max precision 0.993194 1.000000 0
## 6 max recall 0.052004 1.000000 346
## 7 max specificity 0.993194 1.000000 0
## 8 max absolute_mcc 0.553173 0.721894 148
## 9 max min_per_class_accuracy 0.228198 0.825000 207
## 10 max mean_per_class_accuracy 0.553173 0.843397 148
## 11 max tns 0.993194 320.000000 0
## 12 max fns 0.993194 198.000000 0
## 13 max fps 0.018317 320.000000 356
## 14 max tps 0.052004 199.000000 346
## 15 max tnr 0.993194 1.000000 0
## 16 max fnr 0.993194 0.994975 0
## 17 max fpr 0.018317 1.000000 356
## 18 max tpr 0.052004 1.000000 346
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.09373318
## RMSE: 0.3061587
## LogLoss: 0.3147025
## Mean Per-Class Error: 0.1332391
## AUC: 0.9249299
## AUCPR: 0.9168586
## Gini: 0.8498598
## R^2: 0.608034
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 1638 133 0.075099 =133/1771
## y 222 938 0.191379 =222/1160
## Totals 1860 1071 0.121119 =355/2931
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.452768 0.840879 198
## 2 max f2 0.175893 0.855567 294
## 3 max f0point5 0.685544 0.888795 133
## 4 max accuracy 0.553867 0.881269 170
## 5 max precision 0.994669 1.000000 0
## 6 max recall 0.015839 1.000000 399
## 7 max specificity 0.994669 1.000000 0
## 8 max absolute_mcc 0.553867 0.751023 170
## 9 max min_per_class_accuracy 0.291899 0.853448 251
## 10 max mean_per_class_accuracy 0.452768 0.866761 198
## 11 max tns 0.994669 1771.000000 0
## 12 max fns 0.994669 1141.000000 0
## 13 max fps 0.015839 1771.000000 399
## 14 max tps 0.015839 1160.000000 399
## 15 max tnr 0.994669 1.000000 0
## 16 max fnr 0.994669 0.983621 0
## 17 max fpr 0.015839 1.000000 399
## 18 max tpr 0.015839 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.882637 0.013798 0.873935 0.895904 0.899317
## auc 0.924739 0.010715 0.932145 0.923461 0.938504
## err 0.117363 0.013798 0.126065 0.104096 0.100683
## err_count 68.800000 8.105554 74.000000 61.000000 59.000000
## f0point5 0.869741 0.028128 0.834731 0.901961 0.892193
## f1 0.844977 0.018076 0.843220 0.857809 0.866817
## f2 0.822492 0.029111 0.851884 0.817778 0.842845
## lift_top_group 2.526724 0.001928 2.530172 2.525862 2.525862
## logloss 0.315103 0.025279 0.311988 0.306174 0.280152
## max_per_class_error 0.191379 0.040273 0.142241 0.206897 0.172414
## mcc 0.754487 0.028806 0.738154 0.783029 0.788463
## mean_per_class_accuracy 0.869880 0.014271 0.871133 0.878190 0.886957
## mean_per_class_error 0.130120 0.014271 0.128867 0.121810 0.113043
## mse 0.093758 0.009047 0.093147 0.088820 0.081773
## pr_auc 0.916488 0.014438 0.919953 0.919491 0.936460
## precision 0.887694 0.042510 0.829167 0.934010 0.909953
## r2 0.607929 0.037824 0.610301 0.628622 0.658086
## recall 0.808621 0.040273 0.857759 0.793103 0.827586
## rmse 0.305912 0.014842 0.305200 0.298027 0.285961
## specificity 0.931139 0.032065 0.884507 0.963277 0.946328
## cv_4_valid cv_5_valid
## accuracy 0.873720 0.870307
## auc 0.912021 0.917562
## err 0.126280 0.129693
## err_count 74.000000 76.000000
## f0point5 0.849820 0.870000
## f1 0.836283 0.820755
## f2 0.823171 0.776786
## lift_top_group 2.525862 2.525862
## logloss 0.346989 0.330213
## max_per_class_error 0.185345 0.250000
## mcc 0.734309 0.728482
## mean_per_class_accuracy 0.863542 0.849576
## mean_per_class_error 0.136458 0.150424
## mse 0.104150 0.100901
## pr_auc 0.897600 0.908932
## precision 0.859091 0.906250
## r2 0.564524 0.578110
## recall 0.814655 0.750000
## rmse 0.322723 0.317649
## specificity 0.912429 0.949152
Save and Load
?h2o.getModel
?h2o.saveModel
?h2o.loadModel
# h2o.getModel("GBM_grid_1_AutoML_5_20250506_112018_model_9") %>%
# h2o.saveModel("h2o_models2/")
best_model <- h2o.loadModel("h2o_models2/GBM_grid_1_AutoML_5_20250506_112018_model_9")
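The path above points to a model saved from an earlier AutoML run. To persist the leader of the current run to the same directory, the save step would look roughly like this (a sketch; force = TRUE overwrites an existing file of the same name):

# Save the current AutoML leader to disk (sketch)
models_h2o@leader %>%
    h2o.saveModel(path = "h2o_models2/", force = TRUE)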
Make predictions
predictions <- h2o.predict(best_model, newdata = test_h2o)
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 1,151 × 10
## predict n y crl.tot dollar bang money n000 make yesno
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 y 0.0103 0.990 1028 0.18 0.372 0.43 0.43 0.21 y
## 2 n 0.901 0.0988 54 0 0 0 0 0 y
## 3 y 0.237 0.763 2259 0.046 0.25 0 0.05 0.05 y
## 4 y 0.352 0.648 34 0 0.809 0 0 0 y
## 5 y 0.0783 0.922 82 0.196 0.392 0 0 0 y
## 6 y 0.134 0.866 47 0 0.368 3.33 0 0 y
## 7 y 0.308 0.692 129 0 0.091 0.65 0 0 y
## 8 y 0.267 0.733 59 0 0.886 0 0 1.17 y
## 9 y 0.375 0.625 89 0.091 0 0.27 0 0 y
## 10 y 0.0671 0.933 239 0.244 0.488 0 0.48 0 y
## # ℹ 1,141 more rows
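Because the predictions are already bound to the test set, the same tibble can be scored with yardstick as a cross-check on the H2O metrics below. A sketch (the predict column is re-leveled to match yesno so that "y" is treated as the event):

eval_tbl <- predictions_tbl %>%
    bind_cols(test_tbl) %>%
    mutate(predict = factor(predict, levels = levels(yesno)))

eval_tbl %>% accuracy(truth = yesno, estimate = predict)
eval_tbl %>% roc_auc(truth = yesno, y)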
Evaluate model
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_grid_1_AutoML_5_20250506_112018_model_9"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_grid_1_AutoML_5_20250506_112018_model_9"
##
##
## $model_checksum
## [1] "-7639920851749533421"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_9edd_3"
##
##
## $frame_checksum
## [1] "5361656593914358099"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.746548e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.09336355
##
## $RMSE
## [1] 0.3055545
##
## $nobs
## [1] 1151
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.6091237
##
## $logloss
## [1] 0.3140552
##
## $AUC
## [1] 0.9216324
##
## $pr_auc
## [1] 0.9185834
##
## $Gini
## [1] 0.8432647
##
## $mean_per_class_error
## [1] 0.1260452
##
## $domain
## [1] "n" "y"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## n y Error Rate
## n 661 36 0.0516 = 36 / 697
## y 91 363 0.2004 = 91 / 454
## Totals 752 399 0.1103 = 127 / 1,151
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.998490 0.059829 0.038251 0.137255 0.617724 1.000000 0.030837 1.000000
## 2 0.998061 0.080338 0.051771 0.179245 0.622068 1.000000 0.041850 1.000000
## 3 0.997566 0.096436 0.062534 0.210623 0.625543 1.000000 0.050661 1.000000
## 4 0.997085 0.116183 0.075922 0.247350 0.629887 1.000000 0.061674 1.000000
## 5 0.996452 0.135524 0.089237 0.281570 0.634231 1.000000 0.072687 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.137490 0.030837 0.515419 697 440 0 14
## 2 0.160525 0.041850 0.520925 697 435 0 19
## 3 0.176929 0.050661 0.525330 697 431 0 23
## 4 0.195649 0.061674 0.530837 697 426 0 28
## 5 0.212875 0.072687 0.536344 697 421 0 33
## tnr fnr fpr tpr idx
## 1 1.000000 0.969163 0.000000 0.030837 0
## 2 1.000000 0.958150 0.000000 0.041850 1
## 3 1.000000 0.949339 0.000000 0.050661 2
## 4 1.000000 0.938326 0.000000 0.061674 3
## 5 1.000000 0.927313 0.000000 0.072687 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.034039 0.567500 0.766374 0.450576 0.398784 0.396161 1.000000
## 396 0.032176 0.567146 0.766115 0.450218 0.397915 0.395815 1.000000
## 397 0.031255 0.566792 0.765857 0.449861 0.397046 0.395470 1.000000
## 398 0.028624 0.566438 0.765599 0.449505 0.396177 0.395126 1.000000
## 399 0.026161 0.566085 0.765341 0.449149 0.395308 0.394783 1.000000
## 400 0.018994 0.565732 0.765083 0.448794 0.394440 0.394440 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.007174 0.053309 0.007174 0.503587 5
## 396 0.005739 0.047661 0.005739 0.502869 4
## 397 0.004304 0.041257 0.004304 0.502152 3
## 398 0.002869 0.033672 0.002869 0.501435 2
## 399 0.001435 0.023799 0.001435 0.500717 1
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 692 454 0.007174 0.000000 0.992826 1.000000 394
## 396 0 693 454 0.005739 0.000000 0.994261 1.000000 395
## 397 0 694 454 0.004304 0.000000 0.995696 1.000000 396
## 398 0 695 454 0.002869 0.000000 0.997131 1.000000 397
## 399 0 696 454 0.001435 0.000000 0.998565 1.000000 398
## 400 0 697 454 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.536846 0.851114 181
## 2 max f2 0.254687 0.846843 266
## 3 max f0point5 0.583559 0.893756 168
## 4 max accuracy 0.570686 0.890530 171
## 5 max precision 0.998490 1.000000 0
## 6 max recall 0.039450 1.000000 393
## 7 max specificity 0.998490 1.000000 0
## 8 max absolute_mcc 0.570686 0.770944 171
## 9 max min_per_class_accuracy 0.318139 0.852423 239
## 10 max mean_per_class_accuracy 0.536846 0.873955 181
## 11 max tns 0.998490 697.000000 0
## 12 max fns 0.998490 440.000000 0
## 13 max fps 0.018994 697.000000 399
## 14 max tps 0.039450 454.000000 393
## 15 max tnr 0.998490 1.000000 0
## 16 max fnr 0.998490 0.969163 0
## 17 max fpr 0.018994 1.000000 399
## 18 max tpr 0.039450 1.000000 393
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 39.44 %, avg score: 40.43 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01042572 0.998308 2.535242 2.535242
## 2 2 0.02085143 0.997181 2.535242 2.535242
## 3 3 0.03040834 0.996217 2.535242 2.535242
## 4 4 0.04083406 0.995525 2.535242 2.535242
## 5 5 0.05039096 0.993871 2.535242 2.535242
## 6 6 0.10078193 0.984868 2.491531 2.513387
## 7 7 0.15030408 0.968155 2.535242 2.520588
## 8 8 0.20069505 0.930470 2.491531 2.513292
## 9 9 0.30060817 0.709669 2.094331 2.374042
## 10 10 0.40139010 0.358650 1.202055 2.079777
## 11 11 0.50043440 0.182334 0.511496 1.769388
## 12 12 0.60642919 0.136069 0.394833 1.529136
## 13 13 0.70634231 0.103549 0.242501 1.347140
## 14 14 0.80017376 0.068769 0.187796 1.211191
## 15 15 0.90790617 0.039689 0.184010 1.089305
## 16 16 1.00000000 0.018994 0.119587 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.998524 1.000000 0.998524
## 2 1.000000 0.997861 1.000000 0.998192
## 3 1.000000 0.996634 1.000000 0.997703
## 4 1.000000 0.995912 1.000000 0.997245
## 5 1.000000 0.994906 1.000000 0.996802
## 6 0.982759 0.989673 0.991379 0.993237
## 7 1.000000 0.977650 0.994220 0.988102
## 8 0.982759 0.950777 0.991342 0.978730
## 9 0.826087 0.841582 0.936416 0.933146
## 10 0.474138 0.521028 0.820346 0.829671
## 11 0.201754 0.267275 0.697917 0.718363
## 12 0.155738 0.145150 0.603152 0.618174
## 13 0.095652 0.116912 0.531365 0.547270
## 14 0.074074 0.088899 0.477742 0.493520
## 15 0.072581 0.054050 0.429665 0.441372
## 16 0.047170 0.038899 0.394440 0.404307
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.026432 0.026432 153.524229 153.524229
## 2 0.026432 0.052863 153.524229 153.524229
## 3 0.024229 0.077093 153.524229 153.524229
## 4 0.026432 0.103524 153.524229 153.524229
## 5 0.024229 0.127753 153.524229 153.524229
## 6 0.125551 0.253304 149.153122 151.338675
## 7 0.125551 0.378855 153.524229 152.058771
## 8 0.125551 0.504405 149.153122 151.329214
## 9 0.209251 0.713656 109.433059 137.404191
## 10 0.121145 0.834802 20.205453 107.977668
## 11 0.050661 0.885463 -48.850375 76.938785
## 12 0.041850 0.927313 -60.516718 52.913611
## 13 0.024229 0.951542 -75.749856 34.713982
## 14 0.017621 0.969163 -81.220427 21.119067
## 15 0.019824 0.988987 -81.599048 8.930506
## 16 0.011013 1.000000 -88.041310 0.000000
## kolmogorov_smirnov
## 1 0.026432
## 2 0.052863
## 3 0.077093
## 4 0.103524
## 5 0.127753
## 6 0.251869
## 7 0.377420
## 8 0.501536
## 9 0.682093
## 10 0.715720
## 11 0.635821
## 12 0.529895
## 13 0.404913
## 14 0.279063
## 15 0.133894
## 16 0.000000
h2o.auc(performance_h2o)
## [1] 0.9216324
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.536845881880662:
## n y Error Rate
## n 661 36 0.051650 =36/697
## y 91 363 0.200441 =91/454
## Totals 752 399 0.110339 =127/1151
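The confusion matrix above is computed at the F1-optimal threshold (about 0.54). If a different cutoff is needed, say a fixed 0.5, the matrix can be recomputed at that threshold (a sketch):

# Confusion matrix at a fixed 0.5 cutoff instead of the F1-optimal threshold (sketch)
h2o.confusionMatrix(performance_h2o, thresholds = 0.5)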
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.998490 0.059829 0.038251 0.137255 0.617724 1.000000 0.030837 1.000000
## 2 0.998061 0.080338 0.051771 0.179245 0.622068 1.000000 0.041850 1.000000
## 3 0.997566 0.096436 0.062534 0.210623 0.625543 1.000000 0.050661 1.000000
## 4 0.997085 0.116183 0.075922 0.247350 0.629887 1.000000 0.061674 1.000000
## 5 0.996452 0.135524 0.089237 0.281570 0.634231 1.000000 0.072687 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.137490 0.030837 0.515419 697 440 0 14
## 2 0.160525 0.041850 0.520925 697 435 0 19
## 3 0.176929 0.050661 0.525330 697 431 0 23
## 4 0.195649 0.061674 0.530837 697 426 0 28
## 5 0.212875 0.072687 0.536344 697 421 0 33
## tnr fnr fpr tpr idx
## 1 1.000000 0.969163 0.000000 0.030837 0
## 2 1.000000 0.958150 0.000000 0.041850 1
## 3 1.000000 0.949339 0.000000 0.050661 2
## 4 1.000000 0.938326 0.000000 0.061674 3
## 5 1.000000 0.927313 0.000000 0.072687 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.034039 0.567500 0.766374 0.450576 0.398784 0.396161 1.000000
## 396 0.032176 0.567146 0.766115 0.450218 0.397915 0.395815 1.000000
## 397 0.031255 0.566792 0.765857 0.449861 0.397046 0.395470 1.000000
## 398 0.028624 0.566438 0.765599 0.449505 0.396177 0.395126 1.000000
## 399 0.026161 0.566085 0.765341 0.449149 0.395308 0.394783 1.000000
## 400 0.018994 0.565732 0.765083 0.448794 0.394440 0.394440 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.007174 0.053309 0.007174 0.503587 5
## 396 0.005739 0.047661 0.005739 0.502869 4
## 397 0.004304 0.041257 0.004304 0.502152 3
## 398 0.002869 0.033672 0.002869 0.501435 2
## 399 0.001435 0.023799 0.001435 0.500717 1
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 692 454 0.007174 0.000000 0.992826 1.000000 394
## 396 0 693 454 0.005739 0.000000 0.994261 1.000000 395
## 397 0 694 454 0.004304 0.000000 0.995696 1.000000 396
## 398 0 695 454 0.002869 0.000000 0.997131 1.000000 397
## 399 0 696 454 0.001435 0.000000 0.998565 1.000000 398
## 400 0 697 454 0.000000 0.000000 1.000000 1.000000 399
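The thresholds table above contains everything needed for an ROC curve, so one can be drawn directly from the metrics slot with ggplot2 (a sketch, not in the original post):

# ROC curve on the test set, built from the thresholds table (sketch)
performance_h2o@metrics$thresholds_and_metric_scores %>%
    as_tibble() %>%
    ggplot(aes(x = fpr, y = tpr)) +
    geom_line() +
    geom_abline(linetype = "dashed") +
    labs(
        title = "ROC curve (test set)",
        x     = "False positive rate",
        y     = "True positive rate"
    )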