Goal: Build a classification model to predict whether an email is spam (`yesno`). Click here for data.
library(tidyverse)
library(tidymodels)
library(h2o)
library(correlationfunnel)
# Import the cleaned spam data. h2o requires every variable to be numeric
# or a factor, so each character column is converted to a factor.
data <- mutate(
  readr::read_csv("../00_data/data_wrangled/data_clean_2/data_clean"),
  across(where(is.character), factor)
)
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fix the RNG so the train/test partition is reproducible.
set.seed(1234)

# Split on the outcome column (stratified by yesno), then pull out the
# two partitions.
data_split <- data %>% initial_split(strata = "yesno")
train_tbl <- data_split %>% training()
test_tbl <- data_split %>% testing()
# Preprocessing specification: model yesno on every other column and
# remove zero-variance predictors.
# NOTE(review): recipe_obj is not prepped or applied in the visible part of
# this script (train_tbl is passed to h2o as-is) -- confirm it is used later.
recipe_obj <- step_zv(
  recipe(yesno ~ ., data = train_tbl),
  all_predictors()
)
# Initialize h2o: connect this R session to an H2O cluster
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 23 minutes 23 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 4 months and 18 days
## H2O cluster name: H2O_started_from_R_tomli_fhp551
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.82 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (4 months and 18 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the training table to H2O and split it roughly 85/15 into a
# training frame and a validation frame (seeded for reproducibility).
split.h2o <- h2o.splitFrame(
  data = as.h2o(train_tbl),
  ratios = 0.85,
  seed = 2345
)
## | | | 0% | |======================================================================| 100%
train_h2o <- split.h2o[[1L]]
valid_h2o <- split.h2o[[2L]]

# The held-out test table is uploaded whole.
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
# Outcome column, and every remaining column as a predictor.
y <- "yesno"
x <- setdiff(colnames(train_tbl), y)
# Run AutoML: up to 10 models, 5-fold cross-validation, deep learning
# excluded, fixed seed.
# NOTE(review): test_h2o is used to rank the leaderboard here and is also
# the frame the leader is scored on further down, so that final estimate may
# be optimistic -- confirm this is intended.
models_h2o <- h2o.automl(
  # Columns
  x = x,
  y = y,
  # Frames
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  leaderboard_frame = test_h2o,
  # Search budget and resampling
  # max_runtime_secs = 30,
  max_models = 10,
  nfolds = 5,
  exclude_algos = c("DeepLearning"),
  seed = 3456
)
## | | | 0% | |=== | 4%
## 11:44:07.162: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 11:44:07.178: AutoML: XGBoost is not available; skipping it. | |=============== | 21% | |======================= | 33% | |======================================================================| 100%
Examine the output of h2o.automl
# The AutoML result is an S4 object: check its type, list its slots, and
# print the leaderboard slot.
typeof(models_h2o)
## [1] "S4"
slotNames(models_h2o)
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 GBM_grid_1_AutoML_4_20240509_114407_model_1 0.9195293 0.3175906
## 2 GBM_5_AutoML_4_20240509_114407 0.9188403 0.3159186
## 3 GBM_grid_1_AutoML_4_20240509_114407_model_2 0.9187534 0.3224471
## 4 GBM_4_AutoML_4_20240509_114407 0.9187139 0.3211553
## 5 StackedEnsemble_BestOfFamily_1_AutoML_4_20240509_114407 0.9185417 0.3190287
## 6 StackedEnsemble_AllModels_1_AutoML_4_20240509_114407 0.9185069 0.3143932
## aucpr mean_per_class_error rmse mse
## 1 0.9146886 0.1244588 0.3048919 0.09295905
## 2 0.9156096 0.1277122 0.3053640 0.09324718
## 3 0.9131192 0.1328048 0.3088534 0.09539039
## 4 0.9117697 0.1245394 0.3084129 0.09511850
## 5 0.9146343 0.1262269 0.3031829 0.09191989
## 6 0.9134479 0.1268132 0.3030065 0.09181291
##
## [12 rows x 7 columns]
# Keep the leader model from the AutoML run; the surrounding parentheses
# print it right after the assignment.
(best_model <- models_h2o@leader)
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_grid_1_AutoML_4_20240509_114407_model_1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 56 56 11863 4
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 9 7.01786 8 16 12.12500
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.08633413
## RMSE: 0.2938267
## LogLoss: 0.2918996
## Mean Per-Class Error: 0.1244266
## AUC: 0.9359978
## AUCPR: 0.9301509
## Gini: 0.8719957
## R^2: 0.6389747
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 1657 114 0.064370 =114/1771
## y 214 946 0.184483 =214/1160
## Totals 1871 1060 0.111907 =328/2931
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.460105 0.852252 196
## 2 max f2 0.172882 0.864482 298
## 3 max f0point5 0.744784 0.899737 118
## 4 max accuracy 0.481078 0.888434 189
## 5 max precision 0.993124 1.000000 0
## 6 max recall 0.020360 1.000000 397
## 7 max specificity 0.993124 1.000000 0
## 8 max absolute_mcc 0.588570 0.765409 161
## 9 max min_per_class_accuracy 0.299660 0.866379 244
## 10 max mean_per_class_accuracy 0.398256 0.876259 209
## 11 max tns 0.993124 1771.000000 0
## 12 max fns 0.993124 1154.000000 0
## 13 max fps 0.018317 1771.000000 399
## 14 max tps 0.020360 1160.000000 397
## 15 max tnr 0.993124 1.000000 0
## 16 max fnr 0.993124 0.994828 0
## 17 max fpr 0.018317 1.000000 399
## 18 max tpr 0.020360 1.000000 397
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.1050428
## RMSE: 0.3241031
## LogLoss: 0.3368799
## Mean Per-Class Error: 0.1566033
## AUC: 0.9227701
## AUCPR: 0.9040581
## Gini: 0.8455402
## R^2: 0.5556779
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 305 15 0.046875 =15/320
## y 53 146 0.266332 =53/199
## Totals 358 161 0.131021 =68/519
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.553173 0.811111 148
## 2 max f2 0.133945 0.865116 251
## 3 max f0point5 0.721787 0.870968 131
## 4 max accuracy 0.553173 0.868979 148
## 5 max precision 0.993194 1.000000 0
## 6 max recall 0.052004 1.000000 346
## 7 max specificity 0.993194 1.000000 0
## 8 max absolute_mcc 0.553173 0.721894 148
## 9 max min_per_class_accuracy 0.228198 0.825000 207
## 10 max mean_per_class_accuracy 0.553173 0.843397 148
## 11 max tns 0.993194 320.000000 0
## 12 max fns 0.993194 198.000000 0
## 13 max fps 0.018317 320.000000 356
## 14 max tps 0.052004 199.000000 346
## 15 max tnr 0.993194 1.000000 0
## 16 max fnr 0.993194 0.994975 0
## 17 max fpr 0.018317 1.000000 356
## 18 max tpr 0.052004 1.000000 346
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.09373318
## RMSE: 0.3061587
## LogLoss: 0.3147025
## Mean Per-Class Error: 0.1332391
## AUC: 0.9249299
## AUCPR: 0.9168586
## Gini: 0.8498598
## R^2: 0.608034
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## n y Error Rate
## n 1638 133 0.075099 =133/1771
## y 222 938 0.191379 =222/1160
## Totals 1860 1071 0.121119 =355/2931
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.452768 0.840879 198
## 2 max f2 0.175893 0.855567 294
## 3 max f0point5 0.685544 0.888795 133
## 4 max accuracy 0.553867 0.881269 170
## 5 max precision 0.994669 1.000000 0
## 6 max recall 0.015839 1.000000 399
## 7 max specificity 0.994669 1.000000 0
## 8 max absolute_mcc 0.553867 0.751023 170
## 9 max min_per_class_accuracy 0.291899 0.853448 251
## 10 max mean_per_class_accuracy 0.452768 0.866761 198
## 11 max tns 0.994669 1771.000000 0
## 12 max fns 0.994669 1141.000000 0
## 13 max fps 0.015839 1771.000000 399
## 14 max tps 0.015839 1160.000000 399
## 15 max tnr 0.994669 1.000000 0
## 16 max fnr 0.994669 0.983621 0
## 17 max fpr 0.015839 1.000000 399
## 18 max tpr 0.015839 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.882637 0.013798 0.873935 0.895904 0.899317
## auc 0.924739 0.010715 0.932145 0.923461 0.938504
## err 0.117363 0.013798 0.126065 0.104096 0.100683
## err_count 68.800000 8.105554 74.000000 61.000000 59.000000
## f0point5 0.869741 0.028128 0.834731 0.901961 0.892193
## f1 0.844977 0.018076 0.843220 0.857809 0.866817
## f2 0.822492 0.029111 0.851884 0.817778 0.842845
## lift_top_group 2.526724 0.001928 2.530172 2.525862 2.525862
## logloss 0.315103 0.025279 0.311988 0.306174 0.280152
## max_per_class_error 0.191379 0.040273 0.142241 0.206897 0.172414
## mcc 0.754487 0.028806 0.738154 0.783029 0.788463
## mean_per_class_accuracy 0.869880 0.014271 0.871133 0.878190 0.886957
## mean_per_class_error 0.130120 0.014271 0.128867 0.121810 0.113043
## mse 0.093758 0.009047 0.093147 0.088820 0.081773
## pr_auc 0.916488 0.014438 0.919953 0.919491 0.936460
## precision 0.887694 0.042510 0.829167 0.934010 0.909953
## r2 0.607929 0.037824 0.610301 0.628622 0.658086
## recall 0.808621 0.040273 0.857759 0.793103 0.827586
## rmse 0.305912 0.014842 0.305200 0.298027 0.285961
## specificity 0.931139 0.032065 0.884507 0.963277 0.946328
## cv_4_valid cv_5_valid
## accuracy 0.873720 0.870307
## auc 0.912021 0.917562
## err 0.126280 0.129693
## err_count 74.000000 76.000000
## f0point5 0.849820 0.870000
## f1 0.836283 0.820755
## f2 0.823171 0.776786
## lift_top_group 2.525862 2.525862
## logloss 0.346989 0.330213
## max_per_class_error 0.185345 0.250000
## mcc 0.734309 0.728482
## mean_per_class_accuracy 0.863542 0.849576
## mean_per_class_error 0.136458 0.150424
## mse 0.104150 0.100901
## pr_auc 0.897600 0.908932
## precision 0.859091 0.906250
## r2 0.564524 0.578110
## recall 0.814655 0.750000
## rmse 0.322723 0.317649
## specificity 0.912429 0.949152
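# Optional: save the leader to disk and load it back (left commented out).
# NOTE(review): the model ID below is hard-coded from a different AutoML run
# (AutoML_2_20240509_112755) than the leader printed in this document
# (GBM_grid_1_AutoML_4_20240509_114407_model_1); update it before use.
# h2o.getModel("GBM_grid_1_AutoML_2_20240509_112755_model_1") %>%
# h2o.saveModel("h2o_models/")
#
# best_model <- h2o.loadModel("h2o_models/GBM_grid_1_AutoML_2_20240509_112755_model_1")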
# Score the held-out test frame with the leader model.
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
# Bring the predictions back into R as a tibble. `as_tibble()` replaces
# `as.tibble()`, which was deprecated in tibble 2.0.0.
predictions_tbl <- predictions %>%
  as_tibble()

# Put the predictions next to the original test rows for inspection
# (printed only, not stored).
predictions_tbl %>%
  bind_cols(test_tbl)
## # A tibble: 1,151 × 10
## predict n y crl.tot dollar bang money n000 make yesno
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 y 0.0142 0.986 1028 0.18 0.372 0.43 0.43 0.21 y
## 2 n 0.877 0.123 54 0 0 0 0 0 y
## 3 y 0.107 0.893 2259 0.046 0.25 0 0.05 0.05 y
## 4 y 0.388 0.612 34 0 0.809 0 0 0 y
## 5 y 0.136 0.864 82 0.196 0.392 0 0 0 y
## 6 y 0.262 0.738 47 0 0.368 3.33 0 0 y
## 7 y 0.254 0.746 129 0 0.091 0.65 0 0 y
## 8 y 0.298 0.702 59 0 0.886 0 0 1.17 y
## 9 y 0.167 0.833 89 0.091 0 0.27 0 0 y
## 10 y 0.0230 0.977 239 0.244 0.488 0 0.48 0 y
## # ℹ 1,141 more rows
# Compute performance metrics for the leader on the test frame.
# (Run `?h2o.performance` in an interactive session for the argument
# reference; the help lookup is kept out of the script so rendering does
# not start the help server.)
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)

# The result is an S4 object: check its type, list its slots, and print the
# metrics slot.
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_grid_1_AutoML_4_20240509_114407_model_1"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_grid_1_AutoML_4_20240509_114407_model_1"
##
##
## $model_checksum
## [1] "-874147823862691177"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_b4a1_3"
##
##
## $frame_checksum
## [1] "5361656593914358099"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.71527e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.09295905
##
## $RMSE
## [1] 0.3048919
##
## $nobs
## [1] 1151
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.6108172
##
## $logloss
## [1] 0.3175906
##
## $AUC
## [1] 0.9195293
##
## $pr_auc
## [1] 0.9146886
##
## $Gini
## [1] 0.8390585
##
## $mean_per_class_error
## [1] 0.1244588
##
## $domain
## [1] "n" "y"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## n y Error Rate
## n 654 43 0.0617 = 43 / 697
## y 85 369 0.1872 = 85 / 454
## Totals 739 412 0.1112 = 128 / 1,151
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.993677 0.004396 0.002752 0.010917 0.606429 1.000000 0.002203 1.000000
## 2 0.992708 0.017467 0.010989 0.042553 0.609036 1.000000 0.008811 1.000000
## 3 0.992218 0.055675 0.035539 0.128458 0.616855 1.000000 0.028634 1.000000
## 4 0.991820 0.112266 0.073250 0.240214 0.629018 1.000000 0.059471 1.000000
## 5 0.990642 0.123967 0.081257 0.261324 0.631625 1.000000 0.066079 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.036538 0.002203 0.501101 697 453 0 1
## 2 0.073171 0.008811 0.504405 697 450 0 4
## 3 0.132431 0.028634 0.514317 697 441 0 13
## 4 0.192038 0.059471 0.529736 697 427 0 27
## 5 0.202697 0.066079 0.533040 697 424 0 30
## tnr fnr fpr tpr idx
## 1 1.000000 0.997797 0.000000 0.002203 0
## 2 1.000000 0.991189 0.000000 0.008811 1
## 3 1.000000 0.971366 0.000000 0.028634 2
## 4 1.000000 0.940529 0.000000 0.059471 3
## 5 1.000000 0.933921 0.000000 0.066079 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.023689 0.581186 0.773850 0.465332 0.435274 0.410747 0.993392
## 396 0.021174 0.576726 0.770677 0.460768 0.424848 0.406306 0.993392
## 397 0.020825 0.570886 0.766485 0.454820 0.410947 0.400533 0.993392
## 398 0.020332 0.569085 0.765185 0.452993 0.406603 0.398762 0.993392
## 399 0.019058 0.568025 0.765979 0.451375 0.401390 0.397020 0.997797
## 400 0.018317 0.565732 0.765083 0.448794 0.394440 0.394440 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.071736 0.151871 0.071736 0.532564 50
## 396 0.054519 0.126337 0.054519 0.523956 38
## 397 0.031564 0.083672 0.031564 0.512478 22
## 398 0.024390 0.066510 0.024390 0.508891 17
## 399 0.012912 0.056401 0.012912 0.505355 9
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 3 647 451 0.071736 0.006608 0.928264 0.993392 394
## 396 3 659 451 0.054519 0.006608 0.945481 0.993392 395
## 397 3 675 451 0.031564 0.006608 0.968436 0.993392 396
## 398 3 680 451 0.024390 0.006608 0.975610 0.993392 397
## 399 1 688 453 0.012912 0.002203 0.987088 0.997797 398
## 400 0 697 454 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.477283 0.852194 174
## 2 max f2 0.257556 0.850471 250
## 3 max f0point5 0.676467 0.891167 135
## 4 max accuracy 0.477283 0.888792 174
## 5 max precision 0.993677 1.000000 0
## 6 max recall 0.018317 1.000000 399
## 7 max specificity 0.993677 1.000000 0
## 8 max absolute_mcc 0.477283 0.765704 174
## 9 max min_per_class_accuracy 0.309127 0.856828 227
## 10 max mean_per_class_accuracy 0.477283 0.875541 174
## 11 max tns 0.993677 697.000000 0
## 12 max fns 0.993677 453.000000 0
## 13 max fps 0.018317 697.000000 399
## 14 max tps 0.018317 454.000000 399
## 15 max tnr 0.993677 1.000000 0
## 16 max fnr 0.993677 0.997797 0
## 17 max fpr 0.018317 1.000000 399
## 18 max tpr 0.018317 1.000000 399
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 39.44 %, avg score: 40.23 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01042572 0.992096 2.535242 2.535242
## 2 2 0.02085143 0.991720 2.535242 2.535242
## 3 3 0.03040834 0.989219 2.535242 2.535242
## 4 4 0.04257168 0.987667 2.535242 2.535242
## 5 5 0.05039096 0.987211 2.535242 2.535242
## 6 6 0.10078193 0.980466 2.491531 2.513387
## 7 7 0.15030408 0.967178 2.490764 2.505933
## 8 8 0.20069505 0.939415 2.360398 2.469392
## 9 9 0.30060817 0.740400 2.204559 2.381369
## 10 10 0.40139010 0.349985 1.267621 2.101727
## 11 11 0.50043440 0.175590 0.467018 1.778191
## 12 12 0.60034752 0.124544 0.308638 1.533620
## 13 13 0.70112945 0.098727 0.218555 1.344589
## 14 14 0.80017376 0.070146 0.311346 1.216696
## 15 15 0.93136403 0.031755 0.134317 1.064234
## 16 16 1.00000000 0.018317 0.128367 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.992474 1.000000 0.992474
## 2 1.000000 0.991892 1.000000 0.992183
## 3 1.000000 0.990363 1.000000 0.991611
## 4 1.000000 0.988485 1.000000 0.990718
## 5 1.000000 0.987405 1.000000 0.990204
## 6 0.982759 0.984059 0.991379 0.987131
## 7 0.982456 0.975782 0.988439 0.983392
## 8 0.931034 0.956514 0.974026 0.976643
## 9 0.869565 0.863433 0.939306 0.939016
## 10 0.500000 0.515857 0.829004 0.832768
## 11 0.184211 0.261101 0.701389 0.719626
## 12 0.121739 0.146808 0.604920 0.624294
## 13 0.086207 0.110415 0.530359 0.550428
## 14 0.122807 0.085708 0.479913 0.492906
## 15 0.052980 0.048248 0.419776 0.430272
## 16 0.050633 0.022076 0.394440 0.402255
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.026432 0.026432 153.524229 153.524229
## 2 0.026432 0.052863 153.524229 153.524229
## 3 0.024229 0.077093 153.524229 153.524229
## 4 0.030837 0.107930 153.524229 153.524229
## 5 0.019824 0.127753 153.524229 153.524229
## 6 0.125551 0.253304 149.153122 151.338675
## 7 0.123348 0.376652 149.076436 150.593313
## 8 0.118943 0.495595 136.039799 146.939184
## 9 0.220264 0.715859 120.455851 138.136920
## 10 0.127753 0.843612 26.762115 110.172683
## 11 0.046256 0.889868 -53.298168 77.819077
## 12 0.030837 0.920705 -69.136181 53.361979
## 13 0.022026 0.942731 -78.144463 34.458947
## 14 0.030837 0.973568 -68.865446 21.669608
## 15 0.017621 0.991189 -86.568253 6.423417
## 16 0.008811 1.000000 -87.163330 0.000000
## kolmogorov_smirnov
## 1 0.026432
## 2 0.052863
## 3 0.077093
## 4 0.107930
## 5 0.127753
## 6 0.251869
## 7 0.373783
## 8 0.486986
## 9 0.685730
## 10 0.730269
## 11 0.643096
## 12 0.529026
## 13 0.398972
## 14 0.286337
## 15 0.098793
## 16 0.000000
# Test-set AUC.
performance_h2o %>% h2o.auc()
## [1] 0.9195293
# Test-set confusion matrix.
performance_h2o %>% h2o.confusionMatrix()
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.477283415294237:
## n y Error Rate
## n 654 43 0.061693 =43/697
## y 85 369 0.187225 =85/454
## Totals 739 412 0.111208 =128/1151
# Full table of metrics by classification threshold.
performance_h2o %>% h2o.metric()
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.993677 0.004396 0.002752 0.010917 0.606429 1.000000 0.002203 1.000000
## 2 0.992708 0.017467 0.010989 0.042553 0.609036 1.000000 0.008811 1.000000
## 3 0.992218 0.055675 0.035539 0.128458 0.616855 1.000000 0.028634 1.000000
## 4 0.991820 0.112266 0.073250 0.240214 0.629018 1.000000 0.059471 1.000000
## 5 0.990642 0.123967 0.081257 0.261324 0.631625 1.000000 0.066079 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.036538 0.002203 0.501101 697 453 0 1
## 2 0.073171 0.008811 0.504405 697 450 0 4
## 3 0.132431 0.028634 0.514317 697 441 0 13
## 4 0.192038 0.059471 0.529736 697 427 0 27
## 5 0.202697 0.066079 0.533040 697 424 0 30
## tnr fnr fpr tpr idx
## 1 1.000000 0.997797 0.000000 0.002203 0
## 2 1.000000 0.991189 0.000000 0.008811 1
## 3 1.000000 0.971366 0.000000 0.028634 2
## 4 1.000000 0.940529 0.000000 0.059471 3
## 5 1.000000 0.933921 0.000000 0.066079 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.023689 0.581186 0.773850 0.465332 0.435274 0.410747 0.993392
## 396 0.021174 0.576726 0.770677 0.460768 0.424848 0.406306 0.993392
## 397 0.020825 0.570886 0.766485 0.454820 0.410947 0.400533 0.993392
## 398 0.020332 0.569085 0.765185 0.452993 0.406603 0.398762 0.993392
## 399 0.019058 0.568025 0.765979 0.451375 0.401390 0.397020 0.997797
## 400 0.018317 0.565732 0.765083 0.448794 0.394440 0.394440 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.071736 0.151871 0.071736 0.532564 50
## 396 0.054519 0.126337 0.054519 0.523956 38
## 397 0.031564 0.083672 0.031564 0.512478 22
## 398 0.024390 0.066510 0.024390 0.508891 17
## 399 0.012912 0.056401 0.012912 0.505355 9
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 3 647 451 0.071736 0.006608 0.928264 0.993392 394
## 396 3 659 451 0.054519 0.006608 0.945481 0.993392 395
## 397 3 675 451 0.031564 0.006608 0.968436 0.993392 396
## 398 3 680 451 0.024390 0.006608 0.975610 0.993392 397
## 399 1 688 453 0.012912 0.002203 0.987088 0.997797 398
## 400 0 697 454 0.000000 0.000000 1.000000 1.000000 399