Goal: Build a classification model to predict whether an email is spam (yesno). Click here for the data.

Set up

Import data

library(tidyverse)
library(tidymodels)          # initial_split(), recipe(), step_zv()
library(h2o)                 # h2o.init(), h2o.automl()
library(correlationfunnel)

data <- readr::read_csv("../00_data/data_wrangled/data_clean_2/data_clean") %>%

    # h2o requires all variables to be either numeric or factors
    mutate(across(where(is.character), factor))
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
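
A quick sanity check before splitting, as a sketch: confirm that yesno came through as a factor and look at the class balance.

# Inspect outcome levels and class balance
data %>%
    count(yesno) %>%
    mutate(prop = n / sum(n))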

Split data

set.seed(1234)

data_split <- initial_split(data, strata = "yesno")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
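
Because the split is stratified on yesno, the spam proportion should be roughly equal in both sets. A small sketch to verify:

# Compare class proportions across the training and test sets
bind_rows(
    train = count(train_tbl, yesno),
    test  = count(test_tbl, yesno),
    .id   = "set"
) %>%
    group_by(set) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup()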

Recipes

recipe_obj <- recipe(yesno ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors()) 
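
The recipe is not applied downstream in this post (the raw train_tbl is handed to h2o as-is), but if the preprocessed data is needed, the standard prep()/bake() workflow applies. A sketch:

# Estimate the recipe on the training data, then apply it
recipe_prepped <- prep(recipe_obj)

train_prepped_tbl <- bake(recipe_prepped, new_data = NULL)     # processed training set
test_prepped_tbl  <- bake(recipe_prepped, new_data = test_tbl) # same steps applied to the test set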

Model

# Initialize h2o
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         23 minutes 23 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    4 months and 18 days 
##     H2O cluster name:           H2O_started_from_R_tomli_fhp551 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.82 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (4 months and 18 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o  <- as.h2o(test_tbl)
y <- "yesno"
x <- setdiff(names(train_tbl), y)

models_h2o <- h2o.automl(
    x = x,
    y = y, 
    training_frame    = train_h2o,
    validation_frame  = valid_h2o, 
    leaderboard_frame = test_h2o, 
    # max_runtime_secs  = 30, 
    max_models        = 10, 
    exclude_algos     = "DeepLearning",
    nfolds            = 5, 
    seed              = 3456   
)
## 11:44:07.162: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 11:44:07.178: AutoML: XGBoost is not available; skipping it.

Examine the output of h2o.automl

models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
##                                                  model_id       auc   logloss
## 1             GBM_grid_1_AutoML_4_20240509_114407_model_1 0.9195293 0.3175906
## 2                          GBM_5_AutoML_4_20240509_114407 0.9188403 0.3159186
## 3             GBM_grid_1_AutoML_4_20240509_114407_model_2 0.9187534 0.3224471
## 4                          GBM_4_AutoML_4_20240509_114407 0.9187139 0.3211553
## 5 StackedEnsemble_BestOfFamily_1_AutoML_4_20240509_114407 0.9185417 0.3190287
## 6    StackedEnsemble_AllModels_1_AutoML_4_20240509_114407 0.9185069 0.3143932
##       aucpr mean_per_class_error      rmse        mse
## 1 0.9146886            0.1244588 0.3048919 0.09295905
## 2 0.9156096            0.1277122 0.3053640 0.09324718
## 3 0.9131192            0.1328048 0.3088534 0.09539039
## 4 0.9117697            0.1245394 0.3084129 0.09511850
## 5 0.9146343            0.1262269 0.3031829 0.09191989
## 6 0.9134479            0.1268132 0.3030065 0.09181291
## 
## [12 rows x 7 columns]
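
The leaderboard is an H2OFrame; pulling it into R makes it easier to filter and sort. A sketch using the metric columns shown above:

# Convert the leaderboard to a tibble and show the top models by AUC
leaderboard_tbl <- models_h2o@leaderboard %>%
    as.data.frame() %>%
    as_tibble()

leaderboard_tbl %>%
    arrange(desc(auc)) %>%
    slice_head(n = 5)
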
best_model <- models_h2o@leader
best_model 
## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_grid_1_AutoML_4_20240509_114407_model_1 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              56                       56               11863         4
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         9    7.01786          8         16    12.12500
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.08633413
## RMSE:  0.2938267
## LogLoss:  0.2918996
## Mean Per-Class Error:  0.1244266
## AUC:  0.9359978
## AUCPR:  0.9301509
## Gini:  0.8719957
## R^2:  0.6389747
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           n    y    Error       Rate
## n      1657  114 0.064370  =114/1771
## y       214  946 0.184483  =214/1160
## Totals 1871 1060 0.111907  =328/2931
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.460105    0.852252 196
## 2                       max f2  0.172882    0.864482 298
## 3                 max f0point5  0.744784    0.899737 118
## 4                 max accuracy  0.481078    0.888434 189
## 5                max precision  0.993124    1.000000   0
## 6                   max recall  0.020360    1.000000 397
## 7              max specificity  0.993124    1.000000   0
## 8             max absolute_mcc  0.588570    0.765409 161
## 9   max min_per_class_accuracy  0.299660    0.866379 244
## 10 max mean_per_class_accuracy  0.398256    0.876259 209
## 11                     max tns  0.993124 1771.000000   0
## 12                     max fns  0.993124 1154.000000   0
## 13                     max fps  0.018317 1771.000000 399
## 14                     max tps  0.020360 1160.000000 397
## 15                     max tnr  0.993124    1.000000   0
## 16                     max fnr  0.993124    0.994828   0
## 17                     max fpr  0.018317    1.000000 399
## 18                     max tpr  0.020360    1.000000 397
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  0.1050428
## RMSE:  0.3241031
## LogLoss:  0.3368799
## Mean Per-Class Error:  0.1566033
## AUC:  0.9227701
## AUCPR:  0.9040581
## Gini:  0.8455402
## R^2:  0.5556779
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          n   y    Error     Rate
## n      305  15 0.046875  =15/320
## y       53 146 0.266332  =53/199
## Totals 358 161 0.131021  =68/519
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.553173   0.811111 148
## 2                       max f2  0.133945   0.865116 251
## 3                 max f0point5  0.721787   0.870968 131
## 4                 max accuracy  0.553173   0.868979 148
## 5                max precision  0.993194   1.000000   0
## 6                   max recall  0.052004   1.000000 346
## 7              max specificity  0.993194   1.000000   0
## 8             max absolute_mcc  0.553173   0.721894 148
## 9   max min_per_class_accuracy  0.228198   0.825000 207
## 10 max mean_per_class_accuracy  0.553173   0.843397 148
## 11                     max tns  0.993194 320.000000   0
## 12                     max fns  0.993194 198.000000   0
## 13                     max fps  0.018317 320.000000 356
## 14                     max tps  0.052004 199.000000 346
## 15                     max tnr  0.993194   1.000000   0
## 16                     max fnr  0.993194   0.994975   0
## 17                     max fpr  0.018317   1.000000 356
## 18                     max tpr  0.052004   1.000000 346
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.09373318
## RMSE:  0.3061587
## LogLoss:  0.3147025
## Mean Per-Class Error:  0.1332391
## AUC:  0.9249299
## AUCPR:  0.9168586
## Gini:  0.8498598
## R^2:  0.608034
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           n    y    Error       Rate
## n      1638  133 0.075099  =133/1771
## y       222  938 0.191379  =222/1160
## Totals 1860 1071 0.121119  =355/2931
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.452768    0.840879 198
## 2                       max f2  0.175893    0.855567 294
## 3                 max f0point5  0.685544    0.888795 133
## 4                 max accuracy  0.553867    0.881269 170
## 5                max precision  0.994669    1.000000   0
## 6                   max recall  0.015839    1.000000 399
## 7              max specificity  0.994669    1.000000   0
## 8             max absolute_mcc  0.553867    0.751023 170
## 9   max min_per_class_accuracy  0.291899    0.853448 251
## 10 max mean_per_class_accuracy  0.452768    0.866761 198
## 11                     max tns  0.994669 1771.000000   0
## 12                     max fns  0.994669 1141.000000   0
## 13                     max fps  0.015839 1771.000000 399
## 14                     max tps  0.015839 1160.000000 399
## 15                     max tnr  0.994669    1.000000   0
## 16                     max fnr  0.994669    0.983621   0
## 17                     max fpr  0.015839    1.000000 399
## 18                     max tpr  0.015839    1.000000 399
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                              mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                 0.882637 0.013798   0.873935   0.895904   0.899317
## auc                      0.924739 0.010715   0.932145   0.923461   0.938504
## err                      0.117363 0.013798   0.126065   0.104096   0.100683
## err_count               68.800000 8.105554  74.000000  61.000000  59.000000
## f0point5                 0.869741 0.028128   0.834731   0.901961   0.892193
## f1                       0.844977 0.018076   0.843220   0.857809   0.866817
## f2                       0.822492 0.029111   0.851884   0.817778   0.842845
## lift_top_group           2.526724 0.001928   2.530172   2.525862   2.525862
## logloss                  0.315103 0.025279   0.311988   0.306174   0.280152
## max_per_class_error      0.191379 0.040273   0.142241   0.206897   0.172414
## mcc                      0.754487 0.028806   0.738154   0.783029   0.788463
## mean_per_class_accuracy  0.869880 0.014271   0.871133   0.878190   0.886957
## mean_per_class_error     0.130120 0.014271   0.128867   0.121810   0.113043
## mse                      0.093758 0.009047   0.093147   0.088820   0.081773
## pr_auc                   0.916488 0.014438   0.919953   0.919491   0.936460
## precision                0.887694 0.042510   0.829167   0.934010   0.909953
## r2                       0.607929 0.037824   0.610301   0.628622   0.658086
## recall                   0.808621 0.040273   0.857759   0.793103   0.827586
## rmse                     0.305912 0.014842   0.305200   0.298027   0.285961
## specificity              0.931139 0.032065   0.884507   0.963277   0.946328
##                         cv_4_valid cv_5_valid
## accuracy                  0.873720   0.870307
## auc                       0.912021   0.917562
## err                       0.126280   0.129693
## err_count                74.000000  76.000000
## f0point5                  0.849820   0.870000
## f1                        0.836283   0.820755
## f2                        0.823171   0.776786
## lift_top_group            2.525862   2.525862
## logloss                   0.346989   0.330213
## max_per_class_error       0.185345   0.250000
## mcc                       0.734309   0.728482
## mean_per_class_accuracy   0.863542   0.849576
## mean_per_class_error      0.136458   0.150424
## mse                       0.104150   0.100901
## pr_auc                    0.897600   0.908932
## precision                 0.859091   0.906250
## r2                        0.564524   0.578110
## recall                    0.814655   0.750000
## rmse                      0.322723   0.317649
## specificity               0.912429   0.949152

Save and Load

# h2o.getModel("GBM_grid_1_AutoML_2_20240509_112755_model_1") %>%
#       h2o.saveModel("h2o_models/")
# 
# best_model <- h2o.loadModel("h2o_models/GBM_grid_1_AutoML_2_20240509_112755_model_1")
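
A variant that avoids hard-coding the model ID: h2o.saveModel() returns the path it wrote to, which can be passed straight back to h2o.loadModel(). A sketch (paths illustrative, left commented out like the code above so re-knitting does not re-save):

# model_path <- h2o.saveModel(best_model, path = "h2o_models/")
# best_model <- h2o.loadModel(model_path)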

Make predictions

predictions <- h2o.predict(best_model, newdata = test_h2o)
predictions_tbl <- predictions %>%
    as_tibble()
predictions_tbl %>%
    bind_cols(test_tbl)
## # A tibble: 1,151 × 10
##    predict      n     y crl.tot dollar  bang money  n000  make yesno
##    <fct>    <dbl> <dbl>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
##  1 y       0.0142 0.986    1028  0.18  0.372  0.43  0.43  0.21 y    
##  2 n       0.877  0.123      54  0     0      0     0     0    y    
##  3 y       0.107  0.893    2259  0.046 0.25   0     0.05  0.05 y    
##  4 y       0.388  0.612      34  0     0.809  0     0     0    y    
##  5 y       0.136  0.864      82  0.196 0.392  0     0     0    y    
##  6 y       0.262  0.738      47  0     0.368  3.33  0     0    y    
##  7 y       0.254  0.746     129  0     0.091  0.65  0     0    y    
##  8 y       0.298  0.702      59  0     0.886  0     0     1.17 y    
##  9 y       0.167  0.833      89  0.091 0      0.27  0     0    y    
## 10 y       0.0230 0.977     239  0.244 0.488  0     0.48  0    y    
## # ℹ 1,141 more rows
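
From the bound tibble, a quick holdout accuracy check is a one-liner; a sketch (h2o's own performance metrics follow in the next section):

# Share of test emails where the predicted class matches the actual label
predictions_tbl %>%
    bind_cols(test_tbl) %>%
    summarise(accuracy = mean(as.character(predict) == as.character(yesno)))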

Evaluate model

?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GBM_grid_1_AutoML_4_20240509_114407_model_1"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GBM_grid_1_AutoML_4_20240509_114407_model_1"
## 
## 
## $model_checksum
## [1] "-874147823862691177"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_b4a1_3"
## 
## 
## $frame_checksum
## [1] "5361656593914358099"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.71527e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.09295905
## 
## $RMSE
## [1] 0.3048919
## 
## $nobs
## [1] 1151
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.6108172
## 
## $logloss
## [1] 0.3175906
## 
## $AUC
## [1] 0.9195293
## 
## $pr_auc
## [1] 0.9146886
## 
## $Gini
## [1] 0.8390585
## 
## $mean_per_class_error
## [1] 0.1244588
## 
## $domain
## [1] "n" "y"
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##          n   y  Error          Rate
## n      654  43 0.0617 =    43 / 697
## y       85 369 0.1872 =    85 / 454
## Totals 739 412 0.1112 = 128 / 1,151
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.993677 0.004396 0.002752 0.010917 0.606429  1.000000 0.002203    1.000000
## 2  0.992708 0.017467 0.010989 0.042553 0.609036  1.000000 0.008811    1.000000
## 3  0.992218 0.055675 0.035539 0.128458 0.616855  1.000000 0.028634    1.000000
## 4  0.991820 0.112266 0.073250 0.240214 0.629018  1.000000 0.059471    1.000000
## 5  0.990642 0.123967 0.081257 0.261324 0.631625  1.000000 0.066079    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.036538               0.002203                0.501101 697 453   0   1
## 2     0.073171               0.008811                0.504405 697 450   0   4
## 3     0.132431               0.028634                0.514317 697 441   0  13
## 4     0.192038               0.059471                0.529736 697 427   0  27
## 5     0.202697               0.066079                0.533040 697 424   0  30
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.997797 0.000000 0.002203   0
## 2 1.000000 0.991189 0.000000 0.008811   1
## 3 1.000000 0.971366 0.000000 0.028634   2
## 4 1.000000 0.940529 0.000000 0.059471   3
## 5 1.000000 0.933921 0.000000 0.066079   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.023689 0.581186 0.773850 0.465332 0.435274  0.410747 0.993392
## 396  0.021174 0.576726 0.770677 0.460768 0.424848  0.406306 0.993392
## 397  0.020825 0.570886 0.766485 0.454820 0.410947  0.400533 0.993392
## 398  0.020332 0.569085 0.765185 0.452993 0.406603  0.398762 0.993392
## 399  0.019058 0.568025 0.765979 0.451375 0.401390  0.397020 0.997797
## 400  0.018317 0.565732 0.765083 0.448794 0.394440  0.394440 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.071736     0.151871               0.071736                0.532564  50
## 396    0.054519     0.126337               0.054519                0.523956  38
## 397    0.031564     0.083672               0.031564                0.512478  22
## 398    0.024390     0.066510               0.024390                0.508891  17
## 399    0.012912     0.056401               0.012912                0.505355   9
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns fps tps      tnr      fnr      fpr      tpr idx
## 395   3 647 451 0.071736 0.006608 0.928264 0.993392 394
## 396   3 659 451 0.054519 0.006608 0.945481 0.993392 395
## 397   3 675 451 0.031564 0.006608 0.968436 0.993392 396
## 398   3 680 451 0.024390 0.006608 0.975610 0.993392 397
## 399   1 688 453 0.012912 0.002203 0.987088 0.997797 398
## 400   0 697 454 0.000000 0.000000 1.000000 1.000000 399
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.477283   0.852194 174
## 2                       max f2  0.257556   0.850471 250
## 3                 max f0point5  0.676467   0.891167 135
## 4                 max accuracy  0.477283   0.888792 174
## 5                max precision  0.993677   1.000000   0
## 6                   max recall  0.018317   1.000000 399
## 7              max specificity  0.993677   1.000000   0
## 8             max absolute_mcc  0.477283   0.765704 174
## 9   max min_per_class_accuracy  0.309127   0.856828 227
## 10 max mean_per_class_accuracy  0.477283   0.875541 174
## 11                     max tns  0.993677 697.000000   0
## 12                     max fns  0.993677 453.000000   0
## 13                     max fps  0.018317 697.000000 399
## 14                     max tps  0.018317 454.000000 399
## 15                     max tnr  0.993677   1.000000   0
## 16                     max fnr  0.993677   0.997797   0
## 17                     max fpr  0.018317   1.000000 399
## 18                     max tpr  0.018317   1.000000 399
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 39.44 %, avg score: 40.23 %
##    group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1      1               0.01042572        0.992096 2.535242        2.535242
## 2      2               0.02085143        0.991720 2.535242        2.535242
## 3      3               0.03040834        0.989219 2.535242        2.535242
## 4      4               0.04257168        0.987667 2.535242        2.535242
## 5      5               0.05039096        0.987211 2.535242        2.535242
## 6      6               0.10078193        0.980466 2.491531        2.513387
## 7      7               0.15030408        0.967178 2.490764        2.505933
## 8      8               0.20069505        0.939415 2.360398        2.469392
## 9      9               0.30060817        0.740400 2.204559        2.381369
## 10    10               0.40139010        0.349985 1.267621        2.101727
## 11    11               0.50043440        0.175590 0.467018        1.778191
## 12    12               0.60034752        0.124544 0.308638        1.533620
## 13    13               0.70112945        0.098727 0.218555        1.344589
## 14    14               0.80017376        0.070146 0.311346        1.216696
## 15    15               0.93136403        0.031755 0.134317        1.064234
## 16    16               1.00000000        0.018317 0.128367        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.992474                 1.000000         0.992474
## 2       1.000000 0.991892                 1.000000         0.992183
## 3       1.000000 0.990363                 1.000000         0.991611
## 4       1.000000 0.988485                 1.000000         0.990718
## 5       1.000000 0.987405                 1.000000         0.990204
## 6       0.982759 0.984059                 0.991379         0.987131
## 7       0.982456 0.975782                 0.988439         0.983392
## 8       0.931034 0.956514                 0.974026         0.976643
## 9       0.869565 0.863433                 0.939306         0.939016
## 10      0.500000 0.515857                 0.829004         0.832768
## 11      0.184211 0.261101                 0.701389         0.719626
## 12      0.121739 0.146808                 0.604920         0.624294
## 13      0.086207 0.110415                 0.530359         0.550428
## 14      0.122807 0.085708                 0.479913         0.492906
## 15      0.052980 0.048248                 0.419776         0.430272
## 16      0.050633 0.022076                 0.394440         0.402255
##    capture_rate cumulative_capture_rate       gain cumulative_gain
## 1      0.026432                0.026432 153.524229      153.524229
## 2      0.026432                0.052863 153.524229      153.524229
## 3      0.024229                0.077093 153.524229      153.524229
## 4      0.030837                0.107930 153.524229      153.524229
## 5      0.019824                0.127753 153.524229      153.524229
## 6      0.125551                0.253304 149.153122      151.338675
## 7      0.123348                0.376652 149.076436      150.593313
## 8      0.118943                0.495595 136.039799      146.939184
## 9      0.220264                0.715859 120.455851      138.136920
## 10     0.127753                0.843612  26.762115      110.172683
## 11     0.046256                0.889868 -53.298168       77.819077
## 12     0.030837                0.920705 -69.136181       53.361979
## 13     0.022026                0.942731 -78.144463       34.458947
## 14     0.030837                0.973568 -68.865446       21.669608
## 15     0.017621                0.991189 -86.568253        6.423417
## 16     0.008811                1.000000 -87.163330        0.000000
##    kolmogorov_smirnov
## 1            0.026432
## 2            0.052863
## 3            0.077093
## 4            0.107930
## 5            0.127753
## 6            0.251869
## 7            0.373783
## 8            0.486986
## 9            0.685730
## 10           0.730269
## 11           0.643096
## 12           0.529026
## 13           0.398972
## 14           0.286337
## 15           0.098793
## 16           0.000000
h2o.auc(performance_h2o)
## [1] 0.9195293
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.477283415294237:
##          n   y    Error       Rate
## n      654  43 0.061693    =43/697
## y       85 369 0.187225    =85/454
## Totals 739 412 0.111208  =128/1151
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.993677 0.004396 0.002752 0.010917 0.606429  1.000000 0.002203    1.000000
## 2  0.992708 0.017467 0.010989 0.042553 0.609036  1.000000 0.008811    1.000000
## 3  0.992218 0.055675 0.035539 0.128458 0.616855  1.000000 0.028634    1.000000
## 4  0.991820 0.112266 0.073250 0.240214 0.629018  1.000000 0.059471    1.000000
## 5  0.990642 0.123967 0.081257 0.261324 0.631625  1.000000 0.066079    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.036538               0.002203                0.501101 697 453   0   1
## 2     0.073171               0.008811                0.504405 697 450   0   4
## 3     0.132431               0.028634                0.514317 697 441   0  13
## 4     0.192038               0.059471                0.529736 697 427   0  27
## 5     0.202697               0.066079                0.533040 697 424   0  30
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.997797 0.000000 0.002203   0
## 2 1.000000 0.991189 0.000000 0.008811   1
## 3 1.000000 0.971366 0.000000 0.028634   2
## 4 1.000000 0.940529 0.000000 0.059471   3
## 5 1.000000 0.933921 0.000000 0.066079   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.023689 0.581186 0.773850 0.465332 0.435274  0.410747 0.993392
## 396  0.021174 0.576726 0.770677 0.460768 0.424848  0.406306 0.993392
## 397  0.020825 0.570886 0.766485 0.454820 0.410947  0.400533 0.993392
## 398  0.020332 0.569085 0.765185 0.452993 0.406603  0.398762 0.993392
## 399  0.019058 0.568025 0.765979 0.451375 0.401390  0.397020 0.997797
## 400  0.018317 0.565732 0.765083 0.448794 0.394440  0.394440 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.071736     0.151871               0.071736                0.532564  50
## 396    0.054519     0.126337               0.054519                0.523956  38
## 397    0.031564     0.083672               0.031564                0.512478  22
## 398    0.024390     0.066510               0.024390                0.508891  17
## 399    0.012912     0.056401               0.012912                0.505355   9
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns fps tps      tnr      fnr      fpr      tpr idx
## 395   3 647 451 0.071736 0.006608 0.928264 0.993392 394
## 396   3 659 451 0.054519 0.006608 0.945481 0.993392 395
## 397   3 675 451 0.031564 0.006608 0.968436 0.993392 396
## 398   3 680 451 0.024390 0.006608 0.975610 0.993392 397
## 399   1 688 453 0.012912 0.002203 0.987088 0.997797 398
## 400   0 697 454 0.000000 0.000000 1.000000 1.000000 399
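
The thresholds table above already contains everything needed for an ROC curve; a sketch that plots it with ggplot2, using the fpr and tpr columns printed by h2o.metric():

# Plot the test-set ROC curve from the thresholds-and-metrics table
performance_h2o %>%
    h2o.metric() %>%
    as.data.frame() %>%
    ggplot(aes(x = fpr, y = tpr)) +
    geom_line() +
    geom_abline(linetype = "dashed") +
    labs(
        title = "ROC Curve - Test Set",
        x     = "False Positive Rate",
        y     = "True Positive Rate"
    )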