The goal is to automate building and tuning a classification model that predicts employee attrition, using h2o::h2o.automl().

Set up

Import data

Import the cleaned data from Module 7.

library(h2o)

## Warning: package 'h2o' was built under R version 4.5.3

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day()   masks h2o::day()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ lubridate::hour()  masks h2o::hour()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week()  masks h2o::week()
## ✖ lubridate::year()  masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.5.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.10     ✔ rsample      1.3.1 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.0.1 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2

## Warning: package 'dials' was built under R version 4.5.2

## Warning: package 'infer' was built under R version 4.5.2

## Warning: package 'modeldata' was built under R version 4.5.2

## Warning: package 'parsnip' was built under R version 4.5.2

## Warning: package 'tailor' was built under R version 4.5.2

## Warning: package 'tune' was built under R version 4.5.2

## Warning: package 'workflows' was built under R version 4.5.2

## Warning: package 'workflowsets' was built under R version 4.5.2

## Warning: package 'yardstick' was built under R version 4.5.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()

library(tidyquant)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ scales::col_factor()           masks readr::col_factor()
## ✖ lubridate::day()               masks h2o::day()
## ✖ scales::discard()              masks purrr::discard()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ recipes::fixed()               masks stringr::fixed()
## ✖ lubridate::hour()              masks h2o::hour()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ TTR::momentum()                masks dials::momentum()
## ✖ lubridate::month()             masks h2o::month()
## ✖ yardstick::spec()              masks readr::spec()
## ✖ quantmod::summary()            masks h2o::summary(), base::summary()
## ✖ lubridate::week()              masks h2o::week()
## ✖ lubridate::year()              masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the wrangled attrition data from disk.
data <- read_csv("../00_data/data_wrangled/data_clean.csv")

# h2o accepts only numeric and factor columns, so convert every character
# column to a factor.
data <- mutate(data, across(where(is.character), factor))

## Rows: 1470 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (24): Age, DailyRate, DistanceFromHome, Education, EmployeeNumber, Envir...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Split data

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Split into training and testing sets, stratified on the Attrition outcome,
# using initial_split()'s default training proportion.
data_split <- data %>% initial_split(strata = "Attrition")
train_tbl  <- data_split %>% training()
test_tbl   <- data_split %>% testing()

Recipes

# Preprocessing recipe: model Attrition on all other columns and drop any
# predictor that has zero variance in the training data.
# NOTE(review): recipe_obj is not prepped or baked anywhere in the visible
# part of this document - confirm whether it is still needed.
recipe_obj <- step_zv(
    recipe(Attrition ~ ., data = train_tbl),
    all_predictors()
)

Model

# Connect to the local H2O cluster (h2o.init() starts one if none is running).
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         23 minutes 21 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    2 years, 4 months and 5 days 
##     H2O cluster name:           H2O_started_from_R_conno_nlb794 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.80 GB 
##     H2O cluster total cores:    16 
##     H2O cluster allowed cores:  16 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.5.1 (2025-06-13 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (2 years, 4 months and 5 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Upload the training split to the H2O cluster as an H2OFrame. The frames live
# in the cluster, not in the R session, so they must be recreated on every
# run for the document to knit.
train_h2o <- as.h2o(train_tbl)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Upload the held-out test split to the H2O cluster as an H2OFrame.
test_h2o  <- as.h2o(test_tbl)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Restore the previously saved AutoML model from disk up front, so every later
# section of the document can use best_model.
best_model <- h2o.loadModel(
    path = "h2o_models/GLM_1_AutoML_1_20260426_150130"
)

Examine the output of h2o.automl

# The AutoML result object (models_h2o) does not exist in this session, which
# made this chunk crash; inspect the reloaded best_model instead.

# Storage type of the model object.
typeof(best_model)

## [1] "S4"

# List the slots available on the model object.
slotNames(best_model)

## [1] "model_id"      "algorithm"     "parameters"    "allparameters"
## [5] "params"        "have_pojo"     "have_mojo"     "model"

# The full AutoML leaderboard is not available in this session, so print the
# details of the single reloaded model instead.
print(best_model)

## Model Details:
## ==============
## 
## H2OBinomialModel: glm
## Model ID:  GLM_1_AutoML_1_20260426_150130 
## GLM Model: summary
##     family  link             regularization
## 1 binomial logit Ridge ( lambda = 0.02062 )
##                                                                lambda_search
## 1 nlambda = 30, lambda.max = 6.2664, lambda.min = 0.02062, lambda.1se = -1.0
##   number_of_predictors_total number_of_active_predictors number_of_iterations
## 1                         52                          52                   26
##                                           training_frame
## 1 AutoML_1_20260426_150130_training_train_tbl_sid_bb68_1
## 
## Coefficients: glm coefficients
##                               names coefficients standardized_coefficients
## 1                         Intercept     4.178306                 -1.710400
## 2 JobRole.Healthcare Representative    -0.143863                 -0.143863
## 3           JobRole.Human Resources    -0.009489                 -0.009489
## 4     JobRole.Laboratory Technician     0.214816                  0.214816
## 5                   JobRole.Manager    -0.044525                 -0.044525
## 
## ---
##                      names coefficients standardized_coefficients
## 48   TrainingTimesLastYear    -0.149741                 -0.195723
## 49         WorkLifeBalance    -0.324669                 -0.234313
## 50          YearsAtCompany     0.028538                  0.178531
## 51      YearsInCurrentRole    -0.079875                 -0.286338
## 52 YearsSinceLastPromotion     0.090227                  0.289193
## 53    YearsWithCurrManager    -0.060290                 -0.215217
## 
## H2OBinomialMetrics: glm
## ** Reported on training data. **
## 
## MSE:  0.09078449
## RMSE:  0.3013046
## LogLoss:  0.3125029
## Mean Per-Class Error:  0.225214
## AUC:  0.8636697
## AUCPR:  0.6985762
## Gini:  0.7273394
## R^2:  0.3399708
## Residual Deviance:  622.5058
## AIC:  728.5058
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error      Rate
## No     787  45 0.054087   =45/832
## Yes     65  99 0.396341   =65/164
## Totals 852 144 0.110442  =110/996
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.329891   0.642857 105
## 2                       max f2  0.223239   0.685841 166
## 3                 max f0point5  0.439442   0.741870  64
## 4                 max accuracy  0.439442   0.899598  64
## 5                max precision  0.884233   1.000000   0
## 6                   max recall  0.011760   1.000000 389
## 7              max specificity  0.884233   1.000000   0
## 8             max absolute_mcc  0.431101   0.586395  67
## 9   max min_per_class_accuracy  0.184880   0.788462 194
## 10 max mean_per_class_accuracy  0.223239   0.803530 166
## 11                     max tns  0.884233 832.000000   0
## 12                     max fns  0.884233 163.000000   0
## 13                     max fps  0.002109 832.000000 399
## 14                     max tps  0.011760 164.000000 389
## 15                     max tnr  0.884233   1.000000   0
## 16                     max fnr  0.884233   0.993902   0
## 17                     max fpr  0.002109   1.000000 399
## 18                     max tpr  0.011760   1.000000 389
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: glm
## ** Reported on validation data. **
## 
## MSE:  0.08834389
## RMSE:  0.297227
## LogLoss:  0.2999775
## Mean Per-Class Error:  0.1797659
## AUC:  0.8319398
## AUCPR:  0.5005051
## Gini:  0.6638796
## R^2:  0.1856259
## Residual Deviance:  62.99527
## AIC:  168.9953
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        No Yes    Error     Rate
## No     66  26 0.282609   =26/92
## Yes     1  12 0.076923    =1/13
## Totals 67  38 0.257143  =27/105
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1  0.191749  0.470588  37
## 2                       max f2  0.191749  0.666667  37
## 3                 max f0point5  0.457402  0.606061   4
## 4                 max accuracy  0.533539  0.904762   2
## 5                max precision  0.658813  1.000000   0
## 6                   max recall  0.094711  1.000000  69
## 7              max specificity  0.658813  1.000000   0
## 8             max absolute_mcc  0.457402  0.459069   4
## 9   max min_per_class_accuracy  0.243975  0.769231  29
## 10 max mean_per_class_accuracy  0.191749  0.820234  37
## 11                     max tns  0.658813 92.000000   0
## 12                     max fns  0.658813 12.000000   0
## 13                     max fps  0.008778 92.000000 104
## 14                     max tps  0.094711 13.000000  69
## 15                     max tnr  0.658813  1.000000   0
## 16                     max fnr  0.658813  0.923077   0
## 17                     max fpr  0.008778  1.000000 104
## 18                     max tpr  0.094711  1.000000  69
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

Save and Load

# Work with the model loaded from disk (best_model) rather than the AutoML
# object models_h2o; start by checking its storage type.
typeof(best_model)

## [1] "S4"

# List the slots available on the loaded model object.
slotNames(best_model)

## [1] "model_id"      "algorithm"     "parameters"    "allparameters"
## [5] "params"        "have_pojo"     "have_mojo"     "model"

# Score the loaded model on the held-out test frame and keep the resulting
# performance object for the metric extractors used below.
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
# Area under the ROC curve on the test data.
h2o.auc(performance_h2o)

## [1] 0.8392665

# Confusion matrix on the test data (actual vs. predicted class).
h2o.confusionMatrix(performance_h2o)

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.336454545130086:
##         No Yes    Error     Rate
## No     292  17 0.055016  =17/309
## Yes     27  33 0.450000   =27/60
## Totals 319  50 0.119241  =44/369

Make predictions

# Score the test frame with the loaded model; the result is an H2OFrame held
# in the cluster.
predictions <- h2o.predict(best_model, newdata = test_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Pull the predictions out of the H2O cluster into a local tibble.
predictions_tbl <- as_tibble(predictions)

# Show the predictions side by side with the original test rows. The combined
# table is only printed, not stored.
bind_cols(predictions_tbl, test_tbl)

## # A tibble: 369 × 35
##    predict    No     Yes   Age Attrition BusinessTravel    DailyRate Department 
##    <fct>   <dbl>   <dbl> <dbl> <fct>     <fct>                 <dbl> <fct>      
##  1 No      0.821 0.179      59 No        Travel_Rarely          1324 Research &…
##  2 No      0.920 0.0797     35 No        Travel_Rarely           809 Research &…
##  3 No      0.891 0.109      34 No        Travel_Rarely          1346 Research &…
##  4 No      0.811 0.189      22 No        Non-Travel             1123 Research &…
##  5 No      0.974 0.0259     53 No        Travel_Rarely          1219 Sales      
##  6 No      0.967 0.0326     24 No        Non-Travel              673 Research &…
##  7 No      0.898 0.102      21 No        Travel_Rarely           391 Research &…
##  8 Yes     0.766 0.234      34 Yes       Travel_Rarely           699 Research &…
##  9 No      0.996 0.00448    53 No        Travel_Rarely          1282 Research &…
## 10 Yes     0.199 0.801      32 Yes       Travel_Frequently      1125 Research &…
## # ℹ 359 more rows
## # ℹ 27 more variables: DistanceFromHome <dbl>, Education <dbl>,
## #   EducationField <fct>, EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>,
## #   Gender <fct>, HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>,
## #   JobRole <fct>, JobSatisfaction <dbl>, MaritalStatus <fct>,
## #   MonthlyIncome <dbl>, MonthlyRate <dbl>, NumCompaniesWorked <dbl>,
## #   OverTime <fct>, PercentSalaryHike <dbl>, PerformanceRating <dbl>, …

Evaluate model

# Open the help page for h2o.performance (interactive lookup only; it does not
# affect the analysis).
?h2o.performance

## starting httpd help server ... done

# Score the loaded model on the held-out test frame.
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)

# Storage type of the performance object.
typeof(performance_h2o)

## [1] "S4"

# List the slots available on the performance object.
slotNames(performance_h2o)

## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"

# Dump the raw contents of the performance object's `metrics` slot.
performance_h2o@metrics

## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GLM_1_AutoML_1_20260426_150130"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GLM_1_AutoML_1_20260426_150130"
## 
## 
## $model_checksum
## [1] "-8230819784329863776"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_b9e7_3"
## 
## 
## $frame_checksum
## [1] "-54413681510283746"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.777232e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.09508623
## 
## $RMSE
## [1] 0.3083606
## 
## $nobs
## [1] 369
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.3016701
## 
## $logloss
## [1] 0.3225875
## 
## $AUC
## [1] 0.8392665
## 
## $pr_auc
## [1] 0.6600645
## 
## $Gini
## [1] 0.6785329
## 
## $mean_per_class_error
## [1] 0.2525081
## 
## $domain
## [1] "No"  "Yes"
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##         No Yes  Error       Rate
## No     292  17 0.0550 = 17 / 309
## Yes     27  33 0.4500 =  27 / 60
## Totals 319  50 0.1192 = 44 / 369
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.832201 0.032787 0.020747 0.078125 0.840108  1.000000 0.016667    1.000000
## 2  0.800675 0.064516 0.041322 0.147059 0.842818  1.000000 0.033333    1.000000
## 3  0.772351 0.095238 0.061728 0.208333 0.845528  1.000000 0.050000    1.000000
## 4  0.725671 0.125000 0.081967 0.263158 0.848238  1.000000 0.066667    1.000000
## 5  0.668013 0.153846 0.102041 0.312500 0.850949  1.000000 0.083333    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.118299               0.016667                0.508333 309  59   0   1
## 2     0.167527               0.033333                0.516667 309  58   0   2
## 3     0.205458               0.050000                0.525000 309  57   0   3
## 4     0.237568               0.066667                0.533333 309  56   0   4
## 5     0.265973               0.083333                0.541667 309  55   0   5
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.983333 0.000000 0.016667   0
## 2 1.000000 0.966667 0.000000 0.033333   1
## 3 1.000000 0.950000 0.000000 0.050000   2
## 4 1.000000 0.933333 0.000000 0.066667   3
## 5 1.000000 0.916667 0.000000 0.083333   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 364  0.006524 0.283019 0.496689 0.197889 0.176152  0.164835 1.000000
## 365  0.004485 0.282353 0.495868 0.197368 0.173442  0.164384 1.000000
## 366  0.004051 0.281690 0.495050 0.196850 0.170732  0.163934 1.000000
## 367  0.003614 0.281030 0.494234 0.196335 0.168022  0.163488 1.000000
## 368  0.003432 0.280374 0.493421 0.195822 0.165312  0.163043 1.000000
## 369  0.001492 0.279720 0.492611 0.195313 0.162602  0.162602 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 364    0.016181     0.051645               0.016181                0.508091   5
## 365    0.012945     0.046130               0.012945                0.506472   4
## 366    0.009709     0.039895               0.009709                0.504854   3
## 367    0.006472     0.032530               0.006472                0.503236   2
## 368    0.003236     0.022971               0.003236                0.501618   1
## 369    0.000000     0.000000               0.000000                0.500000   0
##     fns fps tps      tnr      fnr      fpr      tpr idx
## 364   0 304  60 0.016181 0.000000 0.983819 1.000000 363
## 365   0 305  60 0.012945 0.000000 0.987055 1.000000 364
## 366   0 306  60 0.009709 0.000000 0.990291 1.000000 365
## 367   0 307  60 0.006472 0.000000 0.993528 1.000000 366
## 368   0 308  60 0.003236 0.000000 0.996764 1.000000 367
## 369   0 309  60 0.000000 0.000000 1.000000 1.000000 368
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.336455   0.600000  49
## 2                       max f2  0.166653   0.652174 127
## 3                 max f0point5  0.445676   0.697674  27
## 4                 max accuracy  0.445676   0.891599  27
## 5                max precision  0.832201   1.000000   0
## 6                   max recall  0.037474   1.000000 297
## 7              max specificity  0.832201   1.000000   0
## 8             max absolute_mcc  0.385320   0.541270  38
## 9   max min_per_class_accuracy  0.186496   0.766667 116
## 10 max mean_per_class_accuracy  0.261206   0.776699  74
## 11                     max tns  0.832201 309.000000   0
## 12                     max fns  0.832201  59.000000   0
## 13                     max fps  0.001492 309.000000 368
## 14                     max tps  0.037474  60.000000 297
## 15                     max tnr  0.832201   1.000000   0
## 16                     max fnr  0.832201   0.983333   0
## 17                     max fpr  0.001492   1.000000 368
## 18                     max tpr  0.037474   1.000000 297
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 16.26 %, avg score: 16.46 %
##    group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1      1               0.01084011        0.686463 6.150000        6.150000
## 2      2               0.02168022        0.597920 6.150000        6.150000
## 3      3               0.03252033        0.578017 6.150000        6.150000
## 4      4               0.04065041        0.560962 4.100000        5.740000
## 5      5               0.05149051        0.511268 6.150000        5.826316
## 6      6               0.10027100        0.396418 3.075000        4.487838
## 7      7               0.15176152        0.308420 1.942105        3.624107
## 8      8               0.20054201        0.261886 2.050000        3.241216
## 9      9               0.30081301        0.192344 0.831081        2.437838
## 10    10               0.40108401        0.150235 0.831081        2.036149
## 11    11               0.50135501        0.113130 0.332432        1.695405
## 12    12               0.59891599        0.084068 0.341667        1.474887
## 13    13               0.69918699        0.057416 0.498649        1.334884
## 14    14               0.79945799        0.038044 0.498649        1.230000
## 15    15               0.89972900        0.024545 0.166216        1.111446
## 16    16               1.00000000        0.001492 0.000000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.782724                 1.000000         0.782724
## 2       1.000000 0.636409                 1.000000         0.709567
## 3       1.000000 0.588227                 1.000000         0.669120
## 4       0.666667 0.573292                 0.933333         0.649955
## 5       1.000000 0.530102                 0.947368         0.624722
## 6       0.500000 0.446798                 0.729730         0.538165
## 7       0.315789 0.349397                 0.589286         0.474119
## 8       0.333333 0.282186                 0.527027         0.427432
## 9       0.135135 0.227361                 0.396396         0.360742
## 10      0.135135 0.168011                 0.331081         0.312559
## 11      0.054054 0.131943                 0.275676         0.276436
## 12      0.055556 0.097622                 0.239819         0.247308
## 13      0.081081 0.070694                 0.217054         0.221979
## 14      0.081081 0.049383                 0.200000         0.200332
## 15      0.027027 0.031515                 0.180723         0.181518
## 16      0.000000 0.013105                 0.162602         0.164631
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      0.066667                0.066667  515.000000      515.000000
## 2      0.066667                0.133333  515.000000      515.000000
## 3      0.066667                0.200000  515.000000      515.000000
## 4      0.033333                0.233333  310.000000      474.000000
## 5      0.066667                0.300000  515.000000      482.631579
## 6      0.150000                0.450000  207.500000      348.783784
## 7      0.100000                0.550000   94.210526      262.410714
## 8      0.100000                0.650000  105.000000      224.121622
## 9      0.083333                0.733333  -16.891892      143.783784
## 10     0.083333                0.816667  -16.891892      103.614865
## 11     0.033333                0.850000  -66.756757       69.540541
## 12     0.033333                0.883333  -65.833333       47.488688
## 13     0.050000                0.933333  -50.135135       33.488372
## 14     0.050000                0.983333  -50.135135       23.000000
## 15     0.016667                1.000000  -83.378378       11.144578
## 16     0.000000                1.000000 -100.000000        0.000000
##    kolmogorov_smirnov
## 1            0.066667
## 2            0.133333
## 3            0.200000
## 4            0.230097
## 5            0.296764
## 6            0.417638
## 7            0.475566
## 8            0.536731
## 9            0.516505
## 10           0.496278
## 11           0.416343
## 12           0.339644
## 13           0.279612
## 14           0.219579
## 15           0.119741
## 16           0.000000
## 
## $residual_deviance
## [1] 238.0696
## 
## $null_deviance
## [1] 327.6531
## 
## $AIC
## [1] 344.0696
## 
## $loglikelihood
## [1] 0
## 
## $null_degrees_of_freedom
## [1] 368
## 
## $residual_degrees_of_freedom
## [1] 316

# Area under the ROC curve on the test data.
h2o.auc(performance_h2o)

## [1] 0.8392665

# Confusion matrix on the test data (actual vs. predicted class).
h2o.confusionMatrix(performance_h2o)

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.336454545130086:
##         No Yes    Error     Rate
## No     292  17 0.055016  =17/309
## Yes     27  33 0.450000   =27/60
## Totals 319  50 0.119241  =44/369

# Table of binomial metrics (f1, precision, recall, ...) at each
# classification threshold.
h2o.metric(performance_h2o)

## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.832201 0.032787 0.020747 0.078125 0.840108  1.000000 0.016667    1.000000
## 2  0.800675 0.064516 0.041322 0.147059 0.842818  1.000000 0.033333    1.000000
## 3  0.772351 0.095238 0.061728 0.208333 0.845528  1.000000 0.050000    1.000000
## 4  0.725671 0.125000 0.081967 0.263158 0.848238  1.000000 0.066667    1.000000
## 5  0.668013 0.153846 0.102041 0.312500 0.850949  1.000000 0.083333    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.118299               0.016667                0.508333 309  59   0   1
## 2     0.167527               0.033333                0.516667 309  58   0   2
## 3     0.205458               0.050000                0.525000 309  57   0   3
## 4     0.237568               0.066667                0.533333 309  56   0   4
## 5     0.265973               0.083333                0.541667 309  55   0   5
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.983333 0.000000 0.016667   0
## 2 1.000000 0.966667 0.000000 0.033333   1
## 3 1.000000 0.950000 0.000000 0.050000   2
## 4 1.000000 0.933333 0.000000 0.066667   3
## 5 1.000000 0.916667 0.000000 0.083333   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 364  0.006524 0.283019 0.496689 0.197889 0.176152  0.164835 1.000000
## 365  0.004485 0.282353 0.495868 0.197368 0.173442  0.164384 1.000000
## 366  0.004051 0.281690 0.495050 0.196850 0.170732  0.163934 1.000000
## 367  0.003614 0.281030 0.494234 0.196335 0.168022  0.163488 1.000000
## 368  0.003432 0.280374 0.493421 0.195822 0.165312  0.163043 1.000000
## 369  0.001492 0.279720 0.492611 0.195313 0.162602  0.162602 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 364    0.016181     0.051645               0.016181                0.508091   5
## 365    0.012945     0.046130               0.012945                0.506472   4
## 366    0.009709     0.039895               0.009709                0.504854   3
## 367    0.006472     0.032530               0.006472                0.503236   2
## 368    0.003236     0.022971               0.003236                0.501618   1
## 369    0.000000     0.000000               0.000000                0.500000   0
##     fns fps tps      tnr      fnr      fpr      tpr idx
## 364   0 304  60 0.016181 0.000000 0.983819 1.000000 363
## 365   0 305  60 0.012945 0.000000 0.987055 1.000000 364
## 366   0 306  60 0.009709 0.000000 0.990291 1.000000 365
## 367   0 307  60 0.006472 0.000000 0.993528 1.000000 366
## 368   0 308  60 0.003236 0.000000 0.996764 1.000000 367
## 369   0 309  60 0.000000 0.000000 1.000000 1.000000 368

Code Along 11

Connor Sweeney

2026-04-26

Set up

Import data

Split data

Recipes

Model

Save and Load

Make predictions

Evaluate model