The goal is to automate building and tuning a classification model that predicts whether a CEO's departure was a dismissal (`ceo_dismissal`), using `h2o::h2o.automl()`.

Import data

# Download the TidyTuesday CEO departures data (2021-04-27 release).
departures <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv"
)

# Print a per-column overview: types, missingness and distribution summaries.
skimr::skim(departures)

Data summary
Name	departures
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

# Names of the columns considered relevant, captured as a character vector.
# NOTE(review): factors_vec is not referenced again in the code shown here.
factors_vec <- names(
    select(
        departures,
        departure_code, co_per_rol, fyear,
        tenure_no_ceodb, max_tenure_ceodb, fyear_gone
    )
)

library(lubridate)

# Build the modelling table from the raw departures data:
#   1. drop identifier, free-text and mostly-missing columns;
#   2. drop rows whose departure year is the implausible value 2997
#      (filter() keeps only rows where the condition is TRUE, so rows with a
#      missing fyear_gone are dropped by this step as well);
#   3. keep only rows with a known outcome;
#   4. turn the 0/1 outcome into a labelled factor for classification.
# NOTE(review): departure_code and dismissal_dataset_id stay in as predictors.
# departure_code looks like it encodes the outcome (target leakage) -- confirm
# against the data dictionary before trusting the model metrics.
data_clean <- departures %>%
    select(-c(
        interim_coceo, still_there, eight_ks, gvkey, co_per_rol, cik, fyear,
        "_merge", notes, sources, leftofc, exec_fullname, coname
    )) %>%
    # fyear_gone is numeric, so compare against a number, not a string.
    filter(fyear_gone != 2997) %>%
    filter(!is.na(ceo_dismissal)) %>%
    # Recode the numeric 0/1 flag straight to a factor. The level order
    # ("dismissed", "not dismissed") matches what as.factor() would produce
    # from the recoded strings.
    mutate(
        ceo_dismissal = factor(
            ceo_dismissal,
            levels = c(1, 0),
            labels = c("dismissed", "not dismissed")
        )
    )

#data_clean <- data_clean %>% sample_n(100)

library(h2o)
library(tidymodels)

## Warning: package 'broom' was built under R version 4.3.3

## Warning: package 'modeldata' was built under R version 4.3.3

library(tidyquant)

## Warning: package 'tidyquant' was built under R version 4.3.3

## Warning: package 'xts' was built under R version 4.3.3

# h2o requires all variables to be either numeric or factors, so convert any
# character columns of the cleaned data to factors. This works on data_clean
# (the table that is split and modelled below) instead of reading a separate
# CSV into an object that is never used.
data_clean <- data_clean %>%
    mutate(across(where(is.character), factor))

Split data

# Fix the RNG so the train/test partition is reproducible.
set.seed(1234)

# Stratify on the outcome so both partitions keep a similar class balance.
# No `prop` is given, so rsample's default split proportion applies.
data_split <- data_clean %>% initial_split(strata = "ceo_dismissal")
train_tbl  <- data_split %>% training()
test_tbl   <- data_split %>% testing()

Recipes

# Preprocessing specification: model the outcome on every other column and
# remove zero-variance predictors.
# NOTE(review): recipe_obj is not prepped or baked anywhere in the code shown;
# the unprocessed train_tbl / test_tbl are what get sent to h2o -- confirm
# that this is intentional.
recipe_obj <- step_zv(
    recipe(ceo_dismissal ~ ., data = train_tbl),
    all_predictors()
)

Model

# Start a local H2O cluster, or attach to one that is already running
# (the log below shows a successful connection to an existing cluster).
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 1 hours 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months 
##     H2O cluster name:           H2O_started_from_R_erinmcevoy_fhp551 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.99 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Upload the training tibble to H2O and split it roughly 85/15 into a
# training part and a validation part; the seed makes the split reproducible.
split.h2o <- h2o.splitFrame(
    data   = as.h2o(train_tbl),
    ratios = 0.85,
    seed   = 2345
)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# First element of the split: the frame the models are trained on.
train_h2o <- split.h2o[[1]]
# Second element of the split: the remaining rows, used as validation frame.
valid_h2o <- split.h2o[[2]]
# Upload the held-out test tibble to the H2O cluster as well.
test_h2o <- as.h2o(test_tbl)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Outcome column, and predictors = every other column of the training tibble.
y <- "ceo_dismissal"
x <- setdiff(colnames(train_tbl), y)

# Run AutoML: at most 10 models, no deep learning, 5-fold cross-validation.
# NOTE(review): with cross-validation enabled, H2O reports (see the log below)
# that the validation frame only provides informative metrics.
# NOTE(review): test_h2o ranks the leaderboard here and is also the frame the
# leader is evaluated on later -- consider whether that biases the final
# test-set estimate.
models_h2o <- h2o.automl(
    x = x,
    y = y,
    training_frame    = train_h2o,
    validation_frame  = valid_h2o,
    leaderboard_frame = test_h2o,
    nfolds            = 5,
    # max_runtime_secs  = 30,
    max_models        = 10,
    exclude_algos     = "DeepLearning",
    seed              = 3456
)

##   |                                                                              |                                                                      |   0%
## 14:48:22.83: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.  |                                                                              |====                                                                  |   5%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====                                                                 |   8%  |                                                                              |=======                                                               |  10%  |                                                                              |=========                                                             |  13%  |                                                                              |==========                                                            |  14%  |                                                                              |===========                                                           |  15%  |                                                                              |=============                                                         |  18%  |                                                                              |================                                                      |  23%  |                                                                              |======================                                                |  31%  |                                                                              |=======================                                          
     |  33%  |                                                                              |======================================================================| 100%

Examine the output

# Storage type of the AutoML result object.
typeof(models_h2o)

## [1] "S4"

# List the slots available on the AutoML result object.
slotNames(models_h2o)

## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"

# Show the leaderboard of trained models.
slot(models_h2o, "leaderboard")

##                                                   model_id       auc    logloss
## 1                          GBM_4_AutoML_27_20241121_144822 0.9998993 0.01116995
## 2                          GBM_1_AutoML_27_20241121_144822 0.9998975 0.01279744
## 3                          GBM_3_AutoML_27_20241121_144822 0.9998867 0.01161717
## 4                      XGBoost_2_AutoML_27_20241121_144822 0.9998849 0.01396485
## 5 StackedEnsemble_BestOfFamily_1_AutoML_27_20241121_144822 0.9998813 0.01119842
## 6    StackedEnsemble_AllModels_1_AutoML_27_20241121_144822 0.9998795 0.01104288
##       aucpr mean_per_class_error       rmse         mse
## 1 0.9999752          0.005043794 0.05790869 0.003353416
## 2 0.9999748          0.008753365 0.05620919 0.003159472
## 3 0.9999721          0.005710905 0.05969899 0.003563970
## 4 0.9999718          0.004363196 0.05545154 0.003074873
## 5 0.9999708          0.005710905 0.05734945 0.003288960
## 6 0.9999704          0.005710905 0.05743133 0.003298358
## 
## [12 rows x 7 columns]

# Show the details of the top-ranked (leader) model.
slot(models_h2o, "leader")

## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_4_AutoML_27_20241121_144822 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              79                       79              117991        10
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1        10   10.00000         74        178   114.11392
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.0006192444
## RMSE:  0.02488462
## LogLoss:  0.003074578
## Mean Per-Class Error:  0.0005268704
## AUC:  0.9999997
## AUCPR:  0.9999999
## Gini:  0.9999994
## R^2:  0.9961218
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##               dismissed not dismissed    Error     Rate
## dismissed           948             1 0.001054   =1/949
## not dismissed         0          3809 0.000000  =0/3809
## Totals              948          3810 0.000210  =1/4758
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.426762    0.999869 159
## 2                       max f2  0.426762    0.999947 159
## 3                 max f0point5  0.504015    0.999947 157
## 4                 max accuracy  0.504015    0.999790 157
## 5                max precision  0.999944    1.000000   0
## 6                   max recall  0.426762    1.000000 159
## 7              max specificity  0.999944    1.000000   0
## 8             max absolute_mcc  0.504015    0.999342 157
## 9   max min_per_class_accuracy  0.504015    0.999737 157
## 10 max mean_per_class_accuracy  0.504015    0.999869 157
## 11                     max tns  0.999944  949.000000   0
## 12                     max fns  0.999944 3808.000000   0
## 13                     max fps  0.000562  949.000000 399
## 14                     max tps  0.426762 3809.000000 159
## 15                     max tnr  0.999944    1.000000   0
## 16                     max fnr  0.999944    0.999737   0
## 17                     max fpr  0.000562    1.000000 399
## 18                     max tpr  0.426762    1.000000 159
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  0.001763995
## RMSE:  0.04199994
## LogLoss:  0.008976636
## Mean Per-Class Error:  0.000729927
## AUC:  0.9998754
## AUCPR:  0.9999707
## Gini:  0.9997508
## R^2:  0.9886818
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##               dismissed not dismissed    Error    Rate
## dismissed           164             0 0.000000  =0/164
## not dismissed         1           684 0.001460  =1/685
## Totals              165           684 0.001178  =1/849
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.486212   0.999270 256
## 2                       max f2  0.486212   0.998832 256
## 3                 max f0point5  0.486212   0.999708 256
## 4                 max accuracy  0.486212   0.998822 256
## 5                max precision  0.999930   1.000000   0
## 6                   max recall  0.006478   1.000000 271
## 7              max specificity  0.999930   1.000000   0
## 8             max absolute_mcc  0.486212   0.996237 256
## 9   max min_per_class_accuracy  0.486212   0.998540 256
## 10 max mean_per_class_accuracy  0.486212   0.999270 256
## 11                     max tns  0.999930 164.000000   0
## 12                     max fns  0.999930 684.000000   0
## 13                     max fps  0.000732 164.000000 399
## 14                     max tps  0.006478 685.000000 271
## 15                     max tnr  0.999930   1.000000   0
## 16                     max fnr  0.999930   0.998540   0
## 17                     max fpr  0.000732   1.000000 399
## 18                     max tpr  0.006478   1.000000 271
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.003297201
## RMSE:  0.05742126
## LogLoss:  0.01214609
## Mean Per-Class Error:  0.001969021
## AUC:  0.9998303
## AUCPR:  0.9999583
## Gini:  0.9996606
## R^2:  0.9793501
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##               dismissed not dismissed    Error      Rate
## dismissed           949             0 0.000000    =0/949
## not dismissed        15          3794 0.003938  =15/3809
## Totals              964          3794 0.003153  =15/4758
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.983067    0.998027 129
## 2                       max f2  0.121216    0.996905 163
## 3                 max f0point5  0.983067    0.999210 129
## 4                 max accuracy  0.983067    0.996847 129
## 5                max precision  0.999987    1.000000   0
## 6                   max recall  0.008510    1.000000 229
## 7              max specificity  0.999987    1.000000   0
## 8             max absolute_mcc  0.983067    0.990234 129
## 9   max min_per_class_accuracy  0.983067    0.996062 129
## 10 max mean_per_class_accuracy  0.983067    0.998031 129
## 11                     max tns  0.999987  949.000000   0
## 12                     max fns  0.999987 3808.000000   0
## 13                     max fps  0.000243  949.000000 399
## 14                     max tps  0.008510 3809.000000 229
## 15                     max tnr  0.999987    1.000000   0
## 16                     max fnr  0.999987    0.999737   0
## 17                     max fpr  0.000243    1.000000 399
## 18                     max tpr  0.008510    1.000000 229
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                             mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                0.996848 0.001661   0.995798   0.997899   0.994748
## auc                     0.999811 0.000130   0.999738   0.999924   0.999634
## err                     0.003152 0.001661   0.004202   0.002101   0.005252
## err_count               3.000000 1.581139   4.000000   2.000000   5.000000
## f0point5                0.999209 0.000418   0.998946   0.999474   0.998681
## f1                      0.998026 0.001042   0.997368   0.998686   0.996708
## f2                      0.996847 0.001663   0.995796   0.997899   0.994744
## lift_top_group          1.249147 0.000639   1.249344   1.249344   1.249344
## logloss                 0.012713 0.005792   0.015947   0.009249   0.020077
## max_per_class_error     0.003938 0.002075   0.005249   0.002625   0.006562
## mcc                     0.990262 0.005084   0.987036   0.993472   0.983852
## mean_per_class_accuracy 0.998031 0.001037   0.997375   0.998688   0.996719
## mean_per_class_error    0.001969 0.001037   0.002625   0.001312   0.003281
## mse                     0.003440 0.001477   0.004297   0.002971   0.005236
## pr_auc                  0.999953 0.000032   0.999935   0.999981   0.999910
## precision               1.000000 0.000000   1.000000   1.000000   1.000000
## r2                      0.978462 0.009239   0.973103   0.981405   0.967223
## recall                  0.996062 0.002075   0.994751   0.997375   0.993438
## rmse                    0.057361 0.013697   0.065549   0.054503   0.072361
## specificity             1.000000 0.000000   1.000000   1.000000   1.000000
##                         cv_4_valid cv_5_valid
## accuracy                  0.996845   0.998948
## auc                       0.999813   0.999944
## err                       0.003155   0.001052
## err_count                 3.000000   1.000000
## f0point5                  0.999209   0.999737
## f1                        0.998025   0.999343
## f2                        0.996844   0.998950
## lift_top_group            1.249672   1.248031
## logloss                   0.013149   0.005143
## max_per_class_error       0.003942   0.001312
## mcc                       0.990240   0.996710
## mean_per_class_accuracy   0.998029   0.999344
## mean_per_class_error      0.001971   0.000656
## mse                       0.003391   0.001307
## pr_auc                    0.999954   0.999986
## precision                 1.000000   1.000000
## r2                        0.978790   0.991789
## recall                    0.996058   0.998688
## rmse                      0.058231   0.036159
## specificity               1.000000   1.000000

# Open the help pages for fetching, saving and re-loading H2O models.
?h2o.getModel
?h2o.saveModel
?h2o.loadModel

# Take the leader of this AutoML run as the best model and save it to disk.
# Saving the leader object directly avoids a hard-coded model ID, which goes
# stale on every new AutoML run and does not exist on a fresh cluster.
best_model <- models_h2o@leader

h2o.saveModel(best_model, path = "h2o__model/")

## [1] "/Users/erinmcevoy/Desktop/PSU_DAT3100/00_data/h2o__model/GBM_4_AutoML_27_20241121_144822"

Make predictions

# Score the held-out test frame with the best model.
predictions <- best_model %>%
    h2o.predict(newdata = test_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Pull the H2O predictions back into R as a tibble.
predictions_tbl <- as_tibble(predictions)

# Put the predictions next to the original test rows for inspection.
bind_cols(predictions_tbl, test_tbl)

## # A tibble: 1,870 × 9
##    predict       dismissed not.dismissed dismissal_dataset_id departure_code
##    <fct>             <dbl>         <dbl>                <dbl>          <dbl>
##  1 not dismissed  0.000340      1.00                       12              5
##  2 dismissed      0.999         0.000765                   13              3
##  3 not dismissed  0.000465      1.00                       65              5
##  4 dismissed      0.999         0.000713                   78              3
##  5 not dismissed  0.000423      1.00                       80              5
##  6 not dismissed  0.000487      1.00                       81              5
##  7 dismissed      0.999         0.00135                    88              3
##  8 not dismissed  0.000243      1.00                       99              5
##  9 not dismissed  0.000262      1.00                      117              5
## 10 not dismissed  0.000301      1.00                      121              5
## # ℹ 1,860 more rows
## # ℹ 4 more variables: ceo_dismissal <fct>, tenure_no_ceodb <dbl>,
## #   max_tenure_ceodb <dbl>, fyear_gone <dbl>

Evaluate model

# Open the help page, then compute the best model's metrics on the test frame.
?h2o.performance
performance_h2o <- best_model %>%
    h2o.performance(newdata = test_h2o)
performance_h2o %>% typeof()

## [1] "S4"

# List the slots available on the performance object.
performance_h2o %>% slotNames()

## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"

# Print the full list of test-set metrics stored on the performance object.
slot(performance_h2o, "metrics")

## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GBM_4_AutoML_27_20241121_144822"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GBM_4_AutoML_27_20241121_144822"
## 
## 
## $model_checksum
## [1] "2991273419511954476"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_a911_3"
## 
## 
## $frame_checksum
## [1] "9047969800299088666"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.732219e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.003353416
## 
## $RMSE
## [1] 0.05790869
## 
## $nobs
## [1] 1870
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.978914
## 
## $logloss
## [1] 0.01116995
## 
## $AUC
## [1] 0.9998993
## 
## $pr_auc
## [1] 0.9999752
## 
## $Gini
## [1] 0.9997986
## 
## $mean_per_class_error
## [1] 0.005043794
## 
## $domain
## [1] "dismissed"     "not dismissed"
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               dismissed not dismissed  Error        Rate
## dismissed           368             3 0.0081 =   3 / 371
## not dismissed         3          1496 0.0020 = 3 / 1,499
## Totals              371          1499 0.0032 = 6 / 1,870
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.999930 0.001333 0.000834 0.003327 0.198930  1.000000 0.000667    1.000000
## 2  0.999925 0.003995 0.002500 0.009927 0.200000  1.000000 0.002001    1.000000
## 3  0.999922 0.005323 0.003333 0.013201 0.200535  1.000000 0.002668    1.000000
## 4  0.999916 0.007973 0.004998 0.019698 0.201604  1.000000 0.004003    1.000000
## 5  0.999912 0.011936 0.007494 0.029316 0.203209  1.000000 0.006004    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns  fns fps tps
## 1     0.011508               0.000667                0.500334 371 1498   0   1
## 2     0.019942               0.002001                0.501001 371 1496   0   3
## 3     0.023034               0.002668                0.501334 371 1495   0   4
## 4     0.028225               0.004003                0.502001 371 1493   0   6
## 5     0.034597               0.006004                0.503002 371 1490   0   9
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.999333 0.000000 0.000667   0
## 2 1.000000 0.997999 0.000000 0.002001   1
## 3 1.000000 0.997332 0.000000 0.002668   2
## 4 1.000000 0.995997 0.000000 0.004003   3
## 5 1.000000 0.993996 0.000000 0.006004   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.000719 0.891466 0.953562 0.836963 0.804813  0.804185 1.000000
## 396  0.000713 0.891201 0.953441 0.836589 0.804278  0.803753 1.000000
## 397  0.000699 0.890936 0.953320 0.836216 0.803743  0.803323 1.000000
## 398  0.000693 0.890671 0.953199 0.835843 0.803209  0.802892 1.000000
## 399  0.000686 0.890143 0.952956 0.835097 0.802139  0.802033 1.000000
## 400  0.000607 0.889878 0.952835 0.834725 0.801604  0.801604 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.016173     0.114042               0.016173                0.508086   6
## 396    0.013477     0.104078               0.013477                0.506739   5
## 397    0.010782     0.093065               0.010782                0.505391   4
## 398    0.008086     0.080575               0.008086                0.504043   3
## 399    0.002695     0.046495               0.002695                0.501348   1
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns fps  tps      tnr      fnr      fpr      tpr idx
## 395   0 365 1499 0.016173 0.000000 0.983827 1.000000 394
## 396   0 366 1499 0.013477 0.000000 0.986523 1.000000 395
## 397   0 367 1499 0.010782 0.000000 0.989218 1.000000 396
## 398   0 368 1499 0.008086 0.000000 0.991914 1.000000 397
## 399   0 370 1499 0.002695 0.000000 0.997305 1.000000 398
## 400   0 371 1499 0.000000 0.000000 1.000000 1.000000 399
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.197672    0.997999 201
## 2                       max f2  0.197672    0.997999 201
## 3                 max f0point5  0.997764    0.998928 193
## 4                 max accuracy  0.197672    0.996791 201
## 5                max precision  0.999930    1.000000   0
## 6                   max recall  0.023784    1.000000 219
## 7              max specificity  0.999930    1.000000   0
## 8             max absolute_mcc  0.197672    0.989912 201
## 9   max min_per_class_accuracy  0.997764    0.994663 193
## 10 max mean_per_class_accuracy  0.997764    0.997332 193
## 11                     max tns  0.999930  371.000000   0
## 12                     max fns  0.999930 1498.000000   0
## 13                     max fps  0.000607  371.000000 399
## 14                     max tps  0.023784 1499.000000 219
## 15                     max tnr  0.999930    1.000000   0
## 16                     max fnr  0.999930    0.999333   0
## 17                     max fpr  0.000607    1.000000 399
## 18                     max tpr  0.023784    1.000000 219
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 80.16 %, avg score: 79.95 %
##    group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1      1               0.01016043        0.999883 1.247498        1.247498
## 2      2               0.02032086        0.999856 1.247498        1.247498
## 3      3               0.03048128        0.999841 1.247498        1.247498
## 4      4               0.04010695        0.999831 1.247498        1.247498
## 5      5               0.05026738        0.999823 1.247498        1.247498
## 6      6               0.10000000        0.999793 1.247498        1.247498
## 7      7               0.15026738        0.999772 1.247498        1.247498
## 8      8               0.20000000        0.999757 1.247498        1.247498
## 9      9               0.30000000        0.999727 1.247498        1.247498
## 10    10               0.40000000        0.999699 1.247498        1.247498
## 11    11               0.50000000        0.999670 1.247498        1.247498
## 12    12               0.60000000        0.999641 1.247498        1.247498
## 13    13               0.70000000        0.999582 1.247498        1.247498
## 14    14               0.80000000        0.247503 1.227485        1.244997
## 15    15               0.90053476        0.001238 0.039814        1.110451
## 16    16               1.00000000        0.000607 0.000000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.999908                 1.000000         0.999908
## 2       1.000000 0.999868                 1.000000         0.999888
## 3       1.000000 0.999847                 1.000000         0.999874
## 4       1.000000 0.999835                 1.000000         0.999865
## 5       1.000000 0.999826                 1.000000         0.999857
## 6       1.000000 0.999808                 1.000000         0.999833
## 7       1.000000 0.999783                 1.000000         0.999816
## 8       1.000000 0.999764                 1.000000         0.999803
## 9       1.000000 0.999742                 1.000000         0.999783
## 10      1.000000 0.999714                 1.000000         0.999766
## 11      1.000000 0.999684                 1.000000         0.999749
## 12      1.000000 0.999656                 1.000000         0.999734
## 13      1.000000 0.999616                 1.000000         0.999717
## 14      0.983957 0.982650                 0.997995         0.997584
## 15      0.031915 0.013395                 0.890143         0.887710
## 16      0.000000 0.001018                 0.801604         0.799515
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      0.012675                0.012675   24.749833       24.749833
## 2      0.012675                0.025350   24.749833       24.749833
## 3      0.012675                0.038025   24.749833       24.749833
## 4      0.012008                0.050033   24.749833       24.749833
## 5      0.012675                0.062708   24.749833       24.749833
## 6      0.062041                0.124750   24.749833       24.749833
## 7      0.062708                0.187458   24.749833       24.749833
## 8      0.062041                0.249500   24.749833       24.749833
## 9      0.124750                0.374249   24.749833       24.749833
## 10     0.124750                0.498999   24.749833       24.749833
## 11     0.124750                0.623749   24.749833       24.749833
## 12     0.124750                0.748499   24.749833       24.749833
## 13     0.124750                0.873249   24.749833       24.749833
## 14     0.122748                0.995997   22.748499       24.499666
## 15     0.004003                1.000000  -96.018622       11.045131
## 16     0.000000                1.000000 -100.000000        0.000000
##    kolmogorov_smirnov
## 1            0.012675
## 2            0.025350
## 3            0.038025
## 4            0.050033
## 5            0.062708
## 6            0.124750
## 7            0.187458
## 8            0.249500
## 9            0.374249
## 10           0.498999
## 11           0.623749
## 12           0.748499
## 13           0.873249
## 14           0.987911
## 15           0.501348
## 16           0.000000

# Area under the ROC curve on the test frame.
performance_h2o %>% h2o.auc()

## [1] 0.9998993

# Confusion matrix on the test frame.
performance_h2o %>% h2o.confusionMatrix()

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.197672408100051:
##               dismissed not dismissed    Error     Rate
## dismissed           368             3 0.008086   =3/371
## not dismissed         3          1496 0.002001  =3/1499
## Totals              371          1499 0.003209  =6/1870

# Table of binomial metrics as a function of the classification threshold.
performance_h2o %>% h2o.metric()

## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.999930 0.001333 0.000834 0.003327 0.198930  1.000000 0.000667    1.000000
## 2  0.999925 0.003995 0.002500 0.009927 0.200000  1.000000 0.002001    1.000000
## 3  0.999922 0.005323 0.003333 0.013201 0.200535  1.000000 0.002668    1.000000
## 4  0.999916 0.007973 0.004998 0.019698 0.201604  1.000000 0.004003    1.000000
## 5  0.999912 0.011936 0.007494 0.029316 0.203209  1.000000 0.006004    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns  fns fps tps
## 1     0.011508               0.000667                0.500334 371 1498   0   1
## 2     0.019942               0.002001                0.501001 371 1496   0   3
## 3     0.023034               0.002668                0.501334 371 1495   0   4
## 4     0.028225               0.004003                0.502001 371 1493   0   6
## 5     0.034597               0.006004                0.503002 371 1490   0   9
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.999333 0.000000 0.000667   0
## 2 1.000000 0.997999 0.000000 0.002001   1
## 3 1.000000 0.997332 0.000000 0.002668   2
## 4 1.000000 0.995997 0.000000 0.004003   3
## 5 1.000000 0.993996 0.000000 0.006004   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.000719 0.891466 0.953562 0.836963 0.804813  0.804185 1.000000
## 396  0.000713 0.891201 0.953441 0.836589 0.804278  0.803753 1.000000
## 397  0.000699 0.890936 0.953320 0.836216 0.803743  0.803323 1.000000
## 398  0.000693 0.890671 0.953199 0.835843 0.803209  0.802892 1.000000
## 399  0.000686 0.890143 0.952956 0.835097 0.802139  0.802033 1.000000
## 400  0.000607 0.889878 0.952835 0.834725 0.801604  0.801604 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.016173     0.114042               0.016173                0.508086   6
## 396    0.013477     0.104078               0.013477                0.506739   5
## 397    0.010782     0.093065               0.010782                0.505391   4
## 398    0.008086     0.080575               0.008086                0.504043   3
## 399    0.002695     0.046495               0.002695                0.501348   1
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns fps  tps      tnr      fnr      fpr      tpr idx
## 395   0 365 1499 0.016173 0.000000 0.983827 1.000000 394
## 396   0 366 1499 0.013477 0.000000 0.986523 1.000000 395
## 397   0 367 1499 0.010782 0.000000 0.989218 1.000000 396
## 398   0 368 1499 0.008086 0.000000 0.991914 1.000000 397
## 399   0 370 1499 0.002695 0.000000 0.997305 1.000000 398
## 400   0 371 1499 0.000000 0.000000 1.000000 1.000000 399

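With a test-set AUC of 0.9999, this model's predictions performed much better than those of the earlier xgboost model. A score this close to 1 is suspicious, though: `departure_code` is still among the predictors and appears to encode the outcome (in the prediction rows printed above, code 3 is predicted as dismissed and code 5 as not dismissed), so check for target leakage before relying on this result.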

Code Along 11

Erin McEvoy

2024-11-19