Goal is to automate building and tuning a classification model to predict employee attrition, using the h2o::h2o.automl.
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
skimr::skim(departures)
Name | departures |
Number of rows | 9423 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 10 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
# Select relevant columns
factors_vec <- departures %>%
select(departure_code, co_per_rol, fyear, tenure_no_ceodb, max_tenure_ceodb, fyear_gone) %>%
names()
library(lubridate)
data_clean <- departures %>%
select(-c(interim_coceo, still_there, eight_ks, gvkey, co_per_rol, cik, fyear, '_merge', notes, sources, leftofc, exec_fullname, coname)) %>%
filter(fyear_gone != "2997") %>%
filter(!is.na(ceo_dismissal)) %>%
# Recode ceo_dismissal first
mutate(ceo_dismissal = if_else(ceo_dismissal == "1", "dismissed",
if_else(ceo_dismissal == "0", "not dismissed",
as.character(ceo_dismissal)))) %>%
# Convert ceo_dismissal to factor
mutate(ceo_dismissal = as.factor(ceo_dismissal)) # Handle NA implicitly
#data_clean <- data_clean %>% sample_n(100)
library(h2o)
library(tidymodels)
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.3.3
## Warning: package 'xts' was built under R version 4.3.3
data <- read_csv("../11_module13/Data/data_clean.csv") %>%
# h2o requires all variables to be either numeric or factors
mutate(across(where(is.character), factor))
set.seed(1234)
data_split <- initial_split(data_clean, strata = "ceo_dismissal")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
recipe_obj <- recipe(ceo_dismissal ~ ., data = train_tbl) %>%
# Remove zero variance variables
step_zv(all_predictors())
# Initialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 days 1 hours
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months
## H2O cluster name: H2O_started_from_R_erinmcevoy_fhp551
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.99 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
## | | | 0% | |======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
y <- "ceo_dismissal"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
# max_runtime_secs = 30,
max_models = 10,
exclude_algos = "DeepLearning",
nfolds = 5,
seed = 3456
)
## | | | 0%
## 14:48:22.83: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models. | |==== | 5% | |===== | 7% | |===== | 8% | |======= | 10% | |========= | 13% | |========== | 14% | |=========== | 15% | |============= | 18% | |================ | 23% | |====================== | 31% | |======================= | 33% | |======================================================================| 100%
Examine the output
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 GBM_4_AutoML_27_20241121_144822 0.9998993 0.01116995
## 2 GBM_1_AutoML_27_20241121_144822 0.9998975 0.01279744
## 3 GBM_3_AutoML_27_20241121_144822 0.9998867 0.01161717
## 4 XGBoost_2_AutoML_27_20241121_144822 0.9998849 0.01396485
## 5 StackedEnsemble_BestOfFamily_1_AutoML_27_20241121_144822 0.9998813 0.01119842
## 6 StackedEnsemble_AllModels_1_AutoML_27_20241121_144822 0.9998795 0.01104288
## aucpr mean_per_class_error rmse mse
## 1 0.9999752 0.005043794 0.05790869 0.003353416
## 2 0.9999748 0.008753365 0.05620919 0.003159472
## 3 0.9999721 0.005710905 0.05969899 0.003563970
## 4 0.9999718 0.004363196 0.05545154 0.003074873
## 5 0.9999708 0.005710905 0.05734945 0.003288960
## 6 0.9999704 0.005710905 0.05743133 0.003298358
##
## [12 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_4_AutoML_27_20241121_144822
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 79 79 117991 10
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 10 10.00000 74 178 114.11392
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.0006192444
## RMSE: 0.02488462
## LogLoss: 0.003074578
## Mean Per-Class Error: 0.0005268704
## AUC: 0.9999997
## AUCPR: 0.9999999
## Gini: 0.9999994
## R^2: 0.9961218
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not dismissed Error Rate
## dismissed 948 1 0.001054 =1/949
## not dismissed 0 3809 0.000000 =0/3809
## Totals 948 3810 0.000210 =1/4758
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.426762 0.999869 159
## 2 max f2 0.426762 0.999947 159
## 3 max f0point5 0.504015 0.999947 157
## 4 max accuracy 0.504015 0.999790 157
## 5 max precision 0.999944 1.000000 0
## 6 max recall 0.426762 1.000000 159
## 7 max specificity 0.999944 1.000000 0
## 8 max absolute_mcc 0.504015 0.999342 157
## 9 max min_per_class_accuracy 0.504015 0.999737 157
## 10 max mean_per_class_accuracy 0.504015 0.999869 157
## 11 max tns 0.999944 949.000000 0
## 12 max fns 0.999944 3808.000000 0
## 13 max fps 0.000562 949.000000 399
## 14 max tps 0.426762 3809.000000 159
## 15 max tnr 0.999944 1.000000 0
## 16 max fnr 0.999944 0.999737 0
## 17 max fpr 0.000562 1.000000 399
## 18 max tpr 0.426762 1.000000 159
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.001763995
## RMSE: 0.04199994
## LogLoss: 0.008976636
## Mean Per-Class Error: 0.000729927
## AUC: 0.9998754
## AUCPR: 0.9999707
## Gini: 0.9997508
## R^2: 0.9886818
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not dismissed Error Rate
## dismissed 164 0 0.000000 =0/164
## not dismissed 1 684 0.001460 =1/685
## Totals 165 684 0.001178 =1/849
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.486212 0.999270 256
## 2 max f2 0.486212 0.998832 256
## 3 max f0point5 0.486212 0.999708 256
## 4 max accuracy 0.486212 0.998822 256
## 5 max precision 0.999930 1.000000 0
## 6 max recall 0.006478 1.000000 271
## 7 max specificity 0.999930 1.000000 0
## 8 max absolute_mcc 0.486212 0.996237 256
## 9 max min_per_class_accuracy 0.486212 0.998540 256
## 10 max mean_per_class_accuracy 0.486212 0.999270 256
## 11 max tns 0.999930 164.000000 0
## 12 max fns 0.999930 684.000000 0
## 13 max fps 0.000732 164.000000 399
## 14 max tps 0.006478 685.000000 271
## 15 max tnr 0.999930 1.000000 0
## 16 max fnr 0.999930 0.998540 0
## 17 max fpr 0.000732 1.000000 399
## 18 max tpr 0.006478 1.000000 271
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.003297201
## RMSE: 0.05742126
## LogLoss: 0.01214609
## Mean Per-Class Error: 0.001969021
## AUC: 0.9998303
## AUCPR: 0.9999583
## Gini: 0.9996606
## R^2: 0.9793501
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not dismissed Error Rate
## dismissed 949 0 0.000000 =0/949
## not dismissed 15 3794 0.003938 =15/3809
## Totals 964 3794 0.003153 =15/4758
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.983067 0.998027 129
## 2 max f2 0.121216 0.996905 163
## 3 max f0point5 0.983067 0.999210 129
## 4 max accuracy 0.983067 0.996847 129
## 5 max precision 0.999987 1.000000 0
## 6 max recall 0.008510 1.000000 229
## 7 max specificity 0.999987 1.000000 0
## 8 max absolute_mcc 0.983067 0.990234 129
## 9 max min_per_class_accuracy 0.983067 0.996062 129
## 10 max mean_per_class_accuracy 0.983067 0.998031 129
## 11 max tns 0.999987 949.000000 0
## 12 max fns 0.999987 3808.000000 0
## 13 max fps 0.000243 949.000000 399
## 14 max tps 0.008510 3809.000000 229
## 15 max tnr 0.999987 1.000000 0
## 16 max fnr 0.999987 0.999737 0
## 17 max fpr 0.000243 1.000000 399
## 18 max tpr 0.008510 1.000000 229
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.996848 0.001661 0.995798 0.997899 0.994748
## auc 0.999811 0.000130 0.999738 0.999924 0.999634
## err 0.003152 0.001661 0.004202 0.002101 0.005252
## err_count 3.000000 1.581139 4.000000 2.000000 5.000000
## f0point5 0.999209 0.000418 0.998946 0.999474 0.998681
## f1 0.998026 0.001042 0.997368 0.998686 0.996708
## f2 0.996847 0.001663 0.995796 0.997899 0.994744
## lift_top_group 1.249147 0.000639 1.249344 1.249344 1.249344
## logloss 0.012713 0.005792 0.015947 0.009249 0.020077
## max_per_class_error 0.003938 0.002075 0.005249 0.002625 0.006562
## mcc 0.990262 0.005084 0.987036 0.993472 0.983852
## mean_per_class_accuracy 0.998031 0.001037 0.997375 0.998688 0.996719
## mean_per_class_error 0.001969 0.001037 0.002625 0.001312 0.003281
## mse 0.003440 0.001477 0.004297 0.002971 0.005236
## pr_auc 0.999953 0.000032 0.999935 0.999981 0.999910
## precision 1.000000 0.000000 1.000000 1.000000 1.000000
## r2 0.978462 0.009239 0.973103 0.981405 0.967223
## recall 0.996062 0.002075 0.994751 0.997375 0.993438
## rmse 0.057361 0.013697 0.065549 0.054503 0.072361
## specificity 1.000000 0.000000 1.000000 1.000000 1.000000
## cv_4_valid cv_5_valid
## accuracy 0.996845 0.998948
## auc 0.999813 0.999944
## err 0.003155 0.001052
## err_count 3.000000 1.000000
## f0point5 0.999209 0.999737
## f1 0.998025 0.999343
## f2 0.996844 0.998950
## lift_top_group 1.249672 1.248031
## logloss 0.013149 0.005143
## max_per_class_error 0.003942 0.001312
## mcc 0.990240 0.996710
## mean_per_class_accuracy 0.998029 0.999344
## mean_per_class_error 0.001971 0.000656
## mse 0.003391 0.001307
## pr_auc 0.999954 0.999986
## precision 1.000000 1.000000
## r2 0.978790 0.991789
## recall 0.996058 0.998688
## rmse 0.058231 0.036159
## specificity 1.000000 1.000000
?h2o.getModel
?h2o.saveModel
?h2o.loadModel
h2o.getModel("GBM_4_AutoML_17_20241121_142222") %>%
h2o.saveModel("h2o__model/")
## [1] "/Users/erinmcevoy/Desktop/PSU_DAT3100/00_data/h2o__model/GBM_4_AutoML_17_20241121_142222"
best_model <- models_h2o@leader
# Make predictions using the best model
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 1,870 × 9
## predict dismissed not.dismissed dismissal_dataset_id departure_code
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 not dismissed 0.000340 1.00 12 5
## 2 dismissed 0.999 0.000765 13 3
## 3 not dismissed 0.000465 1.00 65 5
## 4 dismissed 0.999 0.000713 78 3
## 5 not dismissed 0.000423 1.00 80 5
## 6 not dismissed 0.000487 1.00 81 5
## 7 dismissed 0.999 0.00135 88 3
## 8 not dismissed 0.000243 1.00 99 5
## 9 not dismissed 0.000262 1.00 117 5
## 10 not dismissed 0.000301 1.00 121 5
## # ℹ 1,860 more rows
## # ℹ 4 more variables: ceo_dismissal <fct>, tenure_no_ceodb <dbl>,
## # max_tenure_ceodb <dbl>, fyear_gone <dbl>
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_4_AutoML_27_20241121_144822"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_4_AutoML_27_20241121_144822"
##
##
## $model_checksum
## [1] "2991273419511954476"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_a911_3"
##
##
## $frame_checksum
## [1] "9047969800299088666"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.732219e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.003353416
##
## $RMSE
## [1] 0.05790869
##
## $nobs
## [1] 1870
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.978914
##
## $logloss
## [1] 0.01116995
##
## $AUC
## [1] 0.9998993
##
## $pr_auc
## [1] 0.9999752
##
## $Gini
## [1] 0.9997986
##
## $mean_per_class_error
## [1] 0.005043794
##
## $domain
## [1] "dismissed" "not dismissed"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## dismissed not dismissed Error Rate
## dismissed 368 3 0.0081 = 3 / 371
## not dismissed 3 1496 0.0020 = 3 / 1,499
## Totals 371 1499 0.0032 = 6 / 1,870
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.999930 0.001333 0.000834 0.003327 0.198930 1.000000 0.000667 1.000000
## 2 0.999925 0.003995 0.002500 0.009927 0.200000 1.000000 0.002001 1.000000
## 3 0.999922 0.005323 0.003333 0.013201 0.200535 1.000000 0.002668 1.000000
## 4 0.999916 0.007973 0.004998 0.019698 0.201604 1.000000 0.004003 1.000000
## 5 0.999912 0.011936 0.007494 0.029316 0.203209 1.000000 0.006004 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.011508 0.000667 0.500334 371 1498 0 1
## 2 0.019942 0.002001 0.501001 371 1496 0 3
## 3 0.023034 0.002668 0.501334 371 1495 0 4
## 4 0.028225 0.004003 0.502001 371 1493 0 6
## 5 0.034597 0.006004 0.503002 371 1490 0 9
## tnr fnr fpr tpr idx
## 1 1.000000 0.999333 0.000000 0.000667 0
## 2 1.000000 0.997999 0.000000 0.002001 1
## 3 1.000000 0.997332 0.000000 0.002668 2
## 4 1.000000 0.995997 0.000000 0.004003 3
## 5 1.000000 0.993996 0.000000 0.006004 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.000719 0.891466 0.953562 0.836963 0.804813 0.804185 1.000000
## 396 0.000713 0.891201 0.953441 0.836589 0.804278 0.803753 1.000000
## 397 0.000699 0.890936 0.953320 0.836216 0.803743 0.803323 1.000000
## 398 0.000693 0.890671 0.953199 0.835843 0.803209 0.802892 1.000000
## 399 0.000686 0.890143 0.952956 0.835097 0.802139 0.802033 1.000000
## 400 0.000607 0.889878 0.952835 0.834725 0.801604 0.801604 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.016173 0.114042 0.016173 0.508086 6
## 396 0.013477 0.104078 0.013477 0.506739 5
## 397 0.010782 0.093065 0.010782 0.505391 4
## 398 0.008086 0.080575 0.008086 0.504043 3
## 399 0.002695 0.046495 0.002695 0.501348 1
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 365 1499 0.016173 0.000000 0.983827 1.000000 394
## 396 0 366 1499 0.013477 0.000000 0.986523 1.000000 395
## 397 0 367 1499 0.010782 0.000000 0.989218 1.000000 396
## 398 0 368 1499 0.008086 0.000000 0.991914 1.000000 397
## 399 0 370 1499 0.002695 0.000000 0.997305 1.000000 398
## 400 0 371 1499 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.197672 0.997999 201
## 2 max f2 0.197672 0.997999 201
## 3 max f0point5 0.997764 0.998928 193
## 4 max accuracy 0.197672 0.996791 201
## 5 max precision 0.999930 1.000000 0
## 6 max recall 0.023784 1.000000 219
## 7 max specificity 0.999930 1.000000 0
## 8 max absolute_mcc 0.197672 0.989912 201
## 9 max min_per_class_accuracy 0.997764 0.994663 193
## 10 max mean_per_class_accuracy 0.997764 0.997332 193
## 11 max tns 0.999930 371.000000 0
## 12 max fns 0.999930 1498.000000 0
## 13 max fps 0.000607 371.000000 399
## 14 max tps 0.023784 1499.000000 219
## 15 max tnr 0.999930 1.000000 0
## 16 max fnr 0.999930 0.999333 0
## 17 max fpr 0.000607 1.000000 399
## 18 max tpr 0.023784 1.000000 219
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 80.16 %, avg score: 79.95 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01016043 0.999883 1.247498 1.247498
## 2 2 0.02032086 0.999856 1.247498 1.247498
## 3 3 0.03048128 0.999841 1.247498 1.247498
## 4 4 0.04010695 0.999831 1.247498 1.247498
## 5 5 0.05026738 0.999823 1.247498 1.247498
## 6 6 0.10000000 0.999793 1.247498 1.247498
## 7 7 0.15026738 0.999772 1.247498 1.247498
## 8 8 0.20000000 0.999757 1.247498 1.247498
## 9 9 0.30000000 0.999727 1.247498 1.247498
## 10 10 0.40000000 0.999699 1.247498 1.247498
## 11 11 0.50000000 0.999670 1.247498 1.247498
## 12 12 0.60000000 0.999641 1.247498 1.247498
## 13 13 0.70000000 0.999582 1.247498 1.247498
## 14 14 0.80000000 0.247503 1.227485 1.244997
## 15 15 0.90053476 0.001238 0.039814 1.110451
## 16 16 1.00000000 0.000607 0.000000 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.999908 1.000000 0.999908
## 2 1.000000 0.999868 1.000000 0.999888
## 3 1.000000 0.999847 1.000000 0.999874
## 4 1.000000 0.999835 1.000000 0.999865
## 5 1.000000 0.999826 1.000000 0.999857
## 6 1.000000 0.999808 1.000000 0.999833
## 7 1.000000 0.999783 1.000000 0.999816
## 8 1.000000 0.999764 1.000000 0.999803
## 9 1.000000 0.999742 1.000000 0.999783
## 10 1.000000 0.999714 1.000000 0.999766
## 11 1.000000 0.999684 1.000000 0.999749
## 12 1.000000 0.999656 1.000000 0.999734
## 13 1.000000 0.999616 1.000000 0.999717
## 14 0.983957 0.982650 0.997995 0.997584
## 15 0.031915 0.013395 0.890143 0.887710
## 16 0.000000 0.001018 0.801604 0.799515
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.012675 0.012675 24.749833 24.749833
## 2 0.012675 0.025350 24.749833 24.749833
## 3 0.012675 0.038025 24.749833 24.749833
## 4 0.012008 0.050033 24.749833 24.749833
## 5 0.012675 0.062708 24.749833 24.749833
## 6 0.062041 0.124750 24.749833 24.749833
## 7 0.062708 0.187458 24.749833 24.749833
## 8 0.062041 0.249500 24.749833 24.749833
## 9 0.124750 0.374249 24.749833 24.749833
## 10 0.124750 0.498999 24.749833 24.749833
## 11 0.124750 0.623749 24.749833 24.749833
## 12 0.124750 0.748499 24.749833 24.749833
## 13 0.124750 0.873249 24.749833 24.749833
## 14 0.122748 0.995997 22.748499 24.499666
## 15 0.004003 1.000000 -96.018622 11.045131
## 16 0.000000 1.000000 -100.000000 0.000000
## kolmogorov_smirnov
## 1 0.012675
## 2 0.025350
## 3 0.038025
## 4 0.050033
## 5 0.062708
## 6 0.124750
## 7 0.187458
## 8 0.249500
## 9 0.374249
## 10 0.498999
## 11 0.623749
## 12 0.748499
## 13 0.873249
## 14 0.987911
## 15 0.501348
## 16 0.000000
h2o.auc(performance_h2o)
## [1] 0.9998993
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.197672408100051:
## dismissed not dismissed Error Rate
## dismissed 368 3 0.008086 =3/371
## not dismissed 3 1496 0.002001 =3/1499
## Totals 371 1499 0.003209 =6/1870
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.999930 0.001333 0.000834 0.003327 0.198930 1.000000 0.000667 1.000000
## 2 0.999925 0.003995 0.002500 0.009927 0.200000 1.000000 0.002001 1.000000
## 3 0.999922 0.005323 0.003333 0.013201 0.200535 1.000000 0.002668 1.000000
## 4 0.999916 0.007973 0.004998 0.019698 0.201604 1.000000 0.004003 1.000000
## 5 0.999912 0.011936 0.007494 0.029316 0.203209 1.000000 0.006004 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.011508 0.000667 0.500334 371 1498 0 1
## 2 0.019942 0.002001 0.501001 371 1496 0 3
## 3 0.023034 0.002668 0.501334 371 1495 0 4
## 4 0.028225 0.004003 0.502001 371 1493 0 6
## 5 0.034597 0.006004 0.503002 371 1490 0 9
## tnr fnr fpr tpr idx
## 1 1.000000 0.999333 0.000000 0.000667 0
## 2 1.000000 0.997999 0.000000 0.002001 1
## 3 1.000000 0.997332 0.000000 0.002668 2
## 4 1.000000 0.995997 0.000000 0.004003 3
## 5 1.000000 0.993996 0.000000 0.006004 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.000719 0.891466 0.953562 0.836963 0.804813 0.804185 1.000000
## 396 0.000713 0.891201 0.953441 0.836589 0.804278 0.803753 1.000000
## 397 0.000699 0.890936 0.953320 0.836216 0.803743 0.803323 1.000000
## 398 0.000693 0.890671 0.953199 0.835843 0.803209 0.802892 1.000000
## 399 0.000686 0.890143 0.952956 0.835097 0.802139 0.802033 1.000000
## 400 0.000607 0.889878 0.952835 0.834725 0.801604 0.801604 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.016173 0.114042 0.016173 0.508086 6
## 396 0.013477 0.104078 0.013477 0.506739 5
## 397 0.010782 0.093065 0.010782 0.505391 4
## 398 0.008086 0.080575 0.008086 0.504043 3
## 399 0.002695 0.046495 0.002695 0.501348 1
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 365 1499 0.016173 0.000000 0.983827 1.000000 394
## 396 0 366 1499 0.013477 0.000000 0.986523 1.000000 395
## 397 0 367 1499 0.010782 0.000000 0.989218 1.000000 396
## 398 0 368 1499 0.008086 0.000000 0.991914 1.000000 397
## 399 0 370 1499 0.002695 0.000000 0.997305 1.000000 398
## 400 0 371 1499 0.000000 0.000000 1.000000 1.000000 399
This model’s predictions performed much better than the xgboost model with an AUC of 0.99