Goal is to automate building and tuning a classification model to predict employee attrition, using the h2o::h2o.automl.
Import the cleaned data from Module 7.
library(h2o)
## Warning: package 'h2o' was built under R version 4.4.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day() masks h2o::day()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ infer 1.0.7 ✔ workflows 1.2.0
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.1 ✔ yardstick 1.3.2
## ✔ recipes 1.2.1
## Warning: package 'broom' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.4.3
## Warning: package 'zoo' was built under R version 4.4.3
## Warning: package 'quantmod' was built under R version 4.4.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.4.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8 ✔ TTR 0.24.4
## ✔ quantmod 0.4.27 ✔ xts 0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ scales::col_factor() masks readr::col_factor()
## ✖ lubridate::day() masks h2o::day()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ TTR::momentum() masks dials::momentum()
## ✖ lubridate::month() masks h2o::month()
## ✖ yardstick::spec() masks readr::spec()
## ✖ quantmod::summary() masks h2o::summary(), base::summary()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data_unclean <- read.csv("C:/Users/tch30/Desktop/PSU_DAT3100/00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
factors_vec <- data_unclean %>% select(Education,EnvironmentSatisfaction, JobInvolvement, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance) %>% names()
data_clean <- data_unclean %>%
# Address factors imported as numeric
mutate(across(factors_vec, as.factor)) %>%
#Drop zero-variance variables
select(-c(Over18,EmployeeCount,StandardHours))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(factors_vec, as.factor)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(factors_vec)
##
## # Now:
## data %>% select(all_of(factors_vec))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
data <- data_clean %>%
# h2o requires all variables to be either numeric or factors
mutate(across(where(is.character), factor))
set.seed(1234)
data_split <- initial_split(data, strata = "Attrition")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
recipe_obj <- recipe(Attrition ~ ., data = train_tbl) %>%
# Remove zero variance variables
step_zv(all_predictors())
# Intiialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 48 minutes 16 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 16 days
## H2O cluster name: H2O_started_from_R_tch30_gzf755
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.72 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 16 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
## | | | 0% | |======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
y <- "Attrition"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
# max_runtime_secs = 30,
max_models = 10,
exclude_algos = "DeepLearning",
nfolds = 5,
seed = 3456
)
## | | | 0% | |====== | 8%
## 01:26:01.327: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 01:26:01.334: AutoML: XGBoost is not available; skipping it. | |================== | 25% | |=============================================== | 67% | |======================================================================| 100%
Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 StackedEnsemble_AllModels_1_AutoML_5_20250507_12601 0.8366775 0.3161036
## 2 StackedEnsemble_BestOfFamily_1_AutoML_5_20250507_12601 0.8359763 0.3286589
## 3 GLM_1_AutoML_5_20250507_12601 0.8357066 0.3236488
## 4 GBM_grid_1_AutoML_5_20250507_12601_model_1 0.8341963 0.3384674
## 5 GBM_1_AutoML_5_20250507_12601 0.8331176 0.3333609
## 6 GBM_4_AutoML_5_20250507_12601 0.8181230 0.3421082
## aucpr mean_per_class_error rmse mse
## 1 0.6146322 0.2409385 0.3037509 0.09226460
## 2 0.6195593 0.2476537 0.3068584 0.09416207
## 3 0.6124677 0.2177184 0.3104302 0.09636689
## 4 0.5736951 0.2541262 0.3191542 0.10185939
## 5 0.6024018 0.2471683 0.3155600 0.09957810
## 6 0.5506653 0.2638350 0.3192782 0.10193859
##
## [12 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_AllModels_1_AutoML_5_20250507_12601
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 7/10
## 3 # GBM base models (used / total) 5/7
## 4 # GLM base models (used / total) 1/1
## 5 # DRF base models (used / total) 1/2
## 6 Metalearner algorithm GLM
## 7 Metalearner fold assignment scheme Random
## 8 Metalearner nfolds 5
## 9 Metalearner fold_column NA
## 10 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.03227298
## RMSE: 0.1796468
## LogLoss: 0.1424294
## Mean Per-Class Error: 0.04983926
## AUC: 0.9947758
## AUCPR: 0.9807961
## Gini: 0.9895516
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 783 5 0.006345 =5/788
## Yes 14 136 0.093333 =14/150
## Totals 797 141 0.020256 =19/938
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.358823 0.934708 117
## 2 max f2 0.282655 0.934211 136
## 3 max f0point5 0.474131 0.968468 105
## 4 max accuracy 0.358823 0.979744 117
## 5 max precision 0.971993 1.000000 0
## 6 max recall 0.118179 1.000000 230
## 7 max specificity 0.971993 1.000000 0
## 8 max absolute_mcc 0.358823 0.923353 117
## 9 max min_per_class_accuracy 0.253004 0.960000 148
## 10 max mean_per_class_accuracy 0.237591 0.963824 155
## 11 max tns 0.971993 788.000000 0
## 12 max fns 0.971993 149.000000 0
## 13 max fps 0.001471 788.000000 399
## 14 max tps 0.118179 150.000000 230
## 15 max tnr 0.971993 1.000000 0
## 16 max fnr 0.971993 0.993333 0
## 17 max fpr 0.001471 1.000000 399
## 18 max tpr 0.118179 1.000000 230
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 0.09446345
## RMSE: 0.3073491
## LogLoss: 0.3083184
## Mean Per-Class Error: 0.1846405
## AUC: 0.8714597
## AUCPR: 0.620885
## Gini: 0.7429194
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 116 20 0.147059 =20/136
## Yes 6 21 0.222222 =6/27
## Totals 122 41 0.159509 =26/163
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.217259 0.617647 40
## 2 max f2 0.217259 0.704698 40
## 3 max f0point5 0.465368 0.656566 17
## 4 max accuracy 0.465368 0.883436 17
## 5 max precision 0.951530 1.000000 0
## 6 max recall 0.069156 1.000000 101
## 7 max specificity 0.951530 1.000000 0
## 8 max absolute_mcc 0.217259 0.540400 40
## 9 max min_per_class_accuracy 0.217259 0.777778 40
## 10 max mean_per_class_accuracy 0.217259 0.815359 40
## 11 max tns 0.951530 136.000000 0
## 12 max fns 0.951530 26.000000 0
## 13 max fps 0.004288 136.000000 162
## 14 max tps 0.069156 27.000000 101
## 15 max tnr 0.951530 1.000000 0
## 16 max fnr 0.951530 0.962963 0
## 17 max fpr 0.004288 1.000000 162
## 18 max tpr 0.069156 1.000000 101
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.09463944
## RMSE: 0.3076352
## LogLoss: 0.3256015
## Mean Per-Class Error: 0.2302792
## AUC: 0.829742
## AUCPR: 0.6055174
## Gini: 0.6594839
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 693 95 0.120558 =95/788
## Yes 51 99 0.340000 =51/150
## Totals 744 194 0.155650 =146/938
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.248514 0.575581 145
## 2 max f2 0.188210 0.648148 185
## 3 max f0point5 0.463066 0.627706 66
## 4 max accuracy 0.463066 0.880597 66
## 5 max precision 0.984998 1.000000 0
## 6 max recall 0.003360 1.000000 396
## 7 max specificity 0.984998 1.000000 0
## 8 max absolute_mcc 0.270759 0.488856 130
## 9 max min_per_class_accuracy 0.163428 0.760000 204
## 10 max mean_per_class_accuracy 0.229832 0.779569 157
## 11 max tns 0.984998 788.000000 0
## 12 max fns 0.984998 149.000000 0
## 13 max fps 0.001226 788.000000 399
## 14 max tps 0.003360 150.000000 396
## 15 max tnr 0.984998 1.000000 0
## 16 max fnr 0.984998 0.993333 0
## 17 max fpr 0.001226 1.000000 399
## 18 max tpr 0.003360 1.000000 396
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.873603 0.032658 0.836538 0.870466 0.855556 0.923469
## auc 0.836685 0.060797 0.785048 0.798571 0.797462 0.921303
## err 0.126397 0.032658 0.163462 0.129534 0.144444 0.076531
## err_count 23.800000 7.259477 34.000000 25.000000 26.000000 15.000000
## f0point5 0.612589 0.122104 0.544041 0.503876 0.552326 0.806452
## cv_5_valid
## accuracy 0.881988
## auc 0.881042
## err 0.118012
## err_count 19.000000
## f0point5 0.656250
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.614434 0.151616 0.538462 0.500000 0.527778
## r2 0.296458 0.124548 0.214023 0.210021 0.200310
## recall 0.628228 0.090507 0.567568 0.520000 0.678571
## residual_deviance 120.763610 24.327105 158.939900 118.035934 126.884210
## rmse 0.305441 0.027073 0.339032 0.298453 0.324107
## specificity 0.919399 0.037315 0.894737 0.922619 0.888158
## cv_4_valid cv_5_valid
## precision 0.869565 0.636364
## r2 0.470321 0.387615
## recall 0.625000 0.750000
## residual_deviance 99.307655 100.650350
## rmse 0.268997 0.296614
## specificity 0.981707 0.909774
# Ensure the model is retrieved correctly
model <- h2o.getModel("GBM_grid_1_AutoML_3_20250507_05322_model_15")
# Define the directory where you want to save the model
save_dir <- "h2o models/"
# Save the model, overwriting any existing file
saved_model_path <- h2o.saveModel(model, path = save_dir, force = TRUE)
# Print the path where the model is saved
print(saved_model_path)
## [1] "C:\\Users\\tch30\\Desktop\\PSU_DAT3100\\h2o models\\GBM_grid_1_AutoML_3_20250507_05322_model_15"
best_model <- h2o.loadModel("h2o models/GBM_grid_1_AutoML_3_20250507_05322_model_15")
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 369 × 35
## predict No Yes Age Attrition BusinessTravel DailyRate Department
## <fct> <dbl> <dbl> <int> <fct> <fct> <int> <fct>
## 1 Yes 0.663 0.337 59 No Travel_Rarely 1324 Research & …
## 2 No 0.866 0.134 35 No Travel_Rarely 809 Research & …
## 3 No 0.929 0.0709 34 No Travel_Rarely 1346 Research & …
## 4 Yes 0.602 0.398 22 No Non-Travel 1123 Research & …
## 5 No 0.922 0.0776 53 No Travel_Rarely 1219 Sales
## 6 No 0.910 0.0898 24 No Non-Travel 673 Research & …
## 7 No 0.770 0.230 21 No Travel_Rarely 391 Research & …
## 8 No 0.932 0.0680 34 Yes Travel_Rarely 699 Research & …
## 9 No 0.969 0.0311 53 No Travel_Rarely 1282 Research & …
## 10 Yes 0.567 0.433 32 Yes Travel_Frequently 1125 Research & …
## # ℹ 359 more rows
## # ℹ 27 more variables: DistanceFromHome <int>, Education <fct>,
## # EducationField <fct>, EmployeeNumber <int>, EnvironmentSatisfaction <fct>,
## # Gender <fct>, HourlyRate <int>, JobInvolvement <fct>, JobLevel <int>,
## # JobRole <fct>, JobSatisfaction <int>, MaritalStatus <fct>,
## # MonthlyIncome <int>, MonthlyRate <int>, NumCompaniesWorked <int>,
## # OverTime <fct>, PercentSalaryHike <int>, PerformanceRating <fct>, …
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_grid_1_AutoML_3_20250507_05322_model_15"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_grid_1_AutoML_3_20250507_05322_model_15"
##
##
## $model_checksum
## [1] "5968363586185990720"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_bc66_3"
##
##
## $frame_checksum
## [1] "-54413530742813186"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.746596e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.1002433
##
## $RMSE
## [1] 0.3166122
##
## $nobs
## [1] 369
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.263796
##
## $logloss
## [1] 0.3325214
##
## $AUC
## [1] 0.8420173
##
## $pr_auc
## [1] 0.5940495
##
## $Gini
## [1] 0.6840345
##
## $mean_per_class_error
## [1] 0.2182039
##
## $domain
## [1] "No" "Yes"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## No Yes Error Rate
## No 272 37 0.1197 = 37 / 309
## Yes 19 41 0.3167 = 19 / 60
## Totals 291 78 0.1518 = 56 / 369
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.742148 0.032787 0.020747 0.078125 0.840108 1.000000 0.016667 1.000000
## 2 0.715825 0.064516 0.041322 0.147059 0.842818 1.000000 0.033333 1.000000
## 3 0.684642 0.095238 0.061728 0.208333 0.845528 1.000000 0.050000 1.000000
## 4 0.681600 0.093750 0.061475 0.197368 0.842818 0.750000 0.050000 0.996764
## 5 0.624659 0.092308 0.061224 0.187500 0.840108 0.600000 0.050000 0.993528
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.118299 0.016667 0.508333 309 59 0 1
## 2 0.167527 0.033333 0.516667 309 58 0 2
## 3 0.205458 0.050000 0.525000 309 57 0 3
## 4 0.166643 0.050000 0.523382 308 57 1 3
## 5 0.138926 0.050000 0.521764 307 57 2 3
## tnr fnr fpr tpr idx
## 1 1.000000 0.983333 0.000000 0.016667 0
## 2 1.000000 0.966667 0.000000 0.033333 1
## 3 1.000000 0.950000 0.000000 0.050000 2
## 4 0.996764 0.950000 0.003236 0.050000 3
## 5 0.993528 0.950000 0.006472 0.050000 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 364 0.015081 0.283019 0.496689 0.197889 0.176152 0.164835 1.000000
## 365 0.013518 0.282353 0.495868 0.197368 0.173442 0.164384 1.000000
## 366 0.011618 0.281690 0.495050 0.196850 0.170732 0.163934 1.000000
## 367 0.009694 0.281030 0.494234 0.196335 0.168022 0.163488 1.000000
## 368 0.008651 0.280374 0.493421 0.195822 0.165312 0.163043 1.000000
## 369 0.007026 0.279720 0.492611 0.195313 0.162602 0.162602 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 364 0.016181 0.051645 0.016181 0.508091 5
## 365 0.012945 0.046130 0.012945 0.506472 4
## 366 0.009709 0.039895 0.009709 0.504854 3
## 367 0.006472 0.032530 0.006472 0.503236 2
## 368 0.003236 0.022971 0.003236 0.501618 1
## 369 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 364 0 304 60 0.016181 0.000000 0.983819 1.000000 363
## 365 0 305 60 0.012945 0.000000 0.987055 1.000000 364
## 366 0 306 60 0.009709 0.000000 0.990291 1.000000 365
## 367 0 307 60 0.006472 0.000000 0.993528 1.000000 366
## 368 0 308 60 0.003236 0.000000 0.996764 1.000000 367
## 369 0 309 60 0.000000 0.000000 1.000000 1.000000 368
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.242947 0.594203 77
## 2 max f2 0.200365 0.660920 107
## 3 max f0point5 0.363009 0.683962 37
## 4 max accuracy 0.363009 0.891599 37
## 5 max precision 0.742148 1.000000 0
## 6 max recall 0.021673 1.000000 351
## 7 max specificity 0.742148 1.000000 0
## 8 max absolute_mcc 0.363009 0.551446 37
## 9 max min_per_class_accuracy 0.188435 0.773463 116
## 10 max mean_per_class_accuracy 0.200365 0.783010 107
## 11 max tns 0.742148 309.000000 0
## 12 max fns 0.742148 59.000000 0
## 13 max fps 0.007026 309.000000 368
## 14 max tps 0.021673 60.000000 351
## 15 max tnr 0.742148 1.000000 0
## 16 max fnr 0.742148 0.983333 0
## 17 max fpr 0.007026 1.000000 368
## 18 max tpr 0.021673 1.000000 351
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 16.26 %, avg score: 15.96 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01084011 0.642880 4.612500 4.612500
## 2 2 0.02168022 0.555200 4.612500 4.612500
## 3 3 0.03252033 0.499274 4.612500 4.612500
## 4 4 0.04065041 0.482818 6.150000 4.920000
## 5 5 0.05149051 0.458358 4.612500 4.855263
## 6 6 0.10027100 0.364831 4.441667 4.654054
## 7 7 0.15176152 0.298871 1.294737 3.514286
## 8 8 0.20054201 0.248920 2.050000 3.158108
## 9 9 0.30081301 0.199447 1.329730 2.548649
## 10 10 0.40108401 0.145110 0.498649 2.036149
## 11 11 0.50135501 0.111086 0.498649 1.728649
## 12 12 0.59891599 0.091194 0.170833 1.474887
## 13 13 0.69918699 0.068021 0.498649 1.334884
## 14 14 0.79945799 0.045392 0.498649 1.230000
## 15 15 0.89972900 0.029309 0.000000 1.092922
## 16 16 1.00000000 0.007026 0.166216 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 0.750000 0.706053 0.750000 0.706053
## 2 0.750000 0.594429 0.750000 0.650241
## 3 0.750000 0.513938 0.750000 0.604807
## 4 1.000000 0.492437 0.800000 0.582333
## 5 0.750000 0.464258 0.789474 0.557475
## 6 0.722222 0.410524 0.756757 0.485985
## 7 0.210526 0.325379 0.571429 0.431494
## 8 0.333333 0.269332 0.513514 0.392049
## 9 0.216216 0.220489 0.414414 0.334862
## 10 0.081081 0.167218 0.331081 0.292951
## 11 0.081081 0.125915 0.281081 0.259544
## 12 0.027778 0.102763 0.239819 0.234005
## 13 0.081081 0.077789 0.217054 0.211602
## 14 0.081081 0.058044 0.200000 0.192342
## 15 0.000000 0.036934 0.177711 0.175023
## 16 0.027027 0.020981 0.162602 0.159577
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.050000 0.050000 361.250000 361.250000
## 2 0.050000 0.100000 361.250000 361.250000
## 3 0.050000 0.150000 361.250000 361.250000
## 4 0.050000 0.200000 515.000000 392.000000
## 5 0.050000 0.250000 361.250000 385.526316
## 6 0.216667 0.466667 344.166667 365.405405
## 7 0.066667 0.533333 29.473684 251.428571
## 8 0.100000 0.633333 105.000000 215.810811
## 9 0.133333 0.766667 32.972973 154.864865
## 10 0.050000 0.816667 -50.135135 103.614865
## 11 0.050000 0.866667 -50.135135 72.864865
## 12 0.016667 0.883333 -82.916667 47.488688
## 13 0.050000 0.933333 -50.135135 33.488372
## 14 0.050000 0.983333 -50.135135 23.000000
## 15 0.000000 0.983333 -100.000000 9.292169
## 16 0.016667 1.000000 -83.378378 0.000000
## kolmogorov_smirnov
## 1 0.046764
## 2 0.093528
## 3 0.140291
## 4 0.190291
## 5 0.237055
## 6 0.437540
## 7 0.455663
## 8 0.516828
## 9 0.556311
## 10 0.496278
## 11 0.436246
## 12 0.339644
## 13 0.279612
## 14 0.219579
## 15 0.099838
## 16 0.000000
h2o.auc(performance_h2o)
## [1] 0.8420173
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.242946501878446:
## No Yes Error Rate
## No 272 37 0.119741 =37/309
## Yes 19 41 0.316667 =19/60
## Totals 291 78 0.151762 =56/369
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.742148 0.032787 0.020747 0.078125 0.840108 1.000000 0.016667 1.000000
## 2 0.715825 0.064516 0.041322 0.147059 0.842818 1.000000 0.033333 1.000000
## 3 0.684642 0.095238 0.061728 0.208333 0.845528 1.000000 0.050000 1.000000
## 4 0.681600 0.093750 0.061475 0.197368 0.842818 0.750000 0.050000 0.996764
## 5 0.624659 0.092308 0.061224 0.187500 0.840108 0.600000 0.050000 0.993528
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.118299 0.016667 0.508333 309 59 0 1
## 2 0.167527 0.033333 0.516667 309 58 0 2
## 3 0.205458 0.050000 0.525000 309 57 0 3
## 4 0.166643 0.050000 0.523382 308 57 1 3
## 5 0.138926 0.050000 0.521764 307 57 2 3
## tnr fnr fpr tpr idx
## 1 1.000000 0.983333 0.000000 0.016667 0
## 2 1.000000 0.966667 0.000000 0.033333 1
## 3 1.000000 0.950000 0.000000 0.050000 2
## 4 0.996764 0.950000 0.003236 0.050000 3
## 5 0.993528 0.950000 0.006472 0.050000 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 364 0.015081 0.283019 0.496689 0.197889 0.176152 0.164835 1.000000
## 365 0.013518 0.282353 0.495868 0.197368 0.173442 0.164384 1.000000
## 366 0.011618 0.281690 0.495050 0.196850 0.170732 0.163934 1.000000
## 367 0.009694 0.281030 0.494234 0.196335 0.168022 0.163488 1.000000
## 368 0.008651 0.280374 0.493421 0.195822 0.165312 0.163043 1.000000
## 369 0.007026 0.279720 0.492611 0.195313 0.162602 0.162602 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 364 0.016181 0.051645 0.016181 0.508091 5
## 365 0.012945 0.046130 0.012945 0.506472 4
## 366 0.009709 0.039895 0.009709 0.504854 3
## 367 0.006472 0.032530 0.006472 0.503236 2
## 368 0.003236 0.022971 0.003236 0.501618 1
## 369 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 364 0 304 60 0.016181 0.000000 0.983819 1.000000 363
## 365 0 305 60 0.012945 0.000000 0.987055 1.000000 364
## 366 0 306 60 0.009709 0.000000 0.990291 1.000000 365
## 367 0 307 60 0.006472 0.000000 0.993528 1.000000 366
## 368 0 308 60 0.003236 0.000000 0.996764 1.000000 367
## 369 0 309 60 0.000000 0.000000 1.000000 1.000000 368