Goal is to automate building and tuning a classification model to predict employee attrition, using the h2o::h2o.automl.
Import the cleaned data from Module 7.
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day() masks h2o::day()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.4.1
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ──────────────────────── tidyquant 1.0.9 ──
## ✔ PerformanceAnalytics 2.0.4 ✔ TTR 0.24.4
## ✔ quantmod 0.4.26 ✔ xts 0.13.2── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ scales::col_factor() masks readr::col_factor()
## ✖ lubridate::day() masks h2o::day()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ TTR::momentum() masks dials::momentum()
## ✖ lubridate::month() masks h2o::month()
## ✖ yardstick::spec() masks readr::spec()
## ✖ quantmod::summary() masks h2o::summary(), base::summary()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- departures %>%
# Clean ceo_dismissal
filter(!is.na(ceo_dismissal)) %>%
mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not dismissed")) %>%
# Drop variables with too many missing values
select(-interim_coceo, - still_there, - eight_ks) %>%
# Treat dismissal_dataset_id
mutate(dismissal_dataset_id = as.character(dismissal_dataset_id)) %>%
distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
# Delete year of 2997 in fyear_geon
filter(fyear_gone < 2023) %>%
# Drop redundant variables
select(-departure_code, -fyear, -gvkey, - co_per_rol, - leftofc, - cik, - sources, - `_merge`) %>%
# Drop high cardinality predictors
select(-exec_fullname) %>%
#Convert to factor the variables with a few unique values
#mutate(across(tenure_no_ceodb:fyear_gone, factor)) %>%
# Convert to factor all character variables, except the string variable - notes
mutate(across(where(is.character), factor)) %>%
# Keep notes as character
mutate(notes = as.character(notes)) %>%
filter(!is.na(notes))
skimr::skim(data)
Name | data |
Number of rows | 7458 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 3 |
numeric | 3 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
notes | 0 | 1 | 5 | 3117 | 0 | 7448 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1 | FALSE | 7458 | 1: 1, 10: 1, 100: 1, 100: 1 |
coname | 0 | 1 | FALSE | 3427 | BAR: 8, CLA: 8, FED: 8, NTN: 8 |
ceo_dismissal | 0 | 1 | FALSE | 2 | not: 5976, dis: 1482 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
tenure_no_ceodb | 0 | 1 | 1.03 | 0.16 | 1 | 1 | 1 | 1 | 3 | ▇▁▁▁▁ |
max_tenure_ceodb | 0 | 1 | 1.05 | 0.23 | 1 | 1 | 1 | 1 | 4 | ▇▁▁▁▁ |
fyear_gone | 0 | 1 | 2006.40 | 7.50 | 1980 | 2000 | 2006 | 2013 | 2021 | ▁▂▇▇▆ |
factors_vec <- data %>% select(dismissal_dataset_id, ceo_dismissal, tenure_no_ceodb, max_tenure_ceodb, fyear_gone) %>% names()
data_clean <- data %>%
mutate(across(all_of(factors_vec), as.factor)) %>%
# h2o requires all variables to be either numeric or factors
mutate(across(where(is.character), factor))
set.seed(1234)
data_clean <- data_clean %>% sample_n(500)
data_split <- initial_split(data_clean, strata = "ceo_dismissal")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
recipe_obj <- recipe(ceo_dismissal ~ ., data = train_tbl) %>%
# Remove zero variance variables
step_zv(all_predictors())
# Intitialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 minutes 8 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 14 days
## H2O cluster name: H2O_started_from_R_julius.mondschein_fgr987
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.63 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.0 (2024-04-24)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split_h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios =c(0.85), seed = 2345)
## | | | 0% | |======================================================================| 100%
train_h2o <- split_h2o[[1]]
valid_h2o <- split_h2o[[2]]
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
y <- "ceo_dismissal"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
#max_runtime_secs = 30,
max_models = 10,
exclude_algos = "DeepLearning",
nfolds = 5,
seed = 3456
)
## | | | 0%
## 12:03:41.950: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models. | |==== | 5% | |========= | 13% | |============= | 18% | |============== | 21% | |==================== | 28% | |======================================================================| 100%
Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss aucpr
## 1 GBM_1_AutoML_3_20241205_120341 0.6394558 0.6214412 0.8962589
## 2 GLM_1_AutoML_3_20241205_120341 0.6022676 0.4502793 0.8712547
## 3 GBM_4_AutoML_3_20241205_120341 0.5684807 0.6391895 0.8750144
## 4 GBM_2_AutoML_3_20241205_120341 0.5185941 0.6445316 0.8503257
## 5 DRF_1_AutoML_3_20241205_120341 0.5151927 5.2669597 0.8375897
## 6 XGBoost_3_AutoML_3_20241205_120341 0.5138322 0.4491892 0.8456128
## mean_per_class_error rmse mse
## 1 0.5000000 0.3976886 0.1581562
## 2 0.5000000 0.3725710 0.1388092
## 3 0.4761905 0.3983940 0.1587178
## 4 0.5000000 0.3991362 0.1593097
## 5 0.4761905 0.4039297 0.1631592
## 6 0.5000000 0.3721429 0.1384904
##
## [12 rows x 7 columns]
best_model <- models_h2o@leader
best_model
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_1_AutoML_3_20241205_120341
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 20 20 2547 1
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 1 1.00000 2 2 2.00000
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.0124815
## RMSE: 0.1117206
## LogLoss: 0.09319324
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
## R^2: 0.9118428
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not dismissed Error Rate
## dismissed 56 0 0.000000 =0/56
## not dismissed 0 272 0.000000 =0/272
## Totals 56 272 0.000000 =0/328
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.933688 1.000000 32
## 2 max f2 0.933688 1.000000 32
## 3 max f0point5 0.933688 1.000000 32
## 4 max accuracy 0.933688 1.000000 32
## 5 max precision 0.951637 1.000000 0
## 6 max recall 0.933688 1.000000 32
## 7 max specificity 0.951637 1.000000 0
## 8 max absolute_mcc 0.933688 1.000000 32
## 9 max min_per_class_accuracy 0.933688 1.000000 32
## 10 max mean_per_class_accuracy 0.933688 1.000000 32
## 11 max tns 0.951637 56.000000 0
## 12 max fns 0.951637 271.000000 0
## 13 max fps 0.236939 56.000000 34
## 14 max tps 0.933688 272.000000 32
## 15 max tnr 0.951637 1.000000 0
## 16 max fnr 0.951637 0.996324 0
## 17 max fpr 0.236939 1.000000 34
## 18 max tpr 0.933688 1.000000 32
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.1035829
## RMSE: 0.321843
## LogLoss: 0.4175753
## Mean Per-Class Error: 0.5
## AUC: 0.6365854
## AUCPR: 0.9329715
## Gini: 0.2731707
## R^2: -0.06917754
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not dismissed Error Rate
## dismissed 0 5 1.000000 =5/5
## not dismissed 0 41 0.000000 =0/41
## Totals 0 46 0.108696 =5/46
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.961898 0.942529 3
## 2 max f2 0.961898 0.976190 3
## 3 max f0point5 0.961898 0.911111 3
## 4 max accuracy 0.961898 0.891304 3
## 5 max precision 0.976523 0.954545 0
## 6 max recall 0.961898 1.000000 3
## 7 max specificity 0.976523 0.800000 0
## 8 max absolute_mcc 0.976523 0.194530 0
## 9 max min_per_class_accuracy 0.976523 0.512195 0
## 10 max mean_per_class_accuracy 0.976523 0.656098 0
## 11 max tns 0.976523 4.000000 0
## 12 max fns 0.976523 20.000000 0
## 13 max fps 0.972609 5.000000 1
## 14 max tps 0.961898 41.000000 3
## 15 max tnr 0.976523 0.800000 0
## 16 max fnr 0.976523 0.487805 0
## 17 max fpr 0.972609 1.000000 1
## 18 max tpr 0.961898 1.000000 3
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.162921
## RMSE: 0.4036348
## LogLoss: 0.6534263
## Mean Per-Class Error: 0.5
## AUC: 0.4152114
## AUCPR: 0.7953657
## Gini: -0.1695772
## R^2: -0.1507154
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not dismissed Error Rate
## dismissed 0 56 1.000000 =56/56
## not dismissed 0 272 0.000000 =0/272
## Totals 0 328 0.170732 =56/328
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.949902 0.906667 19
## 2 max f2 0.949902 0.960452 19
## 3 max f0point5 0.949902 0.858586 19
## 4 max accuracy 0.949902 0.829268 19
## 5 max precision 0.949902 0.829268 19
## 6 max recall 0.949902 1.000000 19
## 7 max specificity 0.977871 0.875000 0
## 8 max absolute_mcc 0.973205 0.102622 6
## 9 max min_per_class_accuracy 0.976296 0.410714 4
## 10 max mean_per_class_accuracy 0.949902 0.500000 19
## 11 max tns 0.977871 49.000000 0
## 12 max fns 0.977871 251.000000 0
## 13 max fps 0.964830 56.000000 15
## 14 max tps 0.949902 272.000000 19
## 15 max tnr 0.977871 0.875000 0
## 16 max fnr 0.977871 0.922794 0
## 17 max fpr 0.964830 1.000000 15
## 18 max tpr 0.949902 1.000000 19
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.829277 0.006334 0.818182 0.833333 0.833333
## auc 0.431002 0.053383 0.398920 0.473554 0.424793
## err 0.170723 0.006334 0.181818 0.166667 0.166667
## err_count 11.200000 0.447214 12.000000 11.000000 11.000000
## f0point5 0.858588 0.005440 0.849057 0.862069 0.862069
## f1 0.906662 0.003801 0.900000 0.909091 0.909091
## f2 0.960446 0.001711 0.957447 0.961538 0.961538
## lift_top_group 0.954939 0.032875 0.916667 0.991304 0.960000
## logloss 0.817678 0.032959 0.875221 0.795375 0.798577
## max_per_class_error 1.000000 0.000000 1.000000 1.000000 1.000000
## mcc NA 0.000000 NA NA NA
## mean_per_class_accuracy 0.500000 0.000000 0.500000 0.500000 0.500000
## mean_per_class_error 0.500000 0.000000 0.500000 0.500000 0.500000
## mse 0.167823 0.006270 0.178810 0.163788 0.163846
## pr_auc 0.803669 0.022710 0.776126 0.825995 0.807725
## precision 0.829277 0.006334 0.818182 0.833333 0.833333
## r2 -0.185436 0.009447 -0.201998 -0.179271 -0.179689
## recall 1.000000 0.000000 1.000000 1.000000 1.000000
## rmse 0.409606 0.007569 0.422859 0.404707 0.404779
## specificity 0.000000 0.000000 0.000000 0.000000 0.000000
## cv_4_valid cv_5_valid
## accuracy 0.830769 0.830769
## auc 0.363636 0.494108
## err 0.169231 0.169231
## err_count 11.000000 11.000000
## f0point5 0.859873 0.859873
## f1 0.907563 0.907563
## f2 0.960854 0.960854
## lift_top_group 0.925926 0.980796
## logloss 0.814166 0.805050
## max_per_class_error 1.000000 1.000000
## mcc NA NA
## mean_per_class_accuracy 0.500000 0.500000
## mean_per_class_error 0.500000 0.500000
## mse 0.166428 0.166243
## pr_auc 0.784391 0.824108
## precision 0.830769 0.830769
## r2 -0.183767 -0.182455
## recall 1.000000 1.000000
## rmse 0.407956 0.407730
## specificity 0.000000 0.000000
?h2o.getModel
?h2o.saveModel
?h2o.loadModel
h2o.getModel("GBM_1_AutoML_9_20241124_183641") %>%
h2o.saveModel("~/Desktop/PSU_DAT3100_IntermediateDataAnalytics/PSU_DAT3100/12_module14/h2o_models/")
best_model <- h2o.loadModel("~/Desktop/PSU_DAT3100_IntermediateDataAnalytics/PSU_DAT3100/12_module14/h2o_models/GBM_1_AutoML_9_20241124_183641")
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'dismissal_dataset_id' has levels not trained on: ["1033",
## "1060", "1076", "109", "1209", "1225", "1296", "1351", "136", "160", ...106 not
## listed..., "8595", "8711", "8865", "8876", "89", "8951", "910", "947", "975",
## "977"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["AARON'S INC", "ABERCROMBIE
## & FITCH -CL A", "ADTALEM GLOBAL EDUCATION INC", "ADVANCE AUTO PARTS INC",
## "AETNA INC", "AIRTRAN HOLDINGS INC", "ALBERTO-CULVER CO", "ALCOA INC",
## "ALEXANDER & ALEXANDER", "ALIANT COMMUNICATIONS INC", ...94 not listed...,
## "UNITED MERIDIAN CORP", "UNITED ONLINE INC", "US ECOLOGY INC", "VOLT INFO
## SCIENCES INC", "WASTE MANAGEMENT INC-OLD", "WEBB (DEL E) CORP", "WEBMD HEALTH
## CORP", "WEYERHAEUSER CO", "WHIRLPOOL CORP", "WITCO CORP"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'tenure_no_ceodb' has levels not trained on: ["3"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'max_tenure_ceodb' has levels not trained on: ["3"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'fyear_gone' has levels not trained on: ["1988"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation dataset column 'notes' has levels not trained on: [""FRANCIS LOBO: Finally, as I am sure you have read in the 8-K that was filed with the SEC earlier today, I have resigned my position as President and Chief Executive Officer and Director of United Online, effective November 18, 2015, to pursue another business opportunity. . . . MIKE CRAWFORD: Okay. Francis, best of luck at WeWork."", "After Daly expressed his intention to step down in 2012 after succesfully taking the company public among other success. A succession team and plan was set in place to prepare for Daly to step down as Chief Executive Officer in 2013.", "After coming to a mutual agreement with the Board of Directors and leaders of the company David Schlanger, resigned from the role of CEO of WebMD. Just months later he was appointed to CEO of Progny, a fertility services comapny.", "After having “successfully accomplished his mission,” Peter Bain — head of the Old Mutual Asset Management (OMAM) (now BRIGHTSPHERE INVEST GRP INC) — resigned with immediate effect from his position as president, CEO, and director Friday. James J. Ritchie, chairman of OMAM’s board of directors, will serve as executive chairman and interim CEO until a replacement is found. "We restructured the business to focus on Affiliates with high growth potential, became a public company, completed a successful new Affiliate acquisition and executed the sell-down process for Old Mutual plc,” Bain said in a statement. “I am happy to hand over the business in good shape and want to thank my team for their dedication and support in our drive to create a great company.”", "After serving as the Chairman and Chief Executive Officer from 1983 until May 2002, Richard Herzer retired as CEO. The Company went public in 1991, and since then the chain has grown from less than 500 to over 1,100 restaurants and annual system-wide sales have more than tripled from $413 million to over $1.4 billion in 2002. After retred as CEO, he remained as Chairman of the Board until 2003 before retiring from that position as well. He was 71.", "All findings show that this transition was planned. Mr. Fusco would go on to become the C.E.O. of Cheniere Energy, which he still holds. However, the original plan was for Mr. Fusco to give up his title as C.E.O. of Calpine Corp. in order to become its Executive Chairman. Therefore, the departure code of "6" is recoded to a "5".", "Aon, formerly Rollins Hudig Hall, acquired Alexander & Alexander in December in a $1.23 billion deal that created the world's fourth-largest business insurance brokerage.", "At the age of 81, Mr. Shaw passed away in his home that morning.", "At the start of 1998, with FORE just moved into its new headquarters building in Warrendale, the company installed a new president and CEO: Thomas Gill, formerly the company's chief financial officer and chief operating officer. Exiting CEO Eric Cooper retained his role of chairman of the board. Of the company's other founders, only Francois Bitz had left the company, with former company president Onat Menzilcioglu still on the company's Board, and Robert Sansom continuing in his position of senior vice-president and chief technology officer.", "Based on the article, Robert N. Fisch is Independent Director of Ollie's Bargain Outlet Holdings, Inc. He currently is President of RNF Group, a consulting company focused on the assessment and evaluation of retail and other business enterprises, as well as providing mentoring services to existing management of these companies, a position he has held since January 2017. He served as the President, Chief Executive Officer and Chairman of the Board of rue21, inc., a large specialty apparel retailer, from June 2001 until October 2016. The information imples that the departure was due to the interest in other opportunities. Notwithstanding the information, no explanation was given for Mr. Fisch's departure. The departure code is recoded to a "5" from a "6"/.", ...106 not listed..., "Vanderwoude served as the President, Chief Executive Officer and a director of Powerhouse Technologies, Inc. from 1994 to 1995. From 1996 until April 2007, he served as Chairman and Chief Executive Officer of Madison River Telephone Company LLC. As CEO, he saw the start of a decline in revenues
##
## Changed to 3 - although the executive went on to find other work in the industry as a CEO, his short tenrue at powerhouse and the later proxy disclosures calling it a 'resignation' suggests that it might be less than willingin.
## ", "WILLIAM L. WEISS, 64, Chairman of the Board and Chief Executive Officer of Ameritech from Ameritech's incorporation in 1983 to January 1994 and Chairman of the Board since January 1994. . . . Mr. Weiss will retire as an officer of the Company on May 31, 1994.", "When Galey & Lord emerges from bankruptcy some time next week, the fabric maker will have a new president and CEO, one promoted from inside the company. He’s John J. Heldrich, currently Galey & Lord’s executive vice president and president of Swift Denim Group, a company division. Heldrich, 51, will replace Arthur C. Wiener, who announced in December that he would retire when the company emerges from bankruptcy. March 1, 2004, Galey & Lord Inc. has emerged from Chapter 11 protection and has elected John J. Heldrich president, CEO and a member of the board. Arthur Wiener announced his retirement at this time as part of the company's reorganization plan", "When Goode retired from the Norfolk-based railroad, the fourth-largest in the country, he ended what many considered to be a successful career. A tax attorney by training, he left in early 2006 after reaching Norfolk Southern's mandatory retirement age.", "William Howell retired as chairman of J.C. Penney in 1997, but continued to serve as a director of ExxonMobil Corporation until 2008.", "With the takeover of Airtran Holdings, Inc. by Southwest Airlines, Mr. Fornaro found himself out of a job. He made it up quickly by signing a two-year consulting contract with Southwest Airlines, along with a generous severance package afforded him by said company.", "Women's clothing retailer Christopher & Banks Corporation said Tuesday that CEO Lorna Nagler has resigned all positions within the company, effective immediately. Minneapolis-based Christopher & Banks didn't reveal further details about Nagler's departure, but the company has struggled in recent years amid tightened consumer spending prompted by the recession. It reported a $2.5 million loss on revenue of $101.3 million during the most recent quarter, which ended in August. During the most recent fiscal year, which ended in February, Christopher & Banks posted a $158,000 loss on $455.4 million. Since 2006, when shares of the company's stock were close to $30, they have slowly declined in value, closing Thursday at $6.77.", "Wright retired from his role as Chief Executive Officer in 2001 after being CEO since 1981. He remained chairman of the board.", "he retire after his successor come into place and will remain chairman of board for a while", "mutual agreement to terminate Baumgardner due to $5 million less in sales from same quarter year before
##
## Changed code to 3. CEO's employment terminated with immediate effect and there were no plans to fill the board seat that he had vacated. It was right before a investor conference call where the company announced that quarterly earningswer $5 million less than the same quarter last year. Crystal Equity Research’s Managing Director Debra Fiakas said, “If I were to make a guess, there have been concerns in the past within the board and management team as to the best approach for acquisitions. And some wanted to be more aggressive than others.”"]
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 126 × 10
## predict dismissed not.dismissed dismissal_dataset_id coname ceo_dismissal
## <fct> <dbl> <dbl> <fct> <fct> <fct>
## 1 not dismis… 0.0235 0.977 5156 BRIGH… not dismissed
## 2 not dismis… 0.0274 0.973 977 CRACK… dismissed
## 3 not dismis… 0.0274 0.973 1351 FIRST… not dismissed
## 4 not dismis… 0.0274 0.973 5514 DINE … not dismissed
## 5 not dismis… 0.0235 0.977 3843 WITCO… not dismissed
## 6 not dismis… 0.0274 0.973 1209 EL PA… dismissed
## 7 not dismis… 0.0274 0.973 5262 HCA H… not dismissed
## 8 not dismis… 0.0274 0.973 8711 SOLER… not dismissed
## 9 not dismis… 0.0235 0.977 2258 MEDUS… not dismissed
## 10 not dismis… 0.0274 0.973 1958 GREAT… dismissed
## # ℹ 116 more rows
## # ℹ 4 more variables: tenure_no_ceodb <fct>, max_tenure_ceodb <fct>,
## # fyear_gone <fct>, notes <fct>
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_1_AutoML_3_20241205_120341"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_1_AutoML_3_20241205_120341"
##
##
## $model_checksum
## [1] "-317181396689018176"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_a054_3"
##
##
## $frame_checksum
## [1] "-848237940154077900"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.733418e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.1581562
##
## $RMSE
## [1] 0.3976886
##
## $nobs
## [1] 126
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] -0.1387246
##
## $logloss
## [1] 0.6214412
##
## $AUC
## [1] 0.6394558
##
## $pr_auc
## [1] 0.8962589
##
## $Gini
## [1] 0.2789116
##
## $mean_per_class_error
## [1] 0.5
##
## $domain
## [1] "dismissed" "not dismissed"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## dismissed not dismissed Error Rate
## dismissed 0 21 1.0000 = 21 / 21
## not dismissed 0 105 0.0000 = 0 / 105
## Totals 0 126 0.1667 = 21 / 126
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.976523 0.569536 0.461373 0.743945 0.484127 0.934783 0.409524 0.857143
## 2 0.972609 0.903509 0.948435 0.862647 0.825397 0.837398 0.980952 0.047619
## 3 0.961898 0.908297 0.955882 0.865225 0.833333 0.838710 0.990476 0.047619
## 4 0.955657 0.909091 0.961538 0.862069 0.833333 0.833333 1.000000 0.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.206419 0.409524 0.633333 18 62 3 43
## 2 0.069843 0.047619 0.514286 1 2 20 103
## 3 0.113592 0.047619 0.519048 1 1 20 104
## 4 0.000000 0.000000 0.500000 0 0 21 105
## tnr fnr fpr tpr idx
## 1 0.857143 0.590476 0.142857 0.409524 0
## 2 0.047619 0.019048 0.952381 0.980952 1
## 3 0.047619 0.009524 0.952381 0.990476 2
## 4 0.000000 0.000000 1.000000 1.000000 3
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.955657 0.909091 3
## 2 max f2 0.955657 0.961538 3
## 3 max f0point5 0.961898 0.865225 2
## 4 max accuracy 0.961898 0.833333 2
## 5 max precision 0.976523 0.934783 0
## 6 max recall 0.955657 1.000000 3
## 7 max specificity 0.976523 0.857143 0
## 8 max absolute_mcc 0.976523 0.206419 0
## 9 max min_per_class_accuracy 0.976523 0.409524 0
## 10 max mean_per_class_accuracy 0.976523 0.633333 0
## 11 max tns 0.976523 18.000000 0
## 12 max fns 0.976523 62.000000 0
## 13 max fps 0.955657 21.000000 3
## 14 max tps 0.955657 105.000000 3
## 15 max tnr 0.976523 0.857143 0
## 16 max fnr 0.976523 0.590476 0
## 17 max fpr 0.955657 1.000000 3
## 18 max tpr 0.955657 1.000000 3
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 83.33 %, avg score: 97.37 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.36507937 0.976523 1.121739 1.121739
## 2 2 0.97619048 0.972609 0.935065 1.004878
## 3 3 1.00000000 0.955657 0.800000 1.000000
## response_rate score cumulative_response_rate cumulative_score capture_rate
## 1 0.934783 0.976523 0.934783 0.976523 0.409524
## 2 0.779221 0.972609 0.837398 0.974073 0.571429
## 3 0.666667 0.957737 0.833333 0.973684 0.019048
## cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov
## 1 0.409524 12.173913 12.173913 0.266667
## 2 0.980952 -6.493506 0.487805 0.028571
## 3 1.000000 -20.000000 0.000000 0.000000
h2o.auc(best_model)
## [1] 1
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.95565695421697:
## dismissed not dismissed Error Rate
## dismissed 0 21 1.000000 =21/21
## not dismissed 0 105 0.000000 =0/105
## Totals 0 126 0.166667 =21/126
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.976523 0.569536 0.461373 0.743945 0.484127 0.934783 0.409524 0.857143
## 2 0.972609 0.903509 0.948435 0.862647 0.825397 0.837398 0.980952 0.047619
## 3 0.961898 0.908297 0.955882 0.865225 0.833333 0.838710 0.990476 0.047619
## 4 0.955657 0.909091 0.961538 0.862069 0.833333 0.833333 1.000000 0.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.206419 0.409524 0.633333 18 62 3 43
## 2 0.069843 0.047619 0.514286 1 2 20 103
## 3 0.113592 0.047619 0.519048 1 1 20 104
## 4 0.000000 0.000000 0.500000 0 0 21 105
## tnr fnr fpr tpr idx
## 1 0.857143 0.590476 0.142857 0.409524 0
## 2 0.047619 0.019048 0.952381 0.980952 1
## 3 0.047619 0.009524 0.952381 0.990476 2
## 4 0.000000 0.000000 1.000000 1.000000 3