Goal is to automate building and tuning a classification model to predict employee attrition, using the h2o::h2o.automl.

Set up

Import data

Import the cleaned data from Module 7.

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day()   masks h2o::day()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ lubridate::hour()  masks h2o::hour()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week()  masks h2o::week()
## ✖ lubridate::year()  masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6      ✔ rsample      1.2.1 
## ✔ dials        1.3.0      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

library(tidyquant)

## Warning: package 'tidyquant' was built under R version 4.4.1

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching core tidyquant packages ──────────────────────── tidyquant 1.0.9 ──
## ✔ PerformanceAnalytics 2.0.4      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.26     ✔ xts                  0.13.2── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ scales::col_factor()           masks readr::col_factor()
## ✖ lubridate::day()               masks h2o::day()
## ✖ scales::discard()              masks purrr::discard()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ recipes::fixed()               masks stringr::fixed()
## ✖ lubridate::hour()              masks h2o::hour()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ TTR::momentum()                masks dials::momentum()
## ✖ lubridate::month()             masks h2o::month()
## ✖ yardstick::spec()              masks readr::spec()
## ✖ quantmod::summary()            masks h2o::summary(), base::summary()
## ✖ lubridate::week()              masks h2o::week()
## ✖ lubridate::year()              masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Exploring Data

data <- departures %>%
    
    # Clean ceo_dismissal
    filter(!is.na(ceo_dismissal)) %>%
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not dismissed")) %>%

    # Drop variables with too many missing values
    select(-interim_coceo, - still_there, - eight_ks) %>%
    
    # Treat dismissal_dataset_id
    mutate(dismissal_dataset_id = as.character(dismissal_dataset_id)) %>%
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
    
    # Delete year of 2997 in fyear_geon
    filter(fyear_gone < 2023) %>%
    
    # Drop redundant variables
    select(-departure_code, -fyear, -gvkey, - co_per_rol, - leftofc, - cik, - sources, - `_merge`) %>%
    
    # Drop high cardinality predictors
    select(-exec_fullname) %>%
    
    #Convert to factor the variables with a few unique values
    #mutate(across(tenure_no_ceodb:fyear_gone, factor)) %>%
    
    # Convert to factor all character variables, except the string variable - notes
    mutate(across(where(is.character), factor)) %>%
    
    # Keep notes as character
    mutate(notes = as.character(notes)) %>%
    filter(!is.na(notes))

skimr::skim(data)

Data summary
Name	data
Number of rows	7458
Number of columns	7
_______________________
Column type frequency:
character	1
factor	3
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
notes	0	1	5	3117	0	7448	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
dismissal_dataset_id	1	FALSE	7458	1: 1, 10: 1, 100: 1, 100: 1
coname	1	FALSE	3427	BAR: 8, CLA: 8, FED: 8, NTN: 8
ceo_dismissal	1	FALSE	2	not: 5976, dis: 1482

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
tenure_no_ceodb	1	1.03	0.16	1	1	1	1	3	▇▁▁▁▁
max_tenure_ceodb	1	1.05	0.23	1	1	1	1	4	▇▁▁▁▁
fyear_gone	1	2006.40	7.50	1980	2000	2006	2013	2021	▁▂▇▇▆

factors_vec <- data %>% select(dismissal_dataset_id, ceo_dismissal, tenure_no_ceodb, max_tenure_ceodb, fyear_gone) %>% names()

data_clean <- data %>%

mutate(across(all_of(factors_vec), as.factor)) %>%
    
    # h2o requires all variables to be either numeric or factors
    mutate(across(where(is.character), factor))

Split data

set.seed(1234)

data_clean <- data_clean %>% sample_n(500)

data_split <- initial_split(data_clean, strata = "ceo_dismissal")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)

Recipes

recipe_obj <- recipe(ceo_dismissal ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors())

Model

# Intitialize h2o

h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         4 minutes 8 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_julius.mondschein_fgr987 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.63 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.0 (2024-04-24)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

split_h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios =c(0.85), seed = 2345)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

train_h2o <- split_h2o[[1]]
valid_h2o <- split_h2o[[2]]
test_h2o  <- as.h2o(test_tbl)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

y <- "ceo_dismissal"
x <- setdiff(names(train_tbl), y)

models_h2o <- h2o.automl(
    x = x,
    y = y, 
    training_frame    = train_h2o,
    validation_frame  = valid_h2o,
    leaderboard_frame = test_h2o,
    #max_runtime_secs  = 30,
    max_models = 10,
    exclude_algos = "DeepLearning",
    nfolds            = 5,
    seed              = 3456
)

##   |                                                                              |                                                                      |   0%
## 12:03:41.950: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.  |                                                                              |====                                                                  |   5%  |                                                                              |=========                                                             |  13%  |                                                                              |=============                                                         |  18%  |                                                                              |==============                                                        |  21%  |                                                                              |====================                                                  |  28%  |                                                                              |======================================================================| 100%

Examine the output of h2o.automl

models_h2o %>% typeof()

## [1] "S4"

models_h2o %>% slotNames()

## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"

models_h2o@leaderboard

##                             model_id       auc   logloss     aucpr
## 1     GBM_1_AutoML_3_20241205_120341 0.6394558 0.6214412 0.8962589
## 2     GLM_1_AutoML_3_20241205_120341 0.6022676 0.4502793 0.8712547
## 3     GBM_4_AutoML_3_20241205_120341 0.5684807 0.6391895 0.8750144
## 4     GBM_2_AutoML_3_20241205_120341 0.5185941 0.6445316 0.8503257
## 5     DRF_1_AutoML_3_20241205_120341 0.5151927 5.2669597 0.8375897
## 6 XGBoost_3_AutoML_3_20241205_120341 0.5138322 0.4491892 0.8456128
##   mean_per_class_error      rmse       mse
## 1            0.5000000 0.3976886 0.1581562
## 2            0.5000000 0.3725710 0.1388092
## 3            0.4761905 0.3983940 0.1587178
## 4            0.5000000 0.3991362 0.1593097
## 5            0.4761905 0.4039297 0.1631592
## 6            0.5000000 0.3721429 0.1384904
## 
## [12 rows x 7 columns]

best_model <- models_h2o@leader
best_model

## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_1_AutoML_3_20241205_120341 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              20                       20                2547         1
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         1    1.00000          2          2     2.00000
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.0124815
## RMSE:  0.1117206
## LogLoss:  0.09319324
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## R^2:  0.9118428
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##               dismissed not dismissed    Error    Rate
## dismissed            56             0 0.000000   =0/56
## not dismissed         0           272 0.000000  =0/272
## Totals               56           272 0.000000  =0/328
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.933688   1.000000  32
## 2                       max f2  0.933688   1.000000  32
## 3                 max f0point5  0.933688   1.000000  32
## 4                 max accuracy  0.933688   1.000000  32
## 5                max precision  0.951637   1.000000   0
## 6                   max recall  0.933688   1.000000  32
## 7              max specificity  0.951637   1.000000   0
## 8             max absolute_mcc  0.933688   1.000000  32
## 9   max min_per_class_accuracy  0.933688   1.000000  32
## 10 max mean_per_class_accuracy  0.933688   1.000000  32
## 11                     max tns  0.951637  56.000000   0
## 12                     max fns  0.951637 271.000000   0
## 13                     max fps  0.236939  56.000000  34
## 14                     max tps  0.933688 272.000000  32
## 15                     max tnr  0.951637   1.000000   0
## 16                     max fnr  0.951637   0.996324   0
## 17                     max fpr  0.236939   1.000000  34
## 18                     max tpr  0.933688   1.000000  32
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  0.1035829
## RMSE:  0.321843
## LogLoss:  0.4175753
## Mean Per-Class Error:  0.5
## AUC:  0.6365854
## AUCPR:  0.9329715
## Gini:  0.2731707
## R^2:  -0.06917754
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##               dismissed not dismissed    Error   Rate
## dismissed             0             5 1.000000   =5/5
## not dismissed         0            41 0.000000  =0/41
## Totals                0            46 0.108696  =5/46
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1  0.961898  0.942529   3
## 2                       max f2  0.961898  0.976190   3
## 3                 max f0point5  0.961898  0.911111   3
## 4                 max accuracy  0.961898  0.891304   3
## 5                max precision  0.976523  0.954545   0
## 6                   max recall  0.961898  1.000000   3
## 7              max specificity  0.976523  0.800000   0
## 8             max absolute_mcc  0.976523  0.194530   0
## 9   max min_per_class_accuracy  0.976523  0.512195   0
## 10 max mean_per_class_accuracy  0.976523  0.656098   0
## 11                     max tns  0.976523  4.000000   0
## 12                     max fns  0.976523 20.000000   0
## 13                     max fps  0.972609  5.000000   1
## 14                     max tps  0.961898 41.000000   3
## 15                     max tnr  0.976523  0.800000   0
## 16                     max fnr  0.976523  0.487805   0
## 17                     max fpr  0.972609  1.000000   1
## 18                     max tpr  0.961898  1.000000   3
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.162921
## RMSE:  0.4036348
## LogLoss:  0.6534263
## Mean Per-Class Error:  0.5
## AUC:  0.4152114
## AUCPR:  0.7953657
## Gini:  -0.1695772
## R^2:  -0.1507154
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##               dismissed not dismissed    Error     Rate
## dismissed             0            56 1.000000   =56/56
## not dismissed         0           272 0.000000   =0/272
## Totals                0           328 0.170732  =56/328
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.949902   0.906667  19
## 2                       max f2  0.949902   0.960452  19
## 3                 max f0point5  0.949902   0.858586  19
## 4                 max accuracy  0.949902   0.829268  19
## 5                max precision  0.949902   0.829268  19
## 6                   max recall  0.949902   1.000000  19
## 7              max specificity  0.977871   0.875000   0
## 8             max absolute_mcc  0.973205   0.102622   6
## 9   max min_per_class_accuracy  0.976296   0.410714   4
## 10 max mean_per_class_accuracy  0.949902   0.500000  19
## 11                     max tns  0.977871  49.000000   0
## 12                     max fns  0.977871 251.000000   0
## 13                     max fps  0.964830  56.000000  15
## 14                     max tps  0.949902 272.000000  19
## 15                     max tnr  0.977871   0.875000   0
## 16                     max fnr  0.977871   0.922794   0
## 17                     max fpr  0.964830   1.000000  15
## 18                     max tpr  0.949902   1.000000  19
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                              mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                 0.829277 0.006334   0.818182   0.833333   0.833333
## auc                      0.431002 0.053383   0.398920   0.473554   0.424793
## err                      0.170723 0.006334   0.181818   0.166667   0.166667
## err_count               11.200000 0.447214  12.000000  11.000000  11.000000
## f0point5                 0.858588 0.005440   0.849057   0.862069   0.862069
## f1                       0.906662 0.003801   0.900000   0.909091   0.909091
## f2                       0.960446 0.001711   0.957447   0.961538   0.961538
## lift_top_group           0.954939 0.032875   0.916667   0.991304   0.960000
## logloss                  0.817678 0.032959   0.875221   0.795375   0.798577
## max_per_class_error      1.000000 0.000000   1.000000   1.000000   1.000000
## mcc                            NA 0.000000         NA         NA         NA
## mean_per_class_accuracy  0.500000 0.000000   0.500000   0.500000   0.500000
## mean_per_class_error     0.500000 0.000000   0.500000   0.500000   0.500000
## mse                      0.167823 0.006270   0.178810   0.163788   0.163846
## pr_auc                   0.803669 0.022710   0.776126   0.825995   0.807725
## precision                0.829277 0.006334   0.818182   0.833333   0.833333
## r2                      -0.185436 0.009447  -0.201998  -0.179271  -0.179689
## recall                   1.000000 0.000000   1.000000   1.000000   1.000000
## rmse                     0.409606 0.007569   0.422859   0.404707   0.404779
## specificity              0.000000 0.000000   0.000000   0.000000   0.000000
##                         cv_4_valid cv_5_valid
## accuracy                  0.830769   0.830769
## auc                       0.363636   0.494108
## err                       0.169231   0.169231
## err_count                11.000000  11.000000
## f0point5                  0.859873   0.859873
## f1                        0.907563   0.907563
## f2                        0.960854   0.960854
## lift_top_group            0.925926   0.980796
## logloss                   0.814166   0.805050
## max_per_class_error       1.000000   1.000000
## mcc                             NA         NA
## mean_per_class_accuracy   0.500000   0.500000
## mean_per_class_error      0.500000   0.500000
## mse                       0.166428   0.166243
## pr_auc                    0.784391   0.824108
## precision                 0.830769   0.830769
## r2                       -0.183767  -0.182455
## recall                    1.000000   1.000000
## rmse                      0.407956   0.407730
## specificity               0.000000   0.000000

Save and Load

?h2o.getModel
?h2o.saveModel
?h2o.loadModel

h2o.getModel("GBM_1_AutoML_9_20241124_183641") %>%
    h2o.saveModel("~/Desktop/PSU_DAT3100_IntermediateDataAnalytics/PSU_DAT3100/12_module14/h2o_models/")

best_model <- h2o.loadModel("~/Desktop/PSU_DAT3100_IntermediateDataAnalytics/PSU_DAT3100/12_module14/h2o_models/GBM_1_AutoML_9_20241124_183641")

Make predictions

predictions <- h2o.predict(best_model, newdata = test_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'dismissal_dataset_id' has levels not trained on: ["1033",
## "1060", "1076", "109", "1209", "1225", "1296", "1351", "136", "160", ...106 not
## listed..., "8595", "8711", "8865", "8876", "89", "8951", "910", "947", "975",
## "977"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["AARON'S INC", "ABERCROMBIE
## & FITCH -CL A", "ADTALEM GLOBAL EDUCATION INC", "ADVANCE AUTO PARTS INC",
## "AETNA INC", "AIRTRAN HOLDINGS INC", "ALBERTO-CULVER CO", "ALCOA INC",
## "ALEXANDER & ALEXANDER", "ALIANT COMMUNICATIONS INC", ...94 not listed...,
## "UNITED MERIDIAN CORP", "UNITED ONLINE INC", "US ECOLOGY INC", "VOLT INFO
## SCIENCES INC", "WASTE MANAGEMENT INC-OLD", "WEBB (DEL E) CORP", "WEBMD HEALTH
## CORP", "WEYERHAEUSER CO", "WHIRLPOOL CORP", "WITCO CORP"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'tenure_no_ceodb' has levels not trained on: ["3"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'max_tenure_ceodb' has levels not trained on: ["3"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'fyear_gone' has levels not trained on: ["1988"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation dataset column 'notes' has levels not trained on: [""FRANCIS LOBO: Finally, as I am sure you have read in the 8-K that was filed with the SEC earlier today, I have resigned my position as President and Chief Executive Officer and Director of United Online, effective November 18, 2015, to pursue another business opportunity. . . . MIKE CRAWFORD: Okay. Francis, best of luck at WeWork."", "After Daly expressed his intention to step down in 2012 after succesfully taking the company public among other success. A succession team and plan was set in place to prepare for Daly to step down as Chief Executive Officer in 2013.", "After coming to a mutual agreement with the Board of Directors and leaders of the company David Schlanger, resigned from the role of CEO of WebMD. Just months later he was appointed to CEO of Progny, a fertility services comapny.", "After having “successfully accomplished his mission,” Peter Bain — head of the Old Mutual Asset Management (OMAM) (now BRIGHTSPHERE INVEST GRP INC) — resigned with immediate effect from his position as president, CEO, and director Friday. James J. Ritchie, chairman of OMAM’s board of directors, will serve as executive chairman and interim CEO until a replacement is found. "We restructured the business to focus on Affiliates with high growth potential, became a public company, completed a successful new Affiliate acquisition and executed the sell-down process for Old Mutual plc,” Bain said in a statement. “I am happy to hand over the business in good shape and want to thank my team for their dedication and support in our drive to create a great company.”", "After serving as the Chairman and Chief Executive Officer from 1983 until May 2002, Richard Herzer retired as CEO. The Company went public in 1991, and since then the chain has grown from less than 500 to over 1,100 restaurants and annual system-wide sales have more than tripled from $413 million to over $1.4 billion in 2002. After retred as CEO, he remained as Chairman of the Board until 2003 before retiring from that position as well. He was 71.", "All findings show that this transition was planned. Mr. Fusco would go on to become the C.E.O. of Cheniere Energy, which he still holds. However, the original plan was for Mr. Fusco to give up his title as C.E.O. of Calpine Corp. in order to become its Executive Chairman. Therefore, the departure code of "6" is recoded to a "5".", "Aon, formerly Rollins Hudig Hall, acquired Alexander & Alexander in December in a $1.23 billion deal that created the world's fourth-largest business insurance brokerage.", "At the age of 81, Mr. Shaw passed away in his home that morning.", "At the start of 1998, with FORE just moved into its new headquarters building in Warrendale, the company installed a new president and CEO: Thomas Gill, formerly the company's chief financial officer and chief operating officer. Exiting CEO Eric Cooper retained his role of chairman of the board. Of the company's other founders, only Francois Bitz had left the company, with former company president Onat Menzilcioglu still on the company's Board, and Robert Sansom continuing in his position of senior vice-president and chief technology officer.", "Based on the article, Robert N. Fisch is Independent Director of Ollie's Bargain Outlet Holdings, Inc. He currently is President of RNF Group, a consulting company focused on the assessment and evaluation of retail and other business enterprises, as well as providing mentoring services to existing management of these companies, a position he has held since January 2017. He served as the President, Chief Executive Officer and Chairman of the Board of rue21, inc., a large specialty apparel retailer, from June 2001 until October 2016. The information imples that the departure was due to the interest in other opportunities. Notwithstanding the information, no explanation  was given for Mr. Fisch's departure. The departure code is recoded to a "5" from a "6"/.", ...106 not listed..., "Vanderwoude served as the President, Chief Executive Officer and a director of Powerhouse Technologies, Inc. from 1994 to 1995. From 1996 until April 2007, he served as Chairman and Chief Executive Officer of Madison River Telephone Company LLC. As CEO, he saw the start of a decline in revenues
## 
## Changed to 3 - although the executive went on to find other work in the industry as a CEO, his short tenrue at powerhouse and the later proxy disclosures calling it a 'resignation' suggests that it might be less than willingin.
## ", "WILLIAM L. WEISS, 64, Chairman of the Board and Chief Executive Officer of Ameritech from Ameritech's incorporation in 1983 to January 1994 and Chairman of the Board since January 1994. . . . Mr. Weiss will retire as an officer of the Company on May 31, 1994.", "When Galey & Lord emerges from bankruptcy some time next week, the fabric maker will have a new president and CEO, one promoted from inside the company. He’s John J. Heldrich, currently Galey & Lord’s executive vice president and president of Swift Denim Group, a company division. Heldrich, 51, will replace Arthur C. Wiener, who announced in December that he would retire when the company emerges from bankruptcy. March 1, 2004, Galey & Lord Inc. has emerged from Chapter 11 protection and has elected John J. Heldrich president, CEO and a member of the board. Arthur Wiener announced his retirement at this time as part of the company's reorganization plan", "When Goode retired from the Norfolk-based railroad, the fourth-largest in the country, he ended what many considered to be a successful career. A tax attorney by training, he left in early 2006 after reaching Norfolk Southern's mandatory retirement age.", "William Howell retired as chairman of J.C. Penney in 1997, but continued to serve as a director of ExxonMobil Corporation until 2008.", "With the takeover of Airtran Holdings, Inc. by Southwest Airlines, Mr. Fornaro found himself out of a job. He made it up quickly by signing a two-year consulting contract with Southwest Airlines, along with a generous severance package afforded him by said company.", "Women's clothing retailer Christopher & Banks Corporation said Tuesday that CEO Lorna Nagler has resigned all positions within the company, effective immediately. Minneapolis-based Christopher & Banks didn't reveal further details about Nagler's departure, but the company has struggled in recent years amid tightened consumer spending prompted by the recession. It reported a $2.5 million loss on revenue of $101.3 million during the most recent quarter, which ended in August. During the most recent fiscal year, which ended in February, Christopher & Banks posted a $158,000 loss on $455.4 million. Since 2006, when shares of the company's stock were close to $30, they have slowly declined in value, closing Thursday at $6.77.", "Wright retired from his role as Chief Executive Officer in 2001 after being CEO since 1981. He remained chairman of the board.", "he retire after his successor come into place and will remain chairman of board for a while", "mutual agreement to terminate Baumgardner due to $5 million less in sales from same quarter year before
##   
##   Changed code to 3. CEO's employment terminated with immediate effect and there were no plans to fill the board seat that he had vacated. It was right before a investor conference call where the company announced that quarterly earningswer $5 million less than the same quarter last year. Crystal Equity Research’s Managing Director Debra Fiakas said, “If I were to make a guess, there have been concerns in the past within the board and management team as to the best approach for acquisitions. And some wanted to be more aggressive than others.”"]

predictions_tbl <- predictions %>%
    as_tibble()

predictions_tbl %>%
    bind_cols(test_tbl)

## # A tibble: 126 × 10
##    predict     dismissed not.dismissed dismissal_dataset_id coname ceo_dismissal
##    <fct>           <dbl>         <dbl> <fct>                <fct>  <fct>        
##  1 not dismis…    0.0235         0.977 5156                 BRIGH… not dismissed
##  2 not dismis…    0.0274         0.973 977                  CRACK… dismissed    
##  3 not dismis…    0.0274         0.973 1351                 FIRST… not dismissed
##  4 not dismis…    0.0274         0.973 5514                 DINE … not dismissed
##  5 not dismis…    0.0235         0.977 3843                 WITCO… not dismissed
##  6 not dismis…    0.0274         0.973 1209                 EL PA… dismissed    
##  7 not dismis…    0.0274         0.973 5262                 HCA H… not dismissed
##  8 not dismis…    0.0274         0.973 8711                 SOLER… not dismissed
##  9 not dismis…    0.0235         0.977 2258                 MEDUS… not dismissed
## 10 not dismis…    0.0274         0.973 1958                 GREAT… dismissed    
## # ℹ 116 more rows
## # ℹ 4 more variables: tenure_no_ceodb <fct>, max_tenure_ceodb <fct>,
## #   fyear_gone <fct>, notes <fct>

Evaluate model

?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)

## [1] "S4"

slotNames(performance_h2o)

## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"

performance_h2o@metrics

## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GBM_1_AutoML_3_20241205_120341"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GBM_1_AutoML_3_20241205_120341"
## 
## 
## $model_checksum
## [1] "-317181396689018176"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_a054_3"
## 
## 
## $frame_checksum
## [1] "-848237940154077900"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.733418e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.1581562
## 
## $RMSE
## [1] 0.3976886
## 
## $nobs
## [1] 126
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] -0.1387246
## 
## $logloss
## [1] 0.6214412
## 
## $AUC
## [1] 0.6394558
## 
## $pr_auc
## [1] 0.8962589
## 
## $Gini
## [1] 0.2789116
## 
## $mean_per_class_error
## [1] 0.5
## 
## $domain
## [1] "dismissed"     "not dismissed"
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               dismissed not dismissed  Error       Rate
## dismissed             0            21 1.0000 =  21 / 21
## not dismissed         0           105 0.0000 =  0 / 105
## Totals                0           126 0.1667 = 21 / 126
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.976523 0.569536 0.461373 0.743945 0.484127  0.934783 0.409524    0.857143
## 2  0.972609 0.903509 0.948435 0.862647 0.825397  0.837398 0.980952    0.047619
## 3  0.961898 0.908297 0.955882 0.865225 0.833333  0.838710 0.990476    0.047619
## 4  0.955657 0.909091 0.961538 0.862069 0.833333  0.833333 1.000000    0.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.206419               0.409524                0.633333  18  62   3  43
## 2     0.069843               0.047619                0.514286   1   2  20 103
## 3     0.113592               0.047619                0.519048   1   1  20 104
## 4     0.000000               0.000000                0.500000   0   0  21 105
##        tnr      fnr      fpr      tpr idx
## 1 0.857143 0.590476 0.142857 0.409524   0
## 2 0.047619 0.019048 0.952381 0.980952   1
## 3 0.047619 0.009524 0.952381 0.990476   2
## 4 0.000000 0.000000 1.000000 1.000000   3
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.955657   0.909091   3
## 2                       max f2  0.955657   0.961538   3
## 3                 max f0point5  0.961898   0.865225   2
## 4                 max accuracy  0.961898   0.833333   2
## 5                max precision  0.976523   0.934783   0
## 6                   max recall  0.955657   1.000000   3
## 7              max specificity  0.976523   0.857143   0
## 8             max absolute_mcc  0.976523   0.206419   0
## 9   max min_per_class_accuracy  0.976523   0.409524   0
## 10 max mean_per_class_accuracy  0.976523   0.633333   0
## 11                     max tns  0.976523  18.000000   0
## 12                     max fns  0.976523  62.000000   0
## 13                     max fps  0.955657  21.000000   3
## 14                     max tps  0.955657 105.000000   3
## 15                     max tnr  0.976523   0.857143   0
## 16                     max fnr  0.976523   0.590476   0
## 17                     max fpr  0.955657   1.000000   3
## 18                     max tpr  0.955657   1.000000   3
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 83.33 %, avg score: 97.37 %
##   group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1     1               0.36507937        0.976523 1.121739        1.121739
## 2     2               0.97619048        0.972609 0.935065        1.004878
## 3     3               1.00000000        0.955657 0.800000        1.000000
##   response_rate    score cumulative_response_rate cumulative_score capture_rate
## 1      0.934783 0.976523                 0.934783         0.976523     0.409524
## 2      0.779221 0.972609                 0.837398         0.974073     0.571429
## 3      0.666667 0.957737                 0.833333         0.973684     0.019048
##   cumulative_capture_rate       gain cumulative_gain kolmogorov_smirnov
## 1                0.409524  12.173913       12.173913           0.266667
## 2                0.980952  -6.493506        0.487805           0.028571
## 3                1.000000 -20.000000        0.000000           0.000000

h2o.auc(best_model)

## [1] 1

h2o.confusionMatrix(performance_h2o)

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.95565695421697:
##               dismissed not dismissed    Error     Rate
## dismissed             0            21 1.000000   =21/21
## not dismissed         0           105 0.000000   =0/105
## Totals                0           126 0.166667  =21/126

h2o.metric(performance_h2o)

## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.976523 0.569536 0.461373 0.743945 0.484127  0.934783 0.409524    0.857143
## 2  0.972609 0.903509 0.948435 0.862647 0.825397  0.837398 0.980952    0.047619
## 3  0.961898 0.908297 0.955882 0.865225 0.833333  0.838710 0.990476    0.047619
## 4  0.955657 0.909091 0.961538 0.862069 0.833333  0.833333 1.000000    0.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.206419               0.409524                0.633333  18  62   3  43
## 2     0.069843               0.047619                0.514286   1   2  20 103
## 3     0.113592               0.047619                0.519048   1   1  20 104
## 4     0.000000               0.000000                0.500000   0   0  21 105
##        tnr      fnr      fpr      tpr idx
## 1 0.857143 0.590476 0.142857 0.409524   0
## 2 0.047619 0.019048 0.952381 0.980952   1
## 3 0.047619 0.009524 0.952381 0.990476   2
## 4 0.000000 0.000000 1.000000 1.000000   3

Apply11

Julius Mondschein

2024-11-25