Data

departures_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv")
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(departures_raw)
## Rows: 9,423
## Columns: 19
## $ dismissal_dataset_id <dbl> 559043, 12, 13, 31, 43, 51, 61, 63, 62, 65, 75, 7…
## $ coname               <chr> "SONICBLUE INC", "AMERICAN AIRLINES GROUP INC", "…
## $ gvkey                <dbl> 27903, 1045, 1045, 1078, 1161, 1177, 1194, 1194, …
## $ fyear                <dbl> 2002, 1997, 2002, 1998, 2001, 1997, 1993, 1997, 1…
## $ co_per_rol           <dbl> -1, 1, 3, 6, 11, 16, 21, 22, 24, 28, 33, 34, 38, …
## $ exec_fullname        <chr> "L. Gregory Ballard", "Robert L. Crandall", "Dona…
## $ departure_code       <dbl> 7, 5, 3, 5, 5, 5, 5, 7, 9, 5, 5, 5, 3, 5, 5, 3, 3…
## $ ceo_dismissal        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, NA, 0, 0, 0, 1, 0, 0, 1, …
## $ interim_coceo        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ tenure_no_ceodb      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ max_tenure_ceodb     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ fyear_gone           <dbl> 2003, 1998, 2003, 1998, 2002, 1997, 1993, 1998, 1…
## $ leftofc              <dttm> 2003-03-21, 1998-05-20, 2003-04-24, 1998-12-31, …
## $ still_there          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ notes                <chr> "Ballard took over when the outgoing CEO said tha…
## $ sources              <chr> "https://www.wsj.com/articles/SB10288576921909334…
## $ eight_ks             <chr> "https://www.sec.gov/Archives/edgar/data/850519/0…
## $ cik                  <dbl> 850519, 6201, 6201, 1800, 2488, 1122304, 771667, …
## $ `_merge`             <chr> "matched (3)", "matched (3)", "matched (3)", "mat…

Initialize H2O

h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\rad1081\AppData\Local\Temp\Rtmp2pFSKY\file51b437a13422/h2o_rad1081_started_from_r.out
##     C:\Users\rad1081\AppData\Local\Temp\Rtmp2pFSKY\file51b44a802a39/h2o_rad1081_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 505 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_rad1081_byq922 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.91 GB 
##     H2O cluster total cores:    20 
##     H2O cluster allowed cores:  20 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

Clean and preprocess data

departures_clean <- departures_raw %>%
  filter(!is.na(ceo_dismissal)) %>%
  mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "Dismissed", "Not_Dismissed"),
         ceo_dismissal = as.factor(ceo_dismissal)) %>%
  select(-c(interim_coceo, still_there, eight_ks, sources, `_merge`, 
            departure_code, gvkey, cik, co_per_rol)) %>%
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  filter(fyear_gone < 2025) %>%
  mutate(across(where(is.character), as.factor),
         notes = as.character(notes))

Split data

set.seed(123)
data_split <- initial_split(departures_clean, prop = 0.8, strata = ceo_dismissal)
train_data <- training(data_split)
test_data  <- testing(data_split)

Convert to H2O format

train_h2o <- as.h2o(train_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
test_h2o  <- as.h2o(test_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

Train AutoML Model

h2o_model <- h2o.automl(
  x = setdiff(names(train_h2o), "ceo_dismissal"),
  y = "ceo_dismissal",
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
##   |                                                                              |                                                                      |   0%  |                                                                              |=                                                                     |   1%
## 19:56:33.68: AutoML: XGBoost is not available; skipping it.
## 19:56:33.89: _train param, Dropping bad and constant columns: [notes]  |                                                                              |=                                                                     |   2%
## 19:56:41.384: _train param, Dropping bad and constant columns: [notes]  |                                                                              |==                                                                    |   4%  |                                                                              |====                                                                  |   6%
## 19:56:44.718: _train param, Dropping bad and constant columns: [notes]  |                                                                              |======                                                                |   8%
## 19:56:48.636: _train param, Dropping bad and constant columns: [notes]  |                                                                              |========                                                              |  12%
## 19:56:50.385: _train param, Dropping bad and constant columns: [notes]
## 19:56:52.236: _train param, Dropping bad and constant columns: [notes]  |                                                                              |===========                                                           |  15%
## 19:56:55.267: _train param, Dropping bad and constant columns: [notes]  |                                                                              |=============                                                         |  18%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%
## 19:57:01.169: _train param, Dropping bad and constant columns: [notes]
## 19:57:02.752: _train param, Dropping bad and constant columns: [notes]  |                                                                              |=================                                                     |  24%  |                                                                              |==================                                                    |  26%  |                                                                              |=================================                                     |  47%
## 19:57:10.102: _train param, Dropping unused columns: [notes]  |                                                                              |===================================                                   |  50%
## 19:57:12.35: _train param, Dropping unused columns: [notes]  |                                                                              |======================================================================| 100%

View leaderboard of models

h2o_leaderboard <- h2o_model@leaderboard
print(h2o_leaderboard)
##                                      model_id mean_per_class_error   logloss
## 1              GBM_3_AutoML_1_20250504_195633            0.6612120 0.5812939
## 2              GBM_1_AutoML_1_20250504_195633            0.6619940 0.5931049
## 3              GBM_4_AutoML_1_20250504_195633            0.6620580 0.6208925
## 4              GBM_2_AutoML_1_20250504_195633            0.6620720 0.5677400
## 5              GBM_5_AutoML_1_20250504_195633            0.6621388 0.5469470
## 6 GBM_grid_1_AutoML_1_20250504_195633_model_1            0.6627705 0.5929122
##        rmse       mse
## 1 0.4183904 0.1750505
## 2 0.4205756 0.1768838
## 3 0.4235133 0.1793635
## 4 0.4154369 0.1725878
## 5 0.4107871 0.1687461
## 6 0.4193306 0.1758381
## 
## [12 rows x 5 columns]

Evaluate leader model

best_model <- h2o_model@leader
performance <- h2o.performance(best_model, newdata = test_h2o)

cat("AUC:", h2o.auc(performance), "\n")
## AUC: NaN
h2o.confusionMatrix(performance)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               1 Dismissed Not_Dismissed  Error          Rate
## 1             0         0             0     NA =       0 / 0
## Dismissed     0        13           284 0.9562 =   284 / 297
## Not_Dismissed 0        35          1164 0.0292 =  35 / 1,199
## Totals        0        48          1448 0.2132 = 319 / 1,496

Predict on test set

predictions <- h2o.predict(best_model, test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["ACNIELSEN CORP",
## "ADDINGTON RESOURCES INC", "ADT CORP", "ADTRAN INC", "ADVANCED TISSUE SCI -CL
## A", "AEROFLEX INC", "AIR EXPRESS INTERNATIONAL CP", "ALEXANDRIA R E EQUITIES
## INC", "ALLEN TELECOM INC", "ALLSCRIPTS HEALTHCARE SOLTNS", ...284 not
## listed..., "WESLEY JESSEN VISIONCARE INC", "WESTROCK CO", "WET SEAL INC",
## "WHITNEY HOLDING CORP", "WOODWARD INC", "WORLD FUEL SERVICES CORP", "XPO
## LOGISTICS INC", "YOUNKERS INC", "ZEBRA TECHNOLOGIES CP -CL A", "ZENITH NATIONAL
## INSURANCE CP"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'exec_fullname' has levels not trained on: ["A. Eugene Sapp,
## Jr.", "A. George (Skip) Battle", "A. Lanham Napier", "A. Laurence Jones", "A.
## Malachi Mixon III", "Adrian Adams", "Ahmad R. Chatila", "Alain A. Couder",
## "Alain Juan Pablo Belda", "Alan C. Greenberg", ...1307 not listed..., "William
## Thomas Dillard", "William V. Campbell", "William V. Larkin Jr.", "William V.
## Stephenson", "William White Adams", "Willliam T. Jensen", "Wilson B. Sexton",
## "Wilson W. Cheung", "Wilson Wilde", "Ying Lu"]
head(predictions)
##         predict         p1  Dismissed Not_Dismissed
## 1 Not_Dismissed 0.01051693 0.05235578     0.9371273
## 2 Not_Dismissed 0.01169119 0.09276919     0.8955396
## 3 Not_Dismissed 0.02515968 0.30383908     0.6710012
## 4 Not_Dismissed 0.01003835 0.04618693     0.9437747
## 5 Not_Dismissed 0.01110377 0.05351163     0.9353846
## 6 Not_Dismissed 0.05069571 0.40308204     0.5462222

Plot variable importance

h2o.varimp_plot(best_model)

Shutdown H2O when done

h2o.shutdown(prompt = FALSE)