Data

# Load the TidyTuesday (2021-04-27) CEO departures data directly from GitHub.
departures_raw <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv"
)

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Quick structural overview: one line per column with its type and first values.
departures_raw %>%
  glimpse()

## Rows: 9,423
## Columns: 19
## $ dismissal_dataset_id <dbl> 559043, 12, 13, 31, 43, 51, 61, 63, 62, 65, 75, 7…
## $ coname               <chr> "SONICBLUE INC", "AMERICAN AIRLINES GROUP INC", "…
## $ gvkey                <dbl> 27903, 1045, 1045, 1078, 1161, 1177, 1194, 1194, …
## $ fyear                <dbl> 2002, 1997, 2002, 1998, 2001, 1997, 1993, 1997, 1…
## $ co_per_rol           <dbl> -1, 1, 3, 6, 11, 16, 21, 22, 24, 28, 33, 34, 38, …
## $ exec_fullname        <chr> "L. Gregory Ballard", "Robert L. Crandall", "Dona…
## $ departure_code       <dbl> 7, 5, 3, 5, 5, 5, 5, 7, 9, 5, 5, 5, 3, 5, 5, 3, 3…
## $ ceo_dismissal        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, NA, 0, 0, 0, 1, 0, 0, 1, …
## $ interim_coceo        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ tenure_no_ceodb      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ max_tenure_ceodb     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ fyear_gone           <dbl> 2003, 1998, 2003, 1998, 2002, 1997, 1993, 1998, 1…
## $ leftofc              <dttm> 2003-03-21, 1998-05-20, 2003-04-24, 1998-12-31, …
## $ still_there          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ notes                <chr> "Ballard took over when the outgoing CEO said tha…
## $ sources              <chr> "https://www.wsj.com/articles/SB10288576921909334…
## $ eight_ks             <chr> "https://www.sec.gov/Archives/edgar/data/850519/0…
## $ cik                  <dbl> 850519, 6201, 6201, 1800, 2488, 1122304, 771667, …
## $ `_merge`             <chr> "matched (3)", "matched (3)", "matched (3)", "mat…

Initialize H2O

# Connect to an H2O cluster with default settings; per the rendered log below,
# a local JVM is started when none is running yet.
h2o.init()

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\rad1081\AppData\Local\Temp\Rtmp2pFSKY\file51b437a13422/h2o_rad1081_started_from_r.out
##     C:\Users\rad1081\AppData\Local\Temp\Rtmp2pFSKY\file51b44a802a39/h2o_rad1081_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 505 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_rad1081_byq922 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.91 GB 
##     H2O cluster total cores:    20 
##     H2O cluster allowed cores:  20 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.1 (2024-06-14 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

Clean and preprocess data

# Build the modelling table from the raw departures data.
#   * keep only rows that carry a dismissal label and recode it as a two-level
#     factor so that it is modelled as a classification target;
#   * drop bookkeeping columns that are not wanted downstream;
#   * keep the first row per `dismissal_dataset_id`;
#   * keep rows with `fyear_gone` before 2025 (filter() also discards rows
#     where `fyear_gone` is NA) -- presumably guards against mis-entered
#     years; confirm against the data dictionary;
#   * turn the remaining character columns into factors.
#
# The free-text `notes` column is dropped here as well. H2O discards it as a
# "bad" column during training anyway, and carrying long multi-line text into
# the H2O export is the likely cause of the stray third response level ("1")
# and the NaN AUC seen when this report was rendered -- TODO confirm by
# checking h2o.levels(train_h2o$ceo_dismissal) after re-running.
departures_clean <- departures_raw %>%
  filter(!is.na(ceo_dismissal)) %>%
  mutate(
    ceo_dismissal = factor(
      if_else(ceo_dismissal == 1, "Dismissed", "Not_Dismissed"),
      levels = c("Dismissed", "Not_Dismissed")
    )
  ) %>%
  select(-c(interim_coceo, still_there, eight_ks, sources, `_merge`,
            departure_code, gvkey, cik, co_per_rol, notes)) %>%
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  filter(fyear_gone < 2025) %>%
  mutate(across(where(is.character), as.factor))

Split data

# Fix the RNG so the 80/20 partition is the same on every run.
set.seed(123)

# Stratify on the outcome so both partitions keep a similar dismissal rate.
data_split <- departures_clean %>%
  initial_split(prop = 0.8, strata = ceo_dismissal)

train_data <- data_split %>% training()
test_data <- data_split %>% testing()

Convert to H2O format

# Upload the training partition to the H2O cluster as an H2O frame.
train_h2o <- as.h2o(train_data)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Upload the held-out test partition to the H2O cluster as an H2O frame.
test_h2o  <- as.h2o(test_data)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

Train AutoML Model

# Run H2O AutoML (at most 10 models, fixed seed) to predict `ceo_dismissal`.
#
# Identifier-like and free-text columns are kept out of the predictor set:
#   * `dismissal_dataset_id` is a record ID;
#   * `coname` / `exec_fullname` are near-unique names -- when this report was
#     rendered, scoring the test set warned about hundreds of levels that were
#     never seen in training, so they cannot generalise;
#   * `notes` is free text that H2O drops as a "bad" column anyway.
# setdiff() ignores names that are not present, so this is safe even if some
# of these columns were already removed upstream.
h2o_model <- h2o.automl(
  x = setdiff(
    names(train_h2o),
    c("ceo_dismissal", "dismissal_dataset_id", "coname", "exec_fullname",
      "notes")
  ),
  y = "ceo_dismissal",
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)

##   |                                                                              |                                                                      |   0%  |                                                                              |=                                                                     |   1%
## 19:56:33.68: AutoML: XGBoost is not available; skipping it.
## 19:56:33.89: _train param, Dropping bad and constant columns: [notes]  |                                                                              |=                                                                     |   2%
## 19:56:41.384: _train param, Dropping bad and constant columns: [notes]  |                                                                              |==                                                                    |   4%  |                                                                              |====                                                                  |   6%
## 19:56:44.718: _train param, Dropping bad and constant columns: [notes]  |                                                                              |======                                                                |   8%
## 19:56:48.636: _train param, Dropping bad and constant columns: [notes]  |                                                                              |========                                                              |  12%
## 19:56:50.385: _train param, Dropping bad and constant columns: [notes]
## 19:56:52.236: _train param, Dropping bad and constant columns: [notes]  |                                                                              |===========                                                           |  15%
## 19:56:55.267: _train param, Dropping bad and constant columns: [notes]  |                                                                              |=============                                                         |  18%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%
## 19:57:01.169: _train param, Dropping bad and constant columns: [notes]
## 19:57:02.752: _train param, Dropping bad and constant columns: [notes]  |                                                                              |=================                                                     |  24%  |                                                                              |==================                                                    |  26%  |                                                                              |=================================                                     |  47%
## 19:57:10.102: _train param, Dropping unused columns: [notes]  |                                                                              |===================================                                   |  50%
## 19:57:12.35: _train param, Dropping unused columns: [notes]  |                                                                              |======================================================================| 100%

View leaderboard of models

# Pull the ranked model table out of the AutoML result and show it.
h2o_leaderboard <- slot(h2o_model, "leaderboard")
print(h2o_leaderboard)

##                                      model_id mean_per_class_error   logloss
## 1              GBM_3_AutoML_1_20250504_195633            0.6612120 0.5812939
## 2              GBM_1_AutoML_1_20250504_195633            0.6619940 0.5931049
## 3              GBM_4_AutoML_1_20250504_195633            0.6620580 0.6208925
## 4              GBM_2_AutoML_1_20250504_195633            0.6620720 0.5677400
## 5              GBM_5_AutoML_1_20250504_195633            0.6621388 0.5469470
## 6 GBM_grid_1_AutoML_1_20250504_195633_model_1            0.6627705 0.5929122
##        rmse       mse
## 1 0.4183904 0.1750505
## 2 0.4205756 0.1768838
## 3 0.4235133 0.1793635
## 4 0.4154369 0.1725878
## 5 0.4107871 0.1687461
## 6 0.4193306 0.1758381
## 
## [12 rows x 5 columns]

Evaluate leader model

# The leader is the top-ranked model on the AutoML leaderboard.
best_model <- slot(h2o_model, "leader")

# Score the leader on the held-out test frame.
performance <- h2o.performance(model = best_model, newdata = test_h2o)

# Print the test-set AUC of the leader model.
# NOTE(review): the rendered output below is NaN and the confusion matrix has
# three classes, so the response was apparently not binomial -- confirm the
# response levels in the H2O frame.
cat("AUC:", h2o.auc(performance), "\n")

## AUC: NaN

# Actual-vs-predicted class counts on the test set, with per-class error rates.
h2o.confusionMatrix(performance)

## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               1 Dismissed Not_Dismissed  Error          Rate
## 1             0         0             0     NA =       0 / 0
## Dismissed     0        13           284 0.9562 =   284 / 297
## Not_Dismissed 0        35          1164 0.0292 =  35 / 1,199
## Totals        0        48          1448 0.2132 = 319 / 1,496

Predict on test set

# Score the test frame with the leader model; the result holds the predicted
# class plus one probability column per class.
predictions <- h2o.predict(object = best_model, newdata = test_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["ACNIELSEN CORP",
## "ADDINGTON RESOURCES INC", "ADT CORP", "ADTRAN INC", "ADVANCED TISSUE SCI -CL
## A", "AEROFLEX INC", "AIR EXPRESS INTERNATIONAL CP", "ALEXANDRIA R E EQUITIES
## INC", "ALLEN TELECOM INC", "ALLSCRIPTS HEALTHCARE SOLTNS", ...284 not
## listed..., "WESLEY JESSEN VISIONCARE INC", "WESTROCK CO", "WET SEAL INC",
## "WHITNEY HOLDING CORP", "WOODWARD INC", "WORLD FUEL SERVICES CORP", "XPO
## LOGISTICS INC", "YOUNKERS INC", "ZEBRA TECHNOLOGIES CP -CL A", "ZENITH NATIONAL
## INSURANCE CP"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'exec_fullname' has levels not trained on: ["A. Eugene Sapp,
## Jr.", "A. George (Skip) Battle", "A. Lanham Napier", "A. Laurence Jones", "A.
## Malachi Mixon III", "Adrian Adams", "Ahmad R. Chatila", "Alain A. Couder",
## "Alain Juan Pablo Belda", "Alan C. Greenberg", ...1307 not listed..., "William
## Thomas Dillard", "William V. Campbell", "William V. Larkin Jr.", "William V.
## Stephenson", "William White Adams", "Willliam T. Jensen", "Wilson B. Sexton",
## "Wilson W. Cheung", "Wilson Wilde", "Ying Lu"]

# Peek at the first few predictions.
head(predictions)

##         predict         p1  Dismissed Not_Dismissed
## 1 Not_Dismissed 0.01051693 0.05235578     0.9371273
## 2 Not_Dismissed 0.01169119 0.09276919     0.8955396
## 3 Not_Dismissed 0.02515968 0.30383908     0.6710012
## 4 Not_Dismissed 0.01003835 0.04618693     0.9437747
## 5 Not_Dismissed 0.01110377 0.05351163     0.9353846
## 6 Not_Dismissed 0.05069571 0.40308204     0.5462222

Plot variable importance

# Plot the variable importances reported by the leader model.
h2o.varimp_plot(best_model)

Shutdown H2O when done

# Stop the H2O cluster; prompt = FALSE skips the interactive confirmation so
# the document can be knit non-interactively.
h2o.shutdown(prompt = FALSE)

Apply 12

Ronja Dahlin

2025-05-04

Data

Initialize H2O

Clean and preprocess data

Split data

Convert to H2O format

Train AutoML Model

View leaderboard of models

Evaluate leader model

Predict on test set

Plot variable importance

Shutdown H2O when done