Data
# Download the TidyTuesday (2021-04-27) CEO departures CSV and parse it
# into a tibble, letting readr guess the column types.
departures_raw <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv"
)
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a compact, one-line-per-column overview (name, type, first values)
# of the raw data to sanity-check the import.
glimpse(departures_raw)
## Rows: 9,423
## Columns: 19
## $ dismissal_dataset_id <dbl> 559043, 12, 13, 31, 43, 51, 61, 63, 62, 65, 75, 7…
## $ coname <chr> "SONICBLUE INC", "AMERICAN AIRLINES GROUP INC", "…
## $ gvkey <dbl> 27903, 1045, 1045, 1078, 1161, 1177, 1194, 1194, …
## $ fyear <dbl> 2002, 1997, 2002, 1998, 2001, 1997, 1993, 1997, 1…
## $ co_per_rol <dbl> -1, 1, 3, 6, 11, 16, 21, 22, 24, 28, 33, 34, 38, …
## $ exec_fullname <chr> "L. Gregory Ballard", "Robert L. Crandall", "Dona…
## $ departure_code <dbl> 7, 5, 3, 5, 5, 5, 5, 7, 9, 5, 5, 5, 3, 5, 5, 3, 3…
## $ ceo_dismissal <dbl> 0, 0, 1, 0, 0, 0, 0, 0, NA, 0, 0, 0, 1, 0, 0, 1, …
## $ interim_coceo <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ tenure_no_ceodb <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ max_tenure_ceodb <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ fyear_gone <dbl> 2003, 1998, 2003, 1998, 2002, 1997, 1993, 1998, 1…
## $ leftofc <dttm> 2003-03-21, 1998-05-20, 2003-04-24, 1998-12-31, …
## $ still_there <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ notes <chr> "Ballard took over when the outgoing CEO said tha…
## $ sources <chr> "https://www.wsj.com/articles/SB10288576921909334…
## $ eight_ks <chr> "https://www.sec.gov/Archives/edgar/data/850519/0…
## $ cik <dbl> 850519, 6201, 6201, 1800, 2488, 1122304, 771667, …
## $ `_merge` <chr> "matched (3)", "matched (3)", "matched (3)", "mat…
Initialize H2O
# Connect to an H2O cluster with default settings; when none is running,
# h2o.init() starts a local one first.
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\rad1081\AppData\Local\Temp\Rtmp2pFSKY\file51b437a13422/h2o_rad1081_started_from_r.out
## C:\Users\rad1081\AppData\Local\Temp\Rtmp2pFSKY\file51b44a802a39/h2o_rad1081_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 505 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 14 days
## H2O cluster name: H2O_started_from_R_rad1081_byq922
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.91 GB
## H2O cluster total cores: 20
## H2O cluster allowed cores: 20
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
Clean and preprocess data
# Build the modelling table from the raw departures data:
#   1. keep only rows whose dismissal outcome is known;
#   2. recode the outcome as a factor (1 -> "Dismissed", otherwise
#      "Not_Dismissed");
#   3. drop the columns that are not carried forward;
#   4. keep the first row for each dismissal_dataset_id;
#   5. keep rows with fyear_gone before 2025 (rows where fyear_gone is NA
#      fail the comparison and are dropped as well);
#   6. convert the character columns to factors, except `notes`, which
#      stays a character column.
departures_clean <- departures_raw %>%
  filter(!is.na(ceo_dismissal)) %>%
  mutate(
    ceo_dismissal = as.factor(
      if_else(ceo_dismissal == 1, "Dismissed", "Not_Dismissed")
    )
  ) %>%
  select(
    -interim_coceo, -still_there, -eight_ks, -sources, -`_merge`,
    -departure_code, -gvkey, -cik, -co_per_rol
  ) %>%
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  filter(fyear_gone < 2025) %>%
  mutate(across(where(is.character) & !notes, as.factor))
Split data
# Reproducible 80/20 train/test split, stratified on the outcome so both
# partitions keep a similar share of dismissals.
set.seed(123)
data_split <- departures_clean %>%
  initial_split(prop = 0.8, strata = ceo_dismissal)
train_data <- data_split %>% training()
test_data <- data_split %>% testing()
Train AutoML Model
# Train an H2O AutoML classifier for `ceo_dismissal` on the H2O training
# frame (at most 10 base models, fixed seed).
#
# Columns that should not act as predictors are excluded explicitly rather
# than passing "everything except the response":
#   - dismissal_dataset_id: one distinct value per row after cleaning, so it
#     can only be memorised, not generalised;
#   - coname, exec_fullname: name columns whose test-set levels are largely
#     absent from training (H2O warns about levels not trained on);
#   - notes: free text that H2O drops as a bad column on every model.
# setdiff() ignores any of these names that are absent from the frame.
non_predictors <- c(
  "ceo_dismissal",
  "dismissal_dataset_id",
  "coname",
  "exec_fullname",
  "notes"
)
h2o_model <- h2o.automl(
  x = setdiff(names(train_h2o), non_predictors),
  y = "ceo_dismissal",
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
## | | | 0% | |= | 1%
## 19:56:33.68: AutoML: XGBoost is not available; skipping it.
## 19:56:33.89: _train param, Dropping bad and constant columns: [notes] | |= | 2%
## 19:56:41.384: _train param, Dropping bad and constant columns: [notes] | |== | 4% | |==== | 6%
## 19:56:44.718: _train param, Dropping bad and constant columns: [notes] | |====== | 8%
## 19:56:48.636: _train param, Dropping bad and constant columns: [notes] | |======== | 12%
## 19:56:50.385: _train param, Dropping bad and constant columns: [notes]
## 19:56:52.236: _train param, Dropping bad and constant columns: [notes] | |=========== | 15%
## 19:56:55.267: _train param, Dropping bad and constant columns: [notes] | |============= | 18% | |============== | 20% | |============== | 21%
## 19:57:01.169: _train param, Dropping bad and constant columns: [notes]
## 19:57:02.752: _train param, Dropping bad and constant columns: [notes] | |================= | 24% | |================== | 26% | |================================= | 47%
## 19:57:10.102: _train param, Dropping unused columns: [notes] | |=================================== | 50%
## 19:57:12.35: _train param, Dropping unused columns: [notes] | |======================================================================| 100%
View leaderboard of models
# Pull the leaderboard (models ranked by AutoML's default metric) out of
# the AutoML result and print it.
h2o_leaderboard <- slot(h2o_model, "leaderboard")
print(h2o_leaderboard)
## model_id mean_per_class_error logloss
## 1 GBM_3_AutoML_1_20250504_195633 0.6612120 0.5812939
## 2 GBM_1_AutoML_1_20250504_195633 0.6619940 0.5931049
## 3 GBM_4_AutoML_1_20250504_195633 0.6620580 0.6208925
## 4 GBM_2_AutoML_1_20250504_195633 0.6620720 0.5677400
## 5 GBM_5_AutoML_1_20250504_195633 0.6621388 0.5469470
## 6 GBM_grid_1_AutoML_1_20250504_195633_model_1 0.6627705 0.5929122
## rmse mse
## 1 0.4183904 0.1750505
## 2 0.4205756 0.1768838
## 3 0.4235133 0.1793635
## 4 0.4154369 0.1725878
## 5 0.4107871 0.1687461
## 6 0.4193306 0.1758381
##
## [12 rows x 5 columns]
Evaluate leader model
# Score the top-ranked AutoML model on the H2O test frame and report AUC.
# NOTE(review): the recorded run prints NaN here and its confusion matrix
# has a third class labelled "1"; presumably the response carries a stray
# level in the H2O frames - confirm where train_h2o/test_h2o are built.
best_model <- slot(h2o_model, "leader")
performance <- h2o.performance(model = best_model, newdata = test_h2o)
cat("AUC:", h2o.auc(performance), "\n")
## AUC: NaN
# Show actual vs. predicted class counts and per-class error rates for the
# test-set performance object.
h2o.confusionMatrix(performance)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 1 Dismissed Not_Dismissed Error Rate
## 1 0 0 0 NA = 0 / 0
## Dismissed 0 13 284 0.9562 = 284 / 297
## Not_Dismissed 0 35 1164 0.0292 = 35 / 1,199
## Totals 0 48 1448 0.2132 = 319 / 1,496
Predict on test set
# Generate class predictions and per-class probabilities for the test frame
# using the leader model.
predictions <- h2o.predict(
  object = best_model,
  newdata = test_h2o
)
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["ACNIELSEN CORP",
## "ADDINGTON RESOURCES INC", "ADT CORP", "ADTRAN INC", "ADVANCED TISSUE SCI -CL
## A", "AEROFLEX INC", "AIR EXPRESS INTERNATIONAL CP", "ALEXANDRIA R E EQUITIES
## INC", "ALLEN TELECOM INC", "ALLSCRIPTS HEALTHCARE SOLTNS", ...284 not
## listed..., "WESLEY JESSEN VISIONCARE INC", "WESTROCK CO", "WET SEAL INC",
## "WHITNEY HOLDING CORP", "WOODWARD INC", "WORLD FUEL SERVICES CORP", "XPO
## LOGISTICS INC", "YOUNKERS INC", "ZEBRA TECHNOLOGIES CP -CL A", "ZENITH NATIONAL
## INSURANCE CP"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'exec_fullname' has levels not trained on: ["A. Eugene Sapp,
## Jr.", "A. George (Skip) Battle", "A. Lanham Napier", "A. Laurence Jones", "A.
## Malachi Mixon III", "Adrian Adams", "Ahmad R. Chatila", "Alain A. Couder",
## "Alain Juan Pablo Belda", "Alan C. Greenberg", ...1307 not listed..., "William
## Thomas Dillard", "William V. Campbell", "William V. Larkin Jr.", "William V.
## Stephenson", "William White Adams", "Willliam T. Jensen", "Wilson B. Sexton",
## "Wilson W. Cheung", "Wilson Wilde", "Ying Lu"]
# Preview the first rows: predicted label plus one probability column per
# class.
head(predictions)
## predict p1 Dismissed Not_Dismissed
## 1 Not_Dismissed 0.01051693 0.05235578 0.9371273
## 2 Not_Dismissed 0.01169119 0.09276919 0.8955396
## 3 Not_Dismissed 0.02515968 0.30383908 0.6710012
## 4 Not_Dismissed 0.01003835 0.04618693 0.9437747
## 5 Not_Dismissed 0.01110377 0.05351163 0.9353846
## 6 Not_Dismissed 0.05069571 0.40308204 0.5462222
Plot variable importance
# Plot the leader model's variable importances.
h2o.varimp_plot(best_model)

Shut down H2O when done
# Stop the H2O cluster; prompt = FALSE skips the interactive confirmation
# so the script can run unattended.
h2o.shutdown(prompt = FALSE)