# Load the wrangled CEO-departure data and prepare it for modelling:
#   1. standardise column names with clean_names()
#   2. keep rows whose fyear_gone is before 2025
#   3. drop identifier / free-text columns so they cannot act as predictors
#   4. drop the unnamed CSV index column; read_csv() reports it as `...1`
#      and clean_names() is expected to rename it to `x1` (verify with
#      names()). It is a row counter, not a feature, and would otherwise be
#      picked up by every downstream "all other columns" predictor spec.
#   5. convert the outcome and the remaining columns to factors
#   6. remove rows containing any missing value
ceo_data_cleaned <- read_csv("../00_data/data_wrangled/data_clean2.csv") %>%
  clean_names() %>%
  filter(fyear_gone < 2025) %>%
  select(
    -c(dismissal_dataset_id, coname, exec_fullname, notes),
    -any_of("x1") # any_of() is a no-op when the index column is absent
  ) %>%
  mutate(
    across(
      c(ceo_dismissal, tenure_no_ceodb, max_tenure_ceodb, fyear_gone),
      as.factor
    )
  ) %>%
  na.omit()
## New names:
## Rows: 7458 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): coname, exec_fullname, ceo_dismissal, notes dbl (5): ...1,
## dismissal_dataset_id, tenure_no_ceodb, max_tenure_ceodb, fyea...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Build a recipe

# Preprocessing recipe with ceo_dismissal as the outcome and every other
# column of ceo_data_cleaned as a predictor.
#
# Step order matters: step_smote() requires all predictors to be numeric,
# so the factor predictors are dummy-encoded (and the numeric columns
# normalised) first, and SMOTE over-sampling of the outcome runs last.
#
# NOTE(review): nothing in the visible part of this document preps or bakes
# this recipe; the AutoML run is trained on ceo_data_cleaned directly.
ceo_recipe <- recipe(ceo_dismissal ~ ., data = ceo_data_cleaned) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(ceo_dismissal)

Initialize H2O

# Start (or connect to) an H2O cluster, requesting a 2 GB memory cap.
# NOTE(review): the recorded output reports 13 minutes of uptime and 3.95 GB
# total memory, which suggests this call attached to an already-running
# cluster, so the 2G request presumably did not take effect — confirm.
h2o.init(max_mem_size = "2G")
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         13 minutes 2 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 12 days 
##     H2O cluster name:           H2O_started_from_R_User_jzv796 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.95 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

Convert to H2O Frame and Split

# Upload the cleaned data frame to the H2O cluster as an H2O frame.
ceo_h2o <- as.h2o(ceo_data_cleaned)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Randomly partition the H2O frame: roughly 80% for training and the
# remainder held out for testing. The seed fixes the split.
splits <- h2o.splitFrame(
  data = ceo_h2o,
  ratios = 0.8,
  seed = 123
)
train_h2o <- splits[[1L]] # first partition: training rows
test_h2o <- splits[[2L]] # second partition: held-out rows

Run AutoML

# Run H2O AutoML to predict ceo_dismissal from every other column.
#
# No leaderboard_frame is supplied on purpose: with nfolds = 5 the
# leaderboard is then ranked on cross-validation metrics, so test_h2o is
# never used to choose the leader and stays an untouched hold-out set for
# the final evaluation. Ranking on test_h2o and then reporting the leader's
# score on that same frame would give an optimistically biased estimate.
#
# balance_classes = TRUE asks H2O to re-balance the class distribution of
# the training data. The run is limited to 30 seconds of wall-clock time.
# NOTE(review): a time-based budget means the set of models trained can
# differ between runs even with a fixed seed; consider max_models if exact
# reproducibility is needed.
automl_model <- h2o.automl(
  x = setdiff(names(ceo_data_cleaned), "ceo_dismissal"),
  y = "ceo_dismissal",
  training_frame = train_h2o,
  max_runtime_secs = 30,
  nfolds = 5,
  balance_classes = TRUE,
  seed = 123
)
##   |                                                                              |                                                                      |   0%  |                                                                              |===                                                                   |   4%
## 23:33:30.66: AutoML: XGBoost is not available; skipping it.  |                                                                              |==========                                                            |  15%  |                                                                              |=================                                                     |  24%  |                                                                              |========================                                              |  34%  |                                                                              |===============================                                       |  44%  |                                                                              |======================================                                |  54%  |                                                                              |============================================                          |  63%  |                                                                              |===========================================================           |  84%  |                                                                              |==================================================================    |  94%  |                                                                              |======================================================================| 100%

View Results

# Print the leaderboard of models produced by the AutoML run.
h2o.get_leaderboard(automl_model)
##                                                  model_id       auc   logloss
## 1                          GBM_1_AutoML_2_20250502_233330 0.6089564 0.5055466
## 2 StackedEnsemble_BestOfFamily_1_AutoML_2_20250502_233330 0.6060356 0.5079420
## 3 StackedEnsemble_BestOfFamily_4_AutoML_2_20250502_233330 0.6040286 0.5079082
## 4 StackedEnsemble_BestOfFamily_2_AutoML_2_20250502_233330 0.6033647 0.5078857
## 5 StackedEnsemble_BestOfFamily_3_AutoML_2_20250502_233330 0.6025420 0.5069875
## 6            GBM_grid_1_AutoML_2_20250502_233330_model_12 0.6021638 0.5082698
##       aucpr mean_per_class_error      rmse       mse
## 1 0.8461409            0.5000000 0.4041808 0.1633621
## 2 0.8440028            0.4983974 0.4053327 0.1642946
## 3 0.8455982            0.4983974 0.4054086 0.1643561
## 4 0.8455955            0.4983974 0.4054101 0.1643574
## 5 0.8457215            0.4983974 0.4050417 0.1640588
## 6 0.8385058            0.5000000 0.4052812 0.1642528
## 
## [33 rows x 7 columns]

Evaluate the Best Model

# Take the top-ranked AutoML model and score it on the held-out frame.
best_model <- automl_model@leader
perf <- h2o.performance(model = best_model, newdata = test_h2o)

# Area under the ROC curve computed from that performance object.
test_auc <- h2o.auc(perf)
cat("AUC:", test_auc, "\n")
## AUC: 0.6089564
# Log loss taken from the same performance object (test_h2o).
cat("Log Loss:", h2o.logloss(perf), "\n")
## Log Loss: 0.5055466
# Confusion matrix for the performance object at H2O's default threshold.
# NOTE(review): in the recorded output every test row is classified as
# not_dis (312 of 312 dismissed cases missed), so at this threshold the
# leader behaves like a majority-class predictor.
h2o.confusionMatrix(perf)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.638242671560943:
##           dismissed not_dis    Error       Rate
## dismissed         0     312 1.000000   =312/312
## not_dis           0    1161 0.000000    =0/1161
## Totals            0    1473 0.211813  =312/1473

Predict

# Score the held-out frame with the leader model; the result holds the
# predicted label plus one probability column per class.
preds <- h2o.predict(best_model, test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Pull the predictions into an R data frame and show the first rows.
head(as.data.frame(preds))
##     predict dismissed   not_dis
## 1   not_dis 0.1728157 0.8271843
## 2 dismissed 0.2445543 0.7554457
## 3 dismissed 0.2838546 0.7161454
## 4 dismissed 0.2838546 0.7161454
## 5   not_dis 0.1728157 0.8271843
## 6   not_dis 0.1728157 0.8271843

Shutdown H2O

# Shut down the H2O cluster without asking for interactive confirmation.
h2o.shutdown(prompt = FALSE)