ceo_data_cleaned <- read_csv("../00_data/data_wrangled/data_clean2.csv") %>%
clean_names() %>%
filter(fyear_gone < 2025) %>%
select(-c(dismissal_dataset_id, coname, exec_fullname, notes)) %>%
mutate(
ceo_dismissal = as.factor(ceo_dismissal),
tenure_no_ceodb = as.factor(tenure_no_ceodb),
max_tenure_ceodb = as.factor(max_tenure_ceodb),
fyear_gone = as.factor(fyear_gone)
) %>%
na.omit()
## New names:
## Rows: 7458 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): coname, exec_fullname, ceo_dismissal, notes dbl (5): ...1,
## dismissal_dataset_id, tenure_no_ceodb, max_tenure_ceodb, fyea...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Build a recipe
ceo_recipe <- recipe(ceo_dismissal ~ ., data = ceo_data_cleaned) %>%
step_smote(ceo_dismissal) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors())
Initialize H2o
h2o.init(max_mem_size = "2G")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 13 minutes 2 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 12 days
## H2O cluster name: H2O_started_from_R_User_jzv796
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.95 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
Convert to H2O Frame and Split
ceo_h2o <- as.h2o(ceo_data_cleaned)
## | | | 0% | |======================================================================| 100%
splits <- h2o.splitFrame(ceo_h2o, ratios = 0.8, seed = 123)
train_h2o <- splits[[1]]
test_h2o <- splits[[2]]
Run AutoML
automl_model <- h2o.automl(
x = setdiff(names(ceo_data_cleaned), "ceo_dismissal"),
y = "ceo_dismissal",
training_frame = train_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 30,
nfolds = 5,
balance_classes = TRUE,
seed = 123
)
## | | | 0% | |=== | 4%
## 23:33:30.66: AutoML: XGBoost is not available; skipping it. | |========== | 15% | |================= | 24% | |======================== | 34% | |=============================== | 44% | |====================================== | 54% | |============================================ | 63% | |=========================================================== | 84% | |================================================================== | 94% | |======================================================================| 100%
View Results
h2o.get_leaderboard(automl_model)
## model_id auc logloss
## 1 GBM_1_AutoML_2_20250502_233330 0.6089564 0.5055466
## 2 StackedEnsemble_BestOfFamily_1_AutoML_2_20250502_233330 0.6060356 0.5079420
## 3 StackedEnsemble_BestOfFamily_4_AutoML_2_20250502_233330 0.6040286 0.5079082
## 4 StackedEnsemble_BestOfFamily_2_AutoML_2_20250502_233330 0.6033647 0.5078857
## 5 StackedEnsemble_BestOfFamily_3_AutoML_2_20250502_233330 0.6025420 0.5069875
## 6 GBM_grid_1_AutoML_2_20250502_233330_model_12 0.6021638 0.5082698
## aucpr mean_per_class_error rmse mse
## 1 0.8461409 0.5000000 0.4041808 0.1633621
## 2 0.8440028 0.4983974 0.4053327 0.1642946
## 3 0.8455982 0.4983974 0.4054086 0.1643561
## 4 0.8455955 0.4983974 0.4054101 0.1643574
## 5 0.8457215 0.4983974 0.4050417 0.1640588
## 6 0.8385058 0.5000000 0.4052812 0.1642528
##
## [33 rows x 7 columns]
Evaluate the Best Model
best_model <- automl_model@leader
perf <- h2o.performance(best_model, newdata = test_h2o)
cat("AUC:", h2o.auc(perf), "\n")
## AUC: 0.6089564
cat("Log Loss:", h2o.logloss(perf), "\n")
## Log Loss: 0.5055466
h2o.confusionMatrix(perf)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.638242671560943:
## dismissed not_dis Error Rate
## dismissed 0 312 1.000000 =312/312
## not_dis 0 1161 0.000000 =0/1161
## Totals 0 1473 0.211813 =312/1473
Predict
preds <- h2o.predict(best_model, test_h2o)
## | | | 0% | |======================================================================| 100%
head(as.data.frame(preds))
## predict dismissed not_dis
## 1 not_dis 0.1728157 0.8271843
## 2 dismissed 0.2445543 0.7554457
## 3 dismissed 0.2838546 0.7161454
## 4 dismissed 0.2838546 0.7161454
## 5 not_dis 0.1728157 0.8271843
## 6 not_dis 0.1728157 0.8271843
Shutdown H2O
h2o.shutdown(prompt = FALSE)