library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'scales' was built under R version 4.3.1
## Warning: package 'infer' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(themis) # For SMOTE
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.1
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(agua)
## Warning: package 'agua' was built under R version 4.3.3
##
## Attaching package: 'agua'
##
## The following object is masked from 'package:workflowsets':
##
## rank_results
ceo_data_raw <- read_csv("../00_data/data_wrangled/data_clean2.csv") # Replace with actual dataset path
## Rows: 501 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): still_there, notes
## dbl (7): fyear, co_per_rol, departure_code, ceo_dismissal, tenure_no_ceodb,...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# If data is not sensitive:
data %>% glimpse()
## function (..., list = character(), package = NULL, lib.loc = NULL, verbose = getOption("verbose"),
## envir = .GlobalEnv, overwrite = TRUE)
# Initialize H2O
h2o.init(max_mem_size = "4G")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/54/8ptbpppx6yl7x4zf4sjpnr6r0000gn/T//RtmpTx4HF3/file158fb662d8744/h2o_jobboonstoppel_started_from_r.out
## /var/folders/54/8ptbpppx6yl7x4zf4sjpnr6r0000gn/T//RtmpTx4HF3/file158fb370bb5c4/h2o_jobboonstoppel_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 178 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 19 days
## H2O cluster name: H2O_started_from_R_jobboonstoppel_onf375
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.99 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.0 (2023-04-21)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 19 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Ensure cluster information is displayed
h2o.clusterInfo()
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 229 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 19 days
## H2O cluster name: H2O_started_from_R_jobboonstoppel_onf375
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.99 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.0 (2023-04-21)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 19 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Clean and Prepare the Data ----
ceo_data_cleaned <- ceo_data_raw %>%
clean_names() %>% # Clean column names for easier reference
mutate(
ceo_dismissal = as.factor(ceo_dismissal), # Convert target variable to factor
still_there = as.Date(still_there, format = "%d%b%Y"), # Convert dates to Date type
leftofc = as.Date(leftofc) # Convert dates to Date type
) %>%
mutate(
still_there_numeric = as.numeric(still_there - as.Date("1970-01-01")), # Convert dates to numeric
leftofc_numeric = as.numeric(leftofc - as.Date("1970-01-01")),
tenure_diff = tenure_no_ceodb - max_tenure_ceodb, # Example: Create a new feature
time_diff = still_there_numeric - leftofc_numeric # Difference between time variables
) %>%
select(-still_there, -leftofc, -notes) %>% # Drop original date columns and non-informative columns
drop_na()
# Define Recipe with SMOTE ----
data_recipe <- recipe(ceo_dismissal ~ ., data = ceo_data_cleaned) %>%
step_smote(ceo_dismissal, over_ratio = 1) %>% # Apply SMOTE to balance classes
step_normalize(all_numeric_predictors()) %>% # Normalize numeric predictors
step_dummy(all_nominal_predictors(), -all_outcomes()) # One-hot encoding for categorical va
# Convert to H2O Frame ----
ceo_data_h2o <- as.h2o(ceo_data_cleaned)
##
|
| | 0%
|
|======================================================================| 100%
# Split the Data into Training and Testing Sets ----
splits <- h2o.splitFrame(ceo_data_h2o, ratios = 0.8, seed = 123)
train_data_h2o <- splits[[1]]
test_data_h2o <- splits[[2]]
# Run H2O AutoML with Class Balancing ----
ceo_automl <- h2o.automl(
x = setdiff(names(ceo_data_cleaned), "ceo_dismissal"), # Predictor columns
y = "ceo_dismissal", # Response column
training_frame = train_data_h2o,
leaderboard_frame = test_data_h2o, # Use test data for leaderboard
max_runtime_secs = 30, # Increased runtime
nfolds = 5, # Cross-validation folds
balance_classes = TRUE, # Enable class balancing
seed = 123
)
##
|
| | 0%
|
|== | 4%
## 14:34:43.702: AutoML: XGBoost is not available; skipping it.
|
|======= | 10%
|
|============ | 17%
|
|================= | 24%
|
|===================== | 31%
|
|========================== | 37%
|
|=============================== | 44%
|
|==================================== | 51%
|
|======================================== | 58%
|
|============================================= | 64%
|
|================================================== | 71%
|
|====================================================== | 78%
|
|=========================================================== | 85%
|
|================================================================ | 91%
|
|===================================================================== | 98%
|
|======================================================================| 100%
# Display the AutoML leaderboard ----
leaderboard <- h2o.get_leaderboard(ceo_automl, extra_columns = "ALL")
print(leaderboard)
## model_id auc logloss aucpr
## 1 GLM_1_AutoML_1_20241210_143443 0 9.303127e-07 0
## 2 StackedEnsemble_AllModels_1_AutoML_1_20241210_143443 0 5.244989e-06 0
## 3 GBM_grid_1_AutoML_1_20241210_143443_model_6 0 4.729219e-11 0
## 4 DeepLearning_grid_1_AutoML_1_20241210_143443_model_3 0 4.941228e-03 0
## 5 GBM_grid_1_AutoML_1_20241210_143443_model_2 0 1.286008e-04 0
## 6 DRF_1_AutoML_1_20241210_143443 0 2.221480e-05 0
## mean_per_class_error rmse mse training_time_ms
## 1 NaN 1.158411e-06 1.341915e-12 36
## 2 NaN 5.374627e-06 2.888662e-11 125
## 3 NaN 4.014160e-10 1.611348e-19 306
## 4 NaN 8.959368e-03 8.027028e-05 8
## 5 NaN 8.845956e-04 7.825093e-07 96
## 6 NaN 1.231186e-04 1.515818e-08 32
## predict_time_per_row_ms algo
## 1 0.017346 GLM
## 2 0.067589 StackedEnsemble
## 3 0.041548 GBM
## 4 0.007779 DeepLearning
## 5 0.015294 GBM
## 6 0.008084 DRF
##
## [46 rows x 10 columns]
# Evaluate the Best Model ----
best_model <- ceo_automl@leader
performance <- h2o.performance(best_model, newdata = test_data_h2o)
# Display metrics
cat("AUC: ", h2o.auc(performance), "\n")
## AUC: 0
cat("Log Loss: ", h2o.logloss(performance), "\n")
## Log Loss: 9.303127e-07
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(h2o.confusionMatrix(performance))
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = NaN:
## 0 1 Error Rate
## 0 NA NA NA =NA/NA
## 1 NA NA NA =NA/NA
## Totals NA NA NA =NA/NA
# Make Predictions ----
predictions <- h2o.predict(best_model, test_data_h2o)
##
|
| | 0%
|
|======================================================================| 100%
# Convert predictions back to R data frame
predictions_df <- as.data.frame(predictions)
head(predictions_df)
## predict p0 p1
## 1 0 0.9999996 4.254148e-07
## 2 0 0.9999985 1.459224e-06
## 3 0 0.9999984 1.579849e-06
## 4 0 0.9999991 8.887868e-07
## 5 0 0.9999972 2.818016e-06
## 6 0 0.9999982 1.849683e-06
# Shut Down H2O ----
h2o.shutdown(prompt = FALSE)