library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'scales' was built under R version 4.3.1
## Warning: package 'infer' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(themis)    # For SMOTE
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.1
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(agua)
## Warning: package 'agua' was built under R version 4.3.3
## 
## Attaching package: 'agua'
## 
## The following object is masked from 'package:workflowsets':
## 
##     rank_results
# Load the wrangled CEO dataset (path is relative to this script; adjust as needed)
ceo_data_raw <- read_csv(
    file = "../00_data/data_wrangled/data_clean2.csv"
)
## Rows: 501 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): still_there, notes
## dbl  (7): fyear, co_per_rol, departure_code, ceo_dismissal, tenure_no_ceodb,...
## dttm (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# If data is not sensitive, preview the column names, types and first values.
# The object to inspect is `ceo_data_raw`; a bare `data` resolves to the
# built-in utils::data() function, so glimpsing it prints a function signature
# instead of the dataset.
ceo_data_raw %>% glimpse()
## function (..., list = character(), package = NULL, lib.loc = NULL, verbose = getOption("verbose"), 
##     envir = .GlobalEnv, overwrite = TRUE)
# Start (or connect to) a local H2O cluster, capping its memory at 4 GB
h2o::h2o.init(
    max_mem_size = "4G"
)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/54/8ptbpppx6yl7x4zf4sjpnr6r0000gn/T//RtmpTx4HF3/file158fb662d8744/h2o_jobboonstoppel_started_from_r.out
##     /var/folders/54/8ptbpppx6yl7x4zf4sjpnr6r0000gn/T//RtmpTx4HF3/file158fb370bb5c4/h2o_jobboonstoppel_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 178 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 19 days 
##     H2O cluster name:           H2O_started_from_R_jobboonstoppel_onf375 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.99 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.0 (2023-04-21)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 19 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Print the status of the H2O cluster this session is connected to
h2o::h2o.clusterInfo()
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 229 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 19 days 
##     H2O cluster name:           H2O_started_from_R_jobboonstoppel_onf375 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.99 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.0 (2023-04-21)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 19 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Clean and Prepare the Data ----
# Standardise column names, turn the outcome into a factor, convert both date
# columns to day counts since 1970-01-01, derive two difference features, and
# keep only complete rows.
ceo_data_cleaned <- ceo_data_raw %>%
    clean_names() %>%                                    # Clean column names for easier reference
    mutate(
        ceo_dismissal = as.factor(ceo_dismissal),       # Convert target variable to factor
        # `%b` (abbreviated month name) is parsed using the current locale;
        # values that do not match the format become NA.
        still_there = as.Date(still_there, format = "%d%b%Y"),
        leftofc = as.Date(leftofc)                      # Date-time -> Date
    ) %>%
    mutate(
        still_there_numeric = as.numeric(still_there - as.Date("1970-01-01")), # Days since epoch
        leftofc_numeric = as.numeric(leftofc - as.Date("1970-01-01")),         # Days since epoch
        tenure_diff = tenure_no_ceodb - max_tenure_ceodb, # Gap between the two tenure columns
        time_diff = still_there_numeric - leftofc_numeric # Difference between time variables
    ) %>%
    select(-still_there, -leftofc, -notes) %>%           # Drop original date columns and free-text notes
    # NOTE(review): this removes a row when ANY remaining column is NA, including
    # the derived date features. If one of those is only populated for a single
    # outcome class, the other class disappears entirely -- confirm row and
    # class counts after this step.
    # NOTE(review): `departure_code` stays in as a predictor; verify it is not
    # simply another encoding of the outcome (target leakage).
    drop_na()

# Fail loudly instead of silently modelling a one-class outcome: with a single
# class left, AUC and the confusion matrix downstream are undefined.
if (n_distinct(ceo_data_cleaned$ceo_dismissal) < 2) {
    warning(
        "After cleaning, `ceo_dismissal` has fewer than two classes; ",
        "classification metrics will be meaningless. ",
        "Check which columns `drop_na()` is filtering on.",
        call. = FALSE
    )
}
# Define Recipe with SMOTE ----
# Step order matters: SMOTE synthesises minority-class rows from nearest
# neighbours, so it needs numeric predictors on a comparable scale. It therefore
# runs last, after normalisation and dummy encoding.
# NOTE(review): nothing in the visible script preps or bakes this recipe, so
# these steps are not applied to the data sent to H2O -- confirm whether it is
# used elsewhere.
data_recipe <- recipe(ceo_dismissal ~ ., data = ceo_data_cleaned) %>%
    step_normalize(all_numeric_predictors()) %>%   # Centre and scale numeric predictors
    step_dummy(all_nominal_predictors()) %>%       # Dummy-encode categorical predictors (outcome excluded by the selector)
    step_smote(ceo_dismissal, over_ratio = 1)      # Oversample the minority class to a 1:1 ratio
# Upload the cleaned tibble to the H2O cluster as an H2OFrame ----
ceo_data_h2o <- h2o::as.h2o(x = ceo_data_cleaned)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Partition the H2O frame: roughly 80% for training, the remainder for testing ----
splits <- h2o::h2o.splitFrame(
    data = ceo_data_h2o,
    ratios = 0.8,
    seed = 123
)
train_data_h2o <- splits[[1L]]   # First piece: training rows
test_data_h2o <- splits[[2L]]    # Second piece: held-out rows
# Run H2O AutoML with Class Balancing ----
# `leaderboard_frame` is intentionally not set, so candidate models are ranked
# on their 5-fold cross-validation metrics. Ranking them on `test_data_h2o`
# would pick the leader using the same rows that are scored afterwards as the
# final evaluation, which makes that evaluation optimistically biased.
ceo_automl <- h2o.automl(
    x = setdiff(names(ceo_data_cleaned), "ceo_dismissal"), # Predictor columns
    y = "ceo_dismissal",                                   # Response column
    training_frame = train_data_h2o,
    max_runtime_secs = 30,                                 # Time budget for the whole search (seconds)
    nfolds = 5,                                            # Cross-validation folds
    balance_classes = TRUE,                                # Enable class balancing
    seed = 123
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   4%
## 14:34:43.702: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================| 100%
# Fetch the AutoML leaderboard with every optional column and show it ----
leaderboard <- h2o::h2o.get_leaderboard(
    object = ceo_automl,
    extra_columns = "ALL"
)
print(leaderboard)
##                                               model_id auc      logloss aucpr
## 1                       GLM_1_AutoML_1_20241210_143443   0 9.303127e-07     0
## 2 StackedEnsemble_AllModels_1_AutoML_1_20241210_143443   0 5.244989e-06     0
## 3          GBM_grid_1_AutoML_1_20241210_143443_model_6   0 4.729219e-11     0
## 4 DeepLearning_grid_1_AutoML_1_20241210_143443_model_3   0 4.941228e-03     0
## 5          GBM_grid_1_AutoML_1_20241210_143443_model_2   0 1.286008e-04     0
## 6                       DRF_1_AutoML_1_20241210_143443   0 2.221480e-05     0
##   mean_per_class_error         rmse          mse training_time_ms
## 1                  NaN 1.158411e-06 1.341915e-12               36
## 2                  NaN 5.374627e-06 2.888662e-11              125
## 3                  NaN 4.014160e-10 1.611348e-19              306
## 4                  NaN 8.959368e-03 8.027028e-05                8
## 5                  NaN 8.845956e-04 7.825093e-07               96
## 6                  NaN 1.231186e-04 1.515818e-08               32
##   predict_time_per_row_ms            algo
## 1                0.017346             GLM
## 2                0.067589 StackedEnsemble
## 3                0.041548             GBM
## 4                0.007779    DeepLearning
## 5                0.015294             GBM
## 6                0.008084             DRF
## 
## [46 rows x 10 columns]
# Evaluate the Best Model ----
# Take the top-ranked AutoML model and score it on the held-out test frame.
best_model <- ceo_automl@leader
performance <- h2o::h2o.performance(
    model = best_model,
    newdata = test_data_h2o
)
# Report headline metrics for the test frame
cat("AUC: ", h2o::h2o.auc(object = performance), "\n")
## AUC:  0
cat("Log Loss: ", h2o::h2o.logloss(object = performance), "\n")
## Log Loss:  9.303127e-07
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(h2o::h2o.confusionMatrix(object = performance))
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = NaN:
##         0  1 Error    Rate
## 0      NA NA    NA  =NA/NA
## 1      NA NA    NA  =NA/NA
## Totals NA NA    NA  =NA/NA
# Score the test frame with the leading model ----
predictions <- h2o::h2o.predict(
    object = best_model,
    newdata = test_data_h2o
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Pull the predictions out of H2O into a local data frame and preview them
predictions_df <- as.data.frame(x = predictions)
head(predictions_df, n = 6L)
##   predict        p0           p1
## 1       0 0.9999996 4.254148e-07
## 2       0 0.9999985 1.459224e-06
## 3       0 0.9999984 1.579849e-06
## 4       0 0.9999991 8.887868e-07
## 5       0 0.9999972 2.818016e-06
## 6       0 0.9999982 1.849683e-06
# Stop the H2O cluster without asking for confirmation ----
h2o::h2o.shutdown(prompt = FALSE)