The goal is to predict CEO dismissal (`ceo_dismissal`) using the TidyTuesday CEO departures data.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(correlationfunnel)
## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.2
## ✔ recipes 1.1.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(textrecipes)
library(tidytext)
library(usemodels)
library(xgboost)
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(themis)
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>% skimr::skim()
| Name | Piped data |
|---|---|
| Number of rows | 9423 |
| Number of columns | 19 |
| Column type frequency: | |
| character | 8 |
| numeric | 10 |
| POSIXct | 1 |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
| exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
| interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
| still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
| notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
| sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
| eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
| _merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
| gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
| fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
| co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
| departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
| ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
| tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
| max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
| fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
| cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
data_clean <- data %>%
  # Clean the target
  filter(!is.na(ceo_dismissal)) %>%
  mutate(ceo_dismissal = factor(if_else(ceo_dismissal == 1, "dismissed", "not_dis"))) %>%
  # Drop columns with too many missing values
  select(-still_there, -interim_coceo, -eight_ks, -notes) %>%
  # Remove irrelevant variables
  select(-`_merge`, -sources) %>%
  # Remove departure_code, since the target is derived from it (data leakage)
  select(-departure_code) %>%
  # Remove redundant variables
  select(-cik, -gvkey, -fyear, -leftofc) %>%
  # Remove duplicates in the id variable
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  # Remove the erroneous fyear_gone value of 2997
  filter(fyear_gone < 2025) %>%
  # Convert selected numeric columns to factors
  mutate(across(c(tenure_no_ceodb, fyear_gone), as.factor)) %>%
  # Convert character columns to factors
  mutate(across(where(is.character), as.factor)) %>%
  # Omit remaining missing values
  na.omit()
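Before profiling the cleaned data, a quick sanity check (not part of the original run) confirms the cleaning behaved as intended:
# No NAs should remain, and each dismissal_dataset_id should now be unique
stopifnot(sum(is.na(data_clean)) == 0)
stopifnot(dplyr::n_distinct(data_clean$dismissal_dataset_id) == nrow(data_clean))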
data_clean %>% skimr::skim()
| Name | Piped data |
|---|---|
| Number of rows | 7475 |
| Number of columns | 8 |
| Column type frequency: | |
| factor | 5 |
| numeric | 3 |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| coname | 0 | 1 | FALSE | 3427 | BAR: 8, CLA: 8, FED: 8, GRE: 8 |
| exec_fullname | 0 | 1 | FALSE | 6975 | Joh: 4, Mel: 4, Alb: 3, Ami: 3 |
| ceo_dismissal | 0 | 1 | FALSE | 2 | not: 5992, dis: 1483 |
| tenure_no_ceodb | 0 | 1 | FALSE | 3 | 1: 7289, 2: 179, 3: 7 |
| fyear_gone | 0 | 1 | FALSE | 34 | 200: 379, 199: 351, 200: 334, 200: 321 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dismissal_dataset_id | 0 | 1 | 5570.32 | 25757.33 | 1 | 2175.5 | 4326 | 6579.5 | 559044 | ▇▁▁▁▁ |
| co_per_rol | 0 | 1 | 21446.53 | 16350.34 | -1 | 6981.0 | 18269 | 33418.5 | 64601 | ▇▅▅▂▁ |
| max_tenure_ceodb | 0 | 1 | 1.05 | 0.23 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
data_clean %>% count(ceo_dismissal)
## # A tibble: 2 × 2
## ceo_dismissal n
## <fct> <int>
## 1 dismissed 1483
## 2 not_dis 5992
data_clean %>%
  ggplot(aes(ceo_dismissal)) +
  geom_bar()
data_clean %>%
  ggplot(aes(fyear_gone, group = ceo_dismissal)) +
  geom_boxplot()
# Step 1: Binarize
data_binarized <- data_clean %>%
  select(-dismissal_dataset_id) %>%
  na.omit() %>%
  binarize()
data_binarized %>% glimpse()
## Rows: 7,475
## Columns: 44
## $ coname__BARRICK_GOLD_CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `co_per_rol__-Inf_6981` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ co_per_rol__6981_18269 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__18269_33418.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__33418.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998 <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlation
data_correlation <- data_binarized %>%
  correlate(ceo_dismissal__dismissed)
data_correlation
## # A tibble: 44 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal dismissed 1
## 2 ceo_dismissal not_dis -1
## 3 co_per_rol -Inf_6981 -0.0595
## 4 max_tenure_ceodb 1 0.0580
## 5 co_per_rol 33418.5_Inf 0.0559
## 6 max_tenure_ceodb 2 -0.0536
## 7 fyear_gone 1999 -0.0391
## 8 fyear_gone 2002 0.0374
## 9 fyear_gone 2003 0.0296
## 10 fyear_gone 2009 0.0289
## # ℹ 34 more rows
# Step 3: Plot
data_correlation %>%
  correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
set.seed(1234)
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5044/562]> Fold01
## 2 <split [5044/562]> Fold02
## 3 <split [5045/561]> Fold03
## 4 <split [5045/561]> Fold04
## 5 <split [5046/560]> Fold05
## 6 <split [5046/560]> Fold06
## 7 <split [5046/560]> Fold07
## 8 <split [5046/560]> Fold08
## 9 <split [5046/560]> Fold09
## 10 <split [5046/560]> Fold10
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  update_role(dismissal_dataset_id, new_role = "ID") %>%
  step_other(exec_fullname, coname, threshold = 0.01) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_YeoJohnson(max_tenure_ceodb) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(ceo_dismissal)
xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 8,988
## Columns: 41
## $ dismissal_dataset_id <dbl> 84, 85, 119, 162, 243, 244, 263, 280, 300, 346, 3…
## $ co_per_rol <dbl> -1.314842, -1.314781, -1.312884, -1.309703, -1.30…
## $ max_tenure_ceodb <dbl> -0.2110499, -0.2110499, -0.2110499, -0.2110499, -…
## $ ceo_dismissal <fct> dismissed, dismissed, dismissed, dismissed, dismi…
## $ coname_other <dbl> 0.03535534, 0.03535534, 0.03535534, 0.03535534, 0…
## $ exec_fullname_other <dbl> 0.02671897, 0.02671897, 0.02671897, 0.02671897, 0…
## $ tenure_no_ceodb_X2 <dbl> -0.1606112, -0.1606112, -0.1606112, -0.1606112, -…
## $ tenure_no_ceodb_X3 <dbl> -0.02671897, -0.02671897, -0.02671897, -0.0267189…
## $ fyear_gone_X1988 <dbl> -0.01335591, -0.01335591, -0.01335591, -0.0133559…
## $ fyear_gone_X1990 <dbl> -0.01888979, -0.01888979, -0.01888979, -0.0188897…
## $ fyear_gone_X1991 <dbl> -0.01888979, -0.01888979, -0.01888979, -0.0188897…
## $ fyear_gone_X1992 <dbl> -0.02671897, -0.02671897, -0.02671897, -0.0267189…
## $ fyear_gone_X1993 <dbl> 7.9179139, -0.1262734, -0.1262734, -0.1262734, 7.…
## $ fyear_gone_X1994 <dbl> -0.1522343, -0.1522343, -0.1522343, -0.1522343, -…
## $ fyear_gone_X1995 <dbl> -0.1800187, -0.1800187, -0.1800187, -0.1800187, -…
## $ fyear_gone_X1996 <dbl> -0.1762889, -0.1762889, 5.6714953, -0.1762889, -0…
## $ fyear_gone_X1997 <dbl> -0.205882, -0.205882, -0.205882, -0.205882, -0.20…
## $ fyear_gone_X1998 <dbl> -0.2068214, 4.8342263, -0.2068214, -0.2068214, -0…
## $ fyear_gone_X1999 <dbl> -0.2249237, -0.2249237, -0.2249237, -0.2249237, -…
## $ fyear_gone_X2000 <dbl> -0.2257975, -0.2257975, -0.2257975, -0.2257975, -…
## $ fyear_gone_X2001 <dbl> -0.208224, -0.208224, -0.208224, -0.208224, -0.20…
## $ fyear_gone_X2002 <dbl> -0.181072, -0.181072, -0.181072, -0.181072, -0.18…
## $ fyear_gone_X2003 <dbl> -0.182120, -0.182120, -0.182120, -0.182120, -0.18…
## $ fyear_gone_X2004 <dbl> -0.1872832, -0.1872832, -0.1872832, -0.1872832, -…
## $ fyear_gone_X2005 <dbl> -0.2114665, -0.2114665, -0.2114665, 4.7280382, -0…
## $ fyear_gone_X2006 <dbl> -0.2035174, -0.2035174, -0.2035174, -0.2035174, -…
## $ fyear_gone_X2007 <dbl> -0.2119263, -0.2119263, -0.2119263, -0.2119263, -…
## $ fyear_gone_X2008 <dbl> -0.2169314, -0.2169314, -0.2169314, -0.2169314, -…
## $ fyear_gone_X2009 <dbl> -0.1842003, -0.1842003, -0.1842003, -0.1842003, -…
## $ fyear_gone_X2010 <dbl> -0.1831627, -0.1831627, -0.1831627, -0.1831627, -…
## $ fyear_gone_X2011 <dbl> -0.2011295, -0.2011295, -0.2011295, -0.2011295, -…
## $ fyear_gone_X2012 <dbl> -0.204466, -0.204466, -0.204466, -0.204466, -0.20…
## $ fyear_gone_X2013 <dbl> -0.1977456, -0.1977456, -0.1977456, -0.1977456, -…
## $ fyear_gone_X2014 <dbl> -0.1883011, -0.1883011, -0.1883011, -0.1883011, -…
## $ fyear_gone_X2015 <dbl> -0.2119263, -0.2119263, -0.2119263, -0.2119263, -…
## $ fyear_gone_X2016 <dbl> -0.2025651, -0.2025651, -0.2025651, -0.2025651, -…
## $ fyear_gone_X2017 <dbl> -0.1996852, -0.1996852, -0.1996852, -0.1996852, -…
## $ fyear_gone_X2018 <dbl> -0.2030417, -0.2030417, -0.2030417, -0.2030417, -…
## $ fyear_gone_X2019 <dbl> -0.1657943, -0.1657943, -0.1657943, -0.1657943, -…
## $ fyear_gone_X2020 <dbl> -0.07576223, -0.07576223, -0.07576223, -0.0757622…
## $ fyear_gone_X2021 <dbl> -0.03272976, -0.03272976, -0.03272976, -0.0327297…
xgboost_spec <-
  boost_tree(trees = tune(), tree_depth = tune(), min_n = tune(),
             mtry = tune(), learn_rate = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
xgboost_workflow <-
  workflow() %>%
  add_recipe(xgboost_rec) %>%
  add_model(xgboost_spec)
# Note: this regular grid is not passed to tune_grid() below, which instead
# draws an automatic space-filling grid of 5 candidates via `grid = 5`
tree_grid <- grid_regular(trees(),
                          tree_depth(),
                          levels = 5)
doParallel::registerDoParallel()
set.seed(17375)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv,
            grid = 5,
            control = control_grid(save_pred = TRUE))
## i Creating pre-processing data to finalize unknown parameter: mtry
collect_metrics(xgboost_tune)
## # A tibble: 15 × 11
## mtry trees min_n tree_depth learn_rate .metric .estimator mean n
## <int> <int> <int> <int> <dbl> <chr> <chr> <dbl> <int>
## 1 1 1674 21 8 0.0166 accuracy binary 0.491 10
## 2 1 1674 21 8 0.0166 brier_class binary 0.245 10
## 3 1 1674 21 8 0.0166 roc_auc binary 0.561 10
## 4 11 346 39 12 0.128 accuracy binary 0.573 10
## 5 11 346 39 12 0.128 brier_class binary 0.239 10
## 6 11 346 39 12 0.128 roc_auc binary 0.569 10
## 7 20 852 17 2 0.00405 accuracy binary 0.470 10
## 8 20 852 17 2 0.00405 brier_class binary 0.244 10
## 9 20 852 17 2 0.00405 roc_auc binary 0.559 10
## 10 29 1497 27 6 0.0475 accuracy binary 0.576 10
## 11 29 1497 27 6 0.0475 brier_class binary 0.240 10
## 12 29 1497 27 6 0.0475 roc_auc binary 0.572 10
## 13 37 659 8 14 0.00288 accuracy binary 0.548 10
## 14 37 659 8 14 0.00288 brier_class binary 0.238 10
## 15 37 659 8 14 0.00288 roc_auc binary 0.573 10
## # ℹ 2 more variables: std_err <dbl>, .config <chr>
collect_predictions(xgboost_tune) %>%
  group_by(id) %>%
  roc_curve(ceo_dismissal, .pred_dismissed) %>%
  autoplot()
xgboost_last <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
  last_fit(data_split)
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.579 Preprocessor1_Model1
## 2 roc_auc binary 0.581 Preprocessor1_Model1
## 3 brier_class binary 0.239 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
  yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
  autoplot()
xgboost_last %>%
  workflows::extract_fit_engine() %>%
  vip()
The tuned XGBoost model reached an accuracy of 0.579 and an AUC of 0.581 on the test set.
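Given the class imbalance, per-class performance is worth checking before comparing models. A minimal sketch using the predictions collected above (yardstick's `sens()` and `spec()` treat the first factor level, "dismissed", as the event):
# Sensitivity (recall on dismissed CEOs) and specificity of the final fit
collect_predictions(xgboost_last) %>%
  yardstick::sens(ceo_dismissal, .pred_class)
collect_predictions(xgboost_last) %>%
  yardstick::spec(ceo_dismissal, .pred_class)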
recipe_obj <- recipe(ceo_dismissal ~ ., data = data_train) %>%
  # Remove zero-variance variables
  step_zv(all_predictors())
# (This recipe is not applied below; h2o.automl() works directly on the raw frames.)
# Initialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 hours 2 minutes
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year and 4 months
## H2O cluster name: H2O_started_from_R_bradymartin_fhp551
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.24 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year and 4 months) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(data_train), ratios = c(0.85), seed = 2345)
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(data_test)
y <- "ceo_dismissal"
x <- setdiff(names(data_train), y)
models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  leaderboard_frame = test_h2o,
  # max_runtime_secs = 30,
  max_models = 10,
  exclude_algos = "DeepLearning",
  nfolds = 5,
  seed = 3456
)
## 13:07:52.738: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 13:07:52.739: AutoML: XGBoost is not available; skipping it.
Examine the output of `h2o.automl()`:
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 GLM_1_AutoML_11_20250421_130752 0.5796750 0.4958677
## 2 StackedEnsemble_BestOfFamily_1_AutoML_11_20250421_130752 0.5688168 0.4953323
## 3 GBM_grid_1_AutoML_11_20250421_130752_model_1 0.5595646 0.4998680
## 4 StackedEnsemble_AllModels_1_AutoML_11_20250421_130752 0.5546794 0.4964532
## 5 GBM_5_AutoML_11_20250421_130752 0.5497501 0.5012743
## 6 GBM_grid_1_AutoML_11_20250421_130752_model_2 0.5426697 0.5149789
## aucpr mean_per_class_error rmse mse
## 1 0.8455375 0.5 0.3980336 0.1584307
## 2 0.8396666 0.5 0.3976109 0.1580944
## 3 0.8258584 0.5 0.3995889 0.1596713
## 4 0.8341421 0.5 0.3983473 0.1586806
## 5 0.8245706 0.5 0.3998816 0.1599053
## 6 0.8298534 0.5 0.4044986 0.1636191
##
## [12 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: glm
## Model ID: GLM_1_AutoML_11_20250421_130752
## GLM Model: summary
## family link regularization
## 1 binomial logit Ridge ( lambda = 4.409E-4 )
## lambda_search
## 1 nlambda = 30, lambda.max = 2.3362, lambda.min = 4.409E-4, lambda.1se = 0.001143
## number_of_predictors_total number_of_active_predictors number_of_iterations
## 1 8366 7348 81
## training_frame
## 1 AutoML_11_20250421_130752_training_RTMP_sid_a40e_5
##
## Coefficients: glm coefficients
## names coefficients standardized_coefficients
## 1 Intercept 0.280638 1.533240
## 2 exec_fullname.A. Blaine Bowman 0.081698 0.081698
## 3 exec_fullname.A. Dan Rovig 0.058392 0.058392
## 4 exec_fullname.A. Dano Davis 0.066699 0.066699
## 5 exec_fullname.A. Drue Jennings -0.337154 -0.337154
##
## ---
## names coefficients standardized_coefficients
## 8362 tenure_no_ceodb.1 0.000000 0.000000
## 8363 tenure_no_ceodb.2 -0.893825 -0.893825
## 8364 tenure_no_ceodb.3 0.016670 0.016670
## 8365 dismissal_dataset_id 0.000001 0.012440
## 8366 co_per_rol -0.000010 -0.168646
## 8367 max_tenure_ceodb 1.403063 0.327792
##
## H2OBinomialMetrics: glm
## ** Reported on training data. **
##
## MSE: 0.1157555
## RMSE: 0.3402285
## LogLoss: 0.3761555
## Mean Per-Class Error: 0.07813384
## AUC: 0.9823338
## AUCPR: 0.9942362
## Gini: 0.9646676
## R^2: 0.2745818
## Residual Deviance: 3578.743
## AIC: 18276.74
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not_dis Error Rate
## dismissed 824 124 0.130802 =124/948
## not_dis 97 3712 0.025466 =97/3809
## Totals 921 3836 0.046458 =221/4757
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.738007 0.971092 248
## 2 max f2 0.712335 0.977369 272
## 3 max f0point5 0.760790 0.975590 228
## 4 max accuracy 0.738007 0.953542 248
## 5 max precision 0.997243 1.000000 0
## 6 max recall 0.580264 1.000000 374
## 7 max specificity 0.997243 1.000000 0
## 8 max absolute_mcc 0.738007 0.852993 248
## 9 max min_per_class_accuracy 0.760790 0.940142 228
## 10 max mean_per_class_accuracy 0.760790 0.941062 228
## 11 max tns 0.997243 948.000000 0
## 12 max fns 0.997243 3807.000000 0
## 13 max fps 0.485995 948.000000 399
## 14 max tps 0.580264 3809.000000 374
## 15 max tnr 0.997243 1.000000 0
## 16 max fnr 0.997243 0.999475 0
## 17 max fpr 0.485995 1.000000 399
## 18 max tpr 0.580264 1.000000 374
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: glm
## ** Reported on validation data. **
##
## MSE: 0.152814
## RMSE: 0.3909143
## LogLoss: 0.4806195
## Mean Per-Class Error: 0.4915836
## AUC: 0.6020474
## AUCPR: 0.8543374
## Gini: 0.2040947
## R^2: 0.01950785
## Residual Deviance: 816.0919
## AIC: 15514.09
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not_dis Error Rate
## dismissed 3 161 0.981707 =161/164
## not_dis 1 684 0.001460 =1/685
## Totals 4 845 0.190813 =162/849
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.637454 0.894118 395
## 2 max f2 0.596341 0.954571 398
## 3 max f0point5 0.637454 0.841328 395
## 4 max accuracy 0.637454 0.809187 395
## 5 max precision 0.981269 1.000000 0
## 6 max recall 0.596341 1.000000 398
## 7 max specificity 0.981269 1.000000 0
## 8 max absolute_mcc 0.828159 0.153286 134
## 9 max min_per_class_accuracy 0.808011 0.564964 192
## 10 max mean_per_class_accuracy 0.828159 0.592220 134
## 11 max tns 0.981269 164.000000 0
## 12 max fns 0.981269 684.000000 0
## 13 max fps 0.548705 164.000000 399
## 14 max tps 0.596341 685.000000 398
## 15 max tnr 0.981269 1.000000 0
## 16 max fnr 0.981269 0.998540 0
## 17 max fpr 0.548705 1.000000 399
## 18 max tpr 0.596341 1.000000 398
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: glm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.1577448
## RMSE: 0.397171
## LogLoss: 0.4935684
## Mean Per-Class Error: 0.5
## AUC: 0.5777765
## AUCPR: 0.8408975
## Gini: 0.1555529
## R^2: 0.01144247
## Residual Deviance: 4695.81
## AIC: 17007.81
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not_dis Error Rate
## dismissed 0 948 1.000000 =948/948
## not_dis 0 3809 0.000000 =0/3809
## Totals 0 4757 0.199285 =948/4757
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.544555 0.889330 399
## 2 max f2 0.544555 0.952583 399
## 3 max f0point5 0.607760 0.834028 393
## 4 max accuracy 0.544555 0.800715 399
## 5 max precision 0.995659 1.000000 0
## 6 max recall 0.544555 1.000000 399
## 7 max specificity 0.995659 1.000000 0
## 8 max absolute_mcc 0.819164 0.096519 167
## 9 max min_per_class_accuracy 0.807165 0.550633 192
## 10 max mean_per_class_accuracy 0.818056 0.559764 169
## 11 max tns 0.995659 948.000000 0
## 12 max fns 0.995659 3807.000000 0
## 13 max fps 0.588392 948.000000 395
## 14 max tps 0.544555 3809.000000 399
## 15 max tnr 0.995659 1.000000 0
## 16 max fnr 0.995659 0.999475 0
## 17 max fpr 0.588392 1.000000 395
## 18 max tpr 0.544555 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.801976 0.002596 0.801471 0.800420 0.800210 0.806519
## auc 0.579319 0.012140 0.576654 0.572451 0.572059 0.574647
## err 0.198024 0.002596 0.198529 0.199580 0.199790 0.193481
## err_count 188.400000 2.509980 189.000000 190.000000 190.000000 184.000000
## f0point5 0.835436 0.002836 0.835165 0.833698 0.833516 0.840373
## cv_5_valid
## accuracy 0.801262
## auc 0.600782
## err 0.198738
## err_count 189.000000
## f0point5 0.834428
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.802770 0.003829 0.802534 0.800420 0.800210
## r2 0.012590 0.003273 0.010741 0.009753 0.010155
## recall 0.997900 0.003422 0.997375 1.000000 1.000000
## residual_deviance 937.947100 4.940729 942.065550 941.512900 941.014650
## rmse 0.396940 0.001032 0.397532 0.397731 0.397807
## specificity 0.014798 0.025208 0.015789 0.000000 0.000000
## cv_4_valid cv_5_valid
## precision 0.809422 0.801262
## r2 0.016335 0.015969
## recall 0.992126 1.000000
## residual_deviance 931.956100 933.186300
## rmse 0.395778 0.395851
## specificity 0.058201 0.000000
h2o.getModel("GLM_1_AutoML_9_20250421_124816")
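The `h2o.loadModel()` call below assumes the AutoML leader was saved to disk earlier in the session; a minimal sketch of that step (not shown in the run above):
# Persist the leader so it can be re-loaded later; returns the saved path
model_path <- h2o.saveModel(object = models_h2o@leader,
                            path = "h2o_models",
                            force = TRUE)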
best_model <- h2o.loadModel("h2o_models/GLM_1_AutoML_9_20250421_124816")
predictions <- h2o.predict(best_model, newdata = test_h2o)
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'exec_fullname' has levels not trained on: ["A. Earl Swift", "A.
## Frederick Gerstell", "A. George (Skip) Battle", "A. James Dearlove", "A. Thomas
## Bender", "Ahmad R. Chatila", "Ahmed D. Kafadar", "Alain C. Viry", "Alan C.
## Greenberg", "Alan C. Henderson", ...1630 not listed..., "William W. Sprague
## Jr.", "William W. Steele", "William White Adams", "William Wrigley", "William
## Y. Tauscher", "Willis J. Johnson", "Willliam T. Jensen", "Wilson Wilde",
## "Wilton Allen Doane Jr.", "Woodson M. Hobbs"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["4LICENSING CORP", "AAON
## INC", "ABERCROMBIE & FITCH -CL A", "ABIOMED INC", "ACADIA HEALTHCARE CO INC",
## "ACNIELSEN CORP", "ACTEL CORP", "ACUITY BRANDS INC", "ADT CORP", "ADTRAN INC",
## ...406 not listed..., "WORLD COLOR PRESS INC-OLD", "WORLD WRESTLING ENTMT INC",
## "WPX ENERGY INC", "WYLE ELECTRONICS", "WYNN RESORTS LTD", "XPO LOGISTICS INC",
## "XSTELOS HOLDINGS INC", "YOUNG & RUBICAM INC", "ZEBRA TECHNOLOGIES CP -CL A",
## "ZILOG INC"]
predictions_tbl <- predictions %>%
  as_tibble()
predictions_tbl %>%
  bind_cols(data_test)
## # A tibble: 1,869 × 11
## predict dismissed not_dis dismissal_dataset_id coname co_per_rol
## <fct> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 not_dis 0.112 0.888 12 AMERICAN AIRLINES … 1
## 2 not_dis 0.150 0.850 13 AMERICAN AIRLINES … 3
## 3 not_dis 0.174 0.826 65 AIR PRODUCTS & CHE… 28
## 4 not_dis 0.138 0.862 78 ALBERTSON'S INC 38
## 5 not_dis 0.176 0.824 80 ALCAN INC 43
## 6 not_dis 0.138 0.862 81 ALCAN INC 44
## 7 not_dis 0.104 0.896 88 ALEXANDER & ALEXAN… 55
## 8 not_dis 0.163 0.837 99 HONEYWELL INTERNAT… 60
## 9 not_dis 0.124 0.876 117 HESS CORP 77
## 10 not_dis 0.0902 0.910 121 BEAM INC 88
## # ℹ 1,859 more rows
## # ℹ 5 more variables: exec_fullname <fct>, ceo_dismissal <fct>,
## # tenure_no_ceodb <fct>, max_tenure_ceodb <dbl>, fyear_gone <fct>
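With the h2o predictions bound back to the test set, the same yardstick metrics used for the XGBoost model can be computed for the leader. A sketch, assuming the combined tibble is stored as `results` (a name not used above):
results <- predictions_tbl %>%
  bind_cols(data_test)
# AUC from the predicted probability of the "dismissed" class
results %>% yardstick::roc_auc(ceo_dismissal, dismissed)
# Accuracy of the hard class predictions
results %>% yardstick::accuracy(ceo_dismissal, predict)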
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GLM_1_AutoML_9_20250421_124816"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GLM_1_AutoML_9_20250421_124816"
##
##
## $model_checksum
## [1] "-3048238076565447088"
##
## $frame
## $frame$name
## [1] "data_test_sid_a40e_3"
##
##
## $frame_checksum
## [1] "3960022671504695270"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.745255e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.1584307
##
## $RMSE
## [1] 0.3980336
##
## $nobs
## [1] 1869
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.00419951
##
## $logloss
## [1] 0.4958677
##
## $AUC
## [1] 0.579675
##
## $pr_auc
## [1] 0.8455375
##
## $Gini
## [1] 0.1593499
##
## $mean_per_class_error
## [1] 0.5
##
## $domain
## [1] "dismissed" "not_dis"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## dismissed not_dis Error Rate
## dismissed 0 371 1.0000 = 371 / 371
## not_dis 0 1498 0.0000 = 0 / 1,498
## Totals 0 1869 0.1985 = 371 / 1,869
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.983637 0.001334 0.000834 0.003329 0.199037 1.000000 0.000668 1.000000
## 2 0.981342 0.002667 0.001668 0.006640 0.199572 1.000000 0.001335 1.000000
## 3 0.979521 0.003997 0.002502 0.009934 0.200107 1.000000 0.002003 1.000000
## 4 0.971487 0.006653 0.004169 0.016469 0.201177 1.000000 0.003338 1.000000
## 5 0.969311 0.007979 0.005002 0.019711 0.201712 1.000000 0.004005 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.011514 0.000668 0.500334 371 1497 0 1
## 2 0.016288 0.001335 0.500668 371 1496 0 2
## 3 0.019954 0.002003 0.501001 371 1495 0 3
## 4 0.025775 0.003338 0.501669 371 1493 0 5
## 5 0.028242 0.004005 0.502003 371 1492 0 6
## tnr fnr fpr tpr idx
## 1 1.000000 0.999332 0.000000 0.000668 0
## 2 1.000000 0.998665 0.000000 0.001335 1
## 3 1.000000 0.997997 0.000000 0.002003 2
## 4 1.000000 0.996662 0.000000 0.003338 3
## 5 1.000000 0.995995 0.000000 0.004005 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.604917 0.888757 0.950866 0.834264 0.799893 0.801502 0.997330
## 396 0.602411 0.889087 0.951381 0.834450 0.800428 0.801609 0.997997
## 397 0.601538 0.889417 0.951896 0.834635 0.800963 0.801715 0.998665
## 398 0.595162 0.889153 0.951775 0.834263 0.800428 0.801285 0.998665
## 399 0.585352 0.889483 0.952290 0.834448 0.800963 0.801392 0.999332
## 400 0.576528 0.889813 0.952805 0.834633 0.801498 0.801498 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.002695 0.000195 0.002695 0.500013 1
## 396 0.002695 0.005979 0.002695 0.500346 1
## 397 0.002695 0.013554 0.002695 0.500680 1
## 398 0.000000 0.016288 0.000000 0.499332 0
## 399 0.000000 0.011514 0.000000 0.499666 0
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 4 370 1494 0.002695 0.002670 0.997305 0.997330 394
## 396 3 370 1495 0.002695 0.002003 0.997305 0.997997 395
## 397 2 370 1496 0.002695 0.001335 0.997305 0.998665 396
## 398 2 371 1496 0.000000 0.001335 1.000000 0.998665 397
## 399 1 371 1497 0.000000 0.000668 1.000000 0.999332 398
## 400 0 371 1498 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.576528 0.889813 399
## 2 max f2 0.576528 0.952805 399
## 3 max f0point5 0.660764 0.835012 389
## 4 max accuracy 0.576528 0.801498 399
## 5 max precision 0.983637 1.000000 0
## 6 max recall 0.576528 1.000000 399
## 7 max specificity 0.983637 1.000000 0
## 8 max absolute_mcc 0.803261 0.105299 262
## 9 max min_per_class_accuracy 0.838058 0.557951 185
## 10 max mean_per_class_accuracy 0.841826 0.564955 175
## 11 max tns 0.983637 371.000000 0
## 12 max fns 0.983637 1497.000000 0
## 13 max fps 0.595162 371.000000 397
## 14 max tps 0.576528 1498.000000 399
## 15 max tnr 0.983637 1.000000 0
## 16 max fnr 0.983637 0.999332 0
## 17 max fpr 0.595162 1.000000 397
## 18 max tpr 0.576528 1.000000 399
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 80.15 %, avg score: 83.47 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01016586 0.953726 1.247664 1.247664
## 2 2 0.02033173 0.926746 1.181997 1.214830
## 3 3 0.03049759 0.907211 1.181997 1.203886
## 4 4 0.04012841 0.901136 0.970405 1.147850
## 5 5 0.05029428 0.897900 0.984998 1.114933
## 6 6 0.10005350 0.887067 1.046427 1.080864
## 7 7 0.15034778 0.878654 1.088387 1.083380
## 8 8 0.20010701 0.872396 1.006180 1.064184
## 9 9 0.30016051 0.859930 1.060848 1.063072
## 10 10 0.40021402 0.849956 1.007472 1.049172
## 11 11 0.50026752 0.840524 1.040832 1.047504
## 12 12 0.59978598 0.830508 0.986057 1.037308
## 13 13 0.69983949 0.818437 0.980784 1.029227
## 14 14 0.79989299 0.802000 0.967440 1.021498
## 15 15 0.89994650 0.771341 0.894048 1.007329
## 16 16 1.00000000 0.576528 0.934080 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.964963 1.000000 0.964963
## 2 0.947368 0.939447 0.973684 0.952205
## 3 0.947368 0.913674 0.964912 0.939361
## 4 0.777778 0.904102 0.920000 0.930899
## 5 0.789474 0.899435 0.893617 0.924539
## 6 0.838710 0.891971 0.866310 0.908342
## 7 0.872340 0.882721 0.868327 0.899771
## 8 0.806452 0.875530 0.852941 0.893743
## 9 0.850267 0.865739 0.852050 0.884409
## 10 0.807487 0.855110 0.840909 0.877084
## 11 0.834225 0.845298 0.839572 0.870727
## 12 0.790323 0.835942 0.831401 0.864955
## 13 0.786096 0.824279 0.824924 0.859140
## 14 0.775401 0.810583 0.818729 0.853066
## 15 0.716578 0.788138 0.807372 0.845848
## 16 0.748663 0.734883 0.801498 0.834745
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.012684 0.012684 24.766355 24.766355
## 2 0.012016 0.024700 18.199705 21.483030
## 3 0.012016 0.036716 18.199705 20.388588
## 4 0.009346 0.046061 -2.959502 14.785047
## 5 0.010013 0.056075 -1.500246 11.493339
## 6 0.052069 0.108144 4.642749 8.086361
## 7 0.054740 0.162884 8.838735 8.338045
## 8 0.050067 0.212951 0.618028 6.418362
## 9 0.106142 0.319092 6.084762 6.307162
## 10 0.100801 0.419893 0.747164 4.917162
## 11 0.104139 0.524032 4.083163 4.750362
## 12 0.098131 0.622163 -1.394332 3.730814
## 13 0.098131 0.720294 -1.921635 2.922704
## 14 0.096796 0.817089 -3.256035 2.149845
## 15 0.089453 0.906542 -10.595232 0.732884
## 16 0.093458 1.000000 -6.592034 0.000000
## kolmogorov_smirnov
## 1 0.012684
## 2 0.022004
## 3 0.031325
## 4 0.029889
## 5 0.029121
## 6 0.040759
## 7 0.063153
## 8 0.064703
## 9 0.095372
## 10 0.099138
## 11 0.119719
## 12 0.112729
## 13 0.103043
## 14 0.086631
## 15 0.033227
## 16 0.000000
##
## $residual_deviance
## [1] 1853.553
##
## $null_deviance
## [1] 1862.722
##
## $AIC
## [1] 16551.55
##
## $loglikelihood
## [1] 0
##
## $null_degrees_of_freedom
## [1] 1868
##
## $residual_degrees_of_freedom
## [1] -5480
h2o.auc(performance_h2o)
## [1] 0.579675
h2o.accuracy(performance_h2o)
## threshold accuracy
## 1 0.9836371 0.1990369
## 2 0.9813420 0.1995720
## 3 0.9795211 0.2001070
## 4 0.9714871 0.2011771
## 5 0.9693114 0.2017121
##
## ---
## threshold accuracy
## 395 0.6049169 0.7998930
## 396 0.6024113 0.8004280
## 397 0.6015384 0.8009631
## 398 0.5951616 0.8004280
## 399 0.5853520 0.8009631
## 400 0.5765282 0.8014981
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.576528161948278:
## dismissed not_dis Error Rate
## dismissed 0 371 1.000000 =371/371
## not_dis 0 1498 0.000000 =0/1498
## Totals 0 1869 0.198502 =371/1869
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.983637 0.001334 0.000834 0.003329 0.199037 1.000000 0.000668 1.000000
## 2 0.981342 0.002667 0.001668 0.006640 0.199572 1.000000 0.001335 1.000000
## 3 0.979521 0.003997 0.002502 0.009934 0.200107 1.000000 0.002003 1.000000
## 4 0.971487 0.006653 0.004169 0.016469 0.201177 1.000000 0.003338 1.000000
## 5 0.969311 0.007979 0.005002 0.019711 0.201712 1.000000 0.004005 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.011514 0.000668 0.500334 371 1497 0 1
## 2 0.016288 0.001335 0.500668 371 1496 0 2
## 3 0.019954 0.002003 0.501001 371 1495 0 3
## 4 0.025775 0.003338 0.501669 371 1493 0 5
## 5 0.028242 0.004005 0.502003 371 1492 0 6
## tnr fnr fpr tpr idx
## 1 1.000000 0.999332 0.000000 0.000668 0
## 2 1.000000 0.998665 0.000000 0.001335 1
## 3 1.000000 0.997997 0.000000 0.002003 2
## 4 1.000000 0.996662 0.000000 0.003338 3
## 5 1.000000 0.995995 0.000000 0.004005 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.604917 0.888757 0.950866 0.834264 0.799893 0.801502 0.997330
## 396 0.602411 0.889087 0.951381 0.834450 0.800428 0.801609 0.997997
## 397 0.601538 0.889417 0.951896 0.834635 0.800963 0.801715 0.998665
## 398 0.595162 0.889153 0.951775 0.834263 0.800428 0.801285 0.998665
## 399 0.585352 0.889483 0.952290 0.834448 0.800963 0.801392 0.999332
## 400 0.576528 0.889813 0.952805 0.834633 0.801498 0.801498 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.002695 0.000195 0.002695 0.500013 1
## 396 0.002695 0.005979 0.002695 0.500346 1
## 397 0.002695 0.013554 0.002695 0.500680 1
## 398 0.000000 0.016288 0.000000 0.499332 0
## 399 0.000000 0.011514 0.000000 0.499666 0
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 4 370 1494 0.002695 0.002670 0.997305 0.997330 394
## 396 3 370 1495 0.002695 0.002003 0.997305 0.997997 395
## 397 2 370 1496 0.002695 0.001335 0.997305 0.998665 396
## 398 2 371 1496 0.000000 0.001335 1.000000 0.998665 397
## 399 1 371 1497 0.000000 0.000668 1.000000 0.999332 398
## 400 0 371 1498 0.000000 0.000000 1.000000 1.000000 399
The tidymodels XGBoost model reached an accuracy of 0.579 and an AUC of 0.581 on the test set. The h2o leader produced a comparable AUC of 0.5797; its reported maximum accuracy of 0.801 looks better, but the confusion matrix shows that at the chosen threshold the model predicts "not_dis" for every CEO, so that figure simply reflects the roughly 80/20 class imbalance. Since the h2o AutoML run took far less time to produce a model with a similar AUC, I would still conclude that for this dataset the h2o workflow is the more practical choice.
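Because roughly 80% of the test-set CEOs were not dismissed, balanced accuracy (the mean of sensitivity and specificity) is a fairer single-number comparison than raw accuracy. A sketch reusing the hypothetical `results` tibble from above:
# Balanced accuracy is not inflated by always predicting the majority class
results %>% yardstick::bal_accuracy(ceo_dismissal, predict)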