Import Data

# Download the TidyTuesday (2021-04-27) CEO departures data set
departures <- readr::read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv"
)

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore Data

# Summarise every column of the raw data (missingness, ranges, distributions)
skimr::skim(departures)

Data summary
Name	departures
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

Clean Data

# Clean data: keep labelled rows, drop unusable columns, de-duplicate on the
# id variable, remove implausible departure years and fix column types.
departures_clean <- departures %>%

    # Clean the target variable: drop unlabelled rows, then recode 0/1 as a
    # two-level factor (levels sort alphabetically: "dismissed", "not_dis")
    filter(!is.na(ceo_dismissal)) %>%
    mutate(
        ceo_dismissal = as.factor(
            if_else(ceo_dismissal == 1, "dismissed", "not_dis")
        )
    ) %>%

    # Remove variables with too many missing values
    select(-c(interim_coceo, still_there, eight_ks)) %>%

    # Remove irrelevant variables
    select(-`_merge`, -sources) %>%

    # Remove variables with info that only becomes available with the outcome
    # NOTE(review): departure_code presumably encodes the departure reason
    # that ceo_dismissal is derived from - confirm against the data dictionary
    select(-departure_code) %>%

    # Remove redundant variables (leftofc is kept: it is needed as a date
    # variable later)
    select(-c(gvkey, cik, co_per_rol)) %>%

    # Remove duplicates in dismissal_dataset_id, our id variable
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%

    # Remove implausible departure years such as 2997. Rows with a missing
    # fyear_gone were already dropped implicitly by the comparison; the
    # is.na() check only makes that explicit.
    filter(!is.na(fyear_gone), fyear_gone < 2025) %>%

    # Convert factors that are incorrectly imported as numeric variables
    mutate(across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor)) %>%

    # Convert character columns to factors, except notes: it is free text that
    # is tokenized later, so it stays character (avoids building a factor
    # with thousands of levels only to convert it back)
    mutate(across(where(is.character) & !notes, as.factor))

skimr::skim(departures_clean)

Data summary
Name	departures_clean
Number of rows	7475
Number of columns	10
_______________________
Column type frequency:
character	1
factor	6
numeric	2
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
notes	17	1	5	3117	0	7448	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
coname	1	FALSE	3427	BAR: 8, CLA: 8, FED: 8, GRE: 8
exec_fullname	1	FALSE	6975	Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal	1	FALSE	2	not: 5992, dis: 1483
tenure_no_ceodb	1	FALSE	3	1: 7289, 2: 179, 3: 7
max_tenure_ceodb	1	FALSE	4	1: 7138, 2: 319, 3: 15, 4: 3
fyear_gone	1	FALSE	34	200: 379, 199: 351, 200: 334, 200: 321

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1	5570.32	25757.33	1	2175.5	4326	6579.5	559044	▇▁▁▁▁
fyear	0	1	2005.61	7.45	1987	1999.0	2006	2012.0	2020	▁▇▆▇▆

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	0	1	1981-01-01	2021-12-01	2006-11-15	3576

Model building

Split data

library(tidymodels)

# Fix the RNG state so the sampling, split and folds below are reproducible
set.seed(1234)

# Draw 100 rows per outcome class to get a small, balanced working sample
data_clean <- departures_clean %>%
    group_by(ceo_dismissal) %>%
    sample_n(100) %>%
    ungroup()

# Hold out a test set, stratified on the outcome
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

# Stratified cross-validation folds built from the training portion only
data_cv <- vfold_cv(data_train, strata = ceo_dismissal)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [134/16]> Fold01
##  2 <split [134/16]> Fold02
##  3 <split [134/16]> Fold03
##  4 <split [134/16]> Fold04
##  5 <split [134/16]> Fold05
##  6 <split [136/14]> Fold06
##  7 <split [136/14]> Fold07
##  8 <split [136/14]> Fold08
##  9 <split [136/14]> Fold09
## 10 <split [136/14]> Fold10

Preprocess data

# Left disabled: leftofc is passed to step_date() below as imported, so the
# conversion to Date is not applied.
#departures_clean <- departures_clean %>%
#mutate(leftofc = as.Date(leftofc, format = "%Y-%m-%d"))

# Preprocessing recipe for the xgboost model, specified on the training set
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
    # Keep the row identifier in the data but out of the predictor set
    update_role(dismissal_dataset_id, new_role = "ID") %>%
    # Pool company / executive names rarer than 5% into an "other" level
    step_other(coname, exec_fullname, threshold = 0.05) %>%
    # Turn the free-text notes into tf-idf features for the 100 retained tokens
    step_tokenize(notes) %>%
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes) %>%
    # Replace the departure timestamp with year / month / day-of-year features
    step_date(leftofc, features = c("year", "month", "doy"), keep_original_cols = FALSE) %>%
    # Dummy-encode the remaining nominal predictors
    step_dummy(all_nominal_predictors()) %>%
    # Rebalance the outcome classes with SMOTE (needs numeric predictors,
    # hence placed after step_dummy)
    step_smote(ceo_dismissal)

# Inspect the processed training set; bake(new_data = NULL) is the current
# replacement for the superseded juice()
xgboost_rec %>% prep() %>% bake(new_data = NULL) %>% glimpse()

## Rows: 150
## Columns: 156
## $ dismissal_dataset_id    <dbl> 3386, 4200, 6287, 1277, 2064, 3066, 6716, 6613…
## $ fyear                   <dbl> 2012, 2004, 2001, 2013, 2012, 2005, 2016, 2000…
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_1997        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_2003        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_3           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04145575…
## $ tfidf_notes_a           <dbl> 0.06278155, 0.00000000, 0.03251187, 0.04477045…
## $ tfidf_notes_about       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_after       <dbl> 0.06483837, 0.00000000, 0.00000000, 0.03082480…
## $ tfidf_notes_agreement   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_all         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_also        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03835080…
## $ tfidf_notes_an          <dbl> 0.06084517, 0.00000000, 0.00000000, 0.05785278…
## $ tfidf_notes_and         <dbl> 0.02796311, 0.00000000, 0.05792359, 0.02658788…
## $ tfidf_notes_announced   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06164960…
## $ tfidf_notes_as          <dbl> 0.10075215, 0.00000000, 0.06956696, 0.01596619…
## $ tfidf_notes_at          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_based       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_be          <dbl> 0.00000000, 0.00000000, 0.08159930, 0.00000000…
## $ tfidf_notes_been        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.04474153, 0.00000000…
## $ tfidf_notes_but         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_by          <dbl> 0.00000000, 0.38467534, 0.00000000, 0.00000000…
## $ tfidf_notes_ceo         <dbl> 0.09166455, 0.00000000, 0.09493828, 0.02178911…
## $ tfidf_notes_chairman    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02224629…
## $ tfidf_notes_changed     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04266704…
## $ tfidf_notes_chief       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.01996979…
## $ tfidf_notes_company     <dbl> 0.03506670, 0.00000000, 0.03631908, 0.05001316…
## $ `tfidf_notes_company's` <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_corp        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corporation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_departure   <dbl> 0.00000000, 0.00000000, 0.09031432, 0.00000000…
## $ tfidf_notes_did         <dbl> 0.00000000, 0.00000000, 0.09031432, 0.00000000…
## $ tfidf_notes_director    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_directors   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_down        <dbl> 0.16537209, 0.00000000, 0.00000000, 0.03930976…
## $ tfidf_notes_during      <dbl> 0.17440006, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_effective   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_executive   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_financial   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_following   <dbl> 0.00000000, 0.00000000, 0.09295320, 0.00000000…
## $ tfidf_notes_for         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04228418…
## $ tfidf_notes_from        <dbl> 0.04536817, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_had         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06742466…
## $ tfidf_notes_has         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_have        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_he          <dbl> 0.00000000, 0.00000000, 0.04651873, 0.04270572…
## $ tfidf_notes_him         <dbl> 0.00000000, 0.59947382, 0.00000000, 0.00000000…
## $ tfidf_notes_his         <dbl> 0.16326056, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_in          <dbl> 0.09604772, 0.00000000, 0.00000000, 0.04566203…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02587189…
## $ tfidf_notes_into        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_is          <dbl> 0.00000000, 0.00000000, 0.12235561, 0.00000000…
## $ tfidf_notes_it          <dbl> 0.00000000, 0.00000000, 0.12416220, 0.00000000…
## $ tfidf_notes_its         <dbl> 0.05442019, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_january     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_john        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_left        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_management  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04145575…
## $ tfidf_notes_march       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03930976…
## $ tfidf_notes_may         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03930976…
## $ tfidf_notes_merger      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_million     <dbl> 0.07536068, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_mr          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_new         <dbl> 0.00000000, 0.00000000, 0.07489790, 0.03437936…
## $ tfidf_notes_not         <dbl> 0.06957922, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_of          <dbl> 0.00000000, 0.00000000, 0.08738442, 0.02674059…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02248327…
## $ tfidf_notes_on          <dbl> 0.00000000, 0.00000000, 0.04651873, 0.00000000…
## $ tfidf_notes_or          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_other       <dbl> 0.07702042, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_out         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_over        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_performance <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04145575…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_resignation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_resigned    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retire      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0829115, 0.…
## $ tfidf_notes_retirement  <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_said        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_served      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_since       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_stock       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_than        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_that        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02015495…
## $ tfidf_notes_the         <dbl> 0.02576601, 0.00000000, 0.05337246, 0.04899767…
## $ tfidf_notes_they        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_this        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_time        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_to          <dbl> 0.03021542, 0.00000000, 0.06258909, 0.04309413…
## $ tfidf_notes_today       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_until       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03745542…
## $ tfidf_notes_was         <dbl> 0.03915458, 0.28387071, 0.00000000, 0.03722895…
## $ tfidf_notes_were        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_when        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_which       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03835080…
## $ tfidf_notes_who         <dbl> 0.00000000, 0.47809686, 0.00000000, 0.03135061…
## $ tfidf_notes_will        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.09405184…
## $ tfidf_notes_with        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06103201…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.00000000, 0.06829955, 0.00000000…
## $ tfidf_notes_years       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03190017…
## $ leftofc_year            <int> 2012, 2005, 2001, 2014, 2012, 2005, 2017, 2003…
## $ leftofc_doy             <int> 269, 32, 288, 60, 275, 31, 9, 90, 53, 333, 1, …
## $ coname_other            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ exec_fullname_other     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1994        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1995        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1996        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1998        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ fyear_gone_X2001        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2002        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2004        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005        <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ fyear_gone_X2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012        <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ fyear_gone_X2019        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Feb       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Mar       <dbl> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Apr       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ leftofc_month_May       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ leftofc_month_Jun       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Jul       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ leftofc_month_Aug       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Sep       <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Oct       <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0…
## $ leftofc_month_Nov       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Dec       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Specify model

# Boosted-tree classifier on the xgboost engine; the four hyperparameters
# marked tune() are chosen by the grid search
xgboost_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the model specification with its preprocessing recipe
xgboost_workflow <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_rec)

Tune Hyperparameters

# Register a foreach parallel backend for the grid search.
# NOTE(review): tune >= 1.2.1 warns that foreach backends are soft-deprecated
# in favour of future; see ?tune::parallelism before upgrading.
doParallel::registerDoParallel()

# Evaluate 5 candidate hyperparameter sets on the cross-validation folds,
# keeping the held-out predictions for later inspection
set.seed(65743)
xgboost_tune <- xgboost_workflow %>%
    tune_grid(
        resamples = data_cv,
        grid = 5,
        control = control_grid(save_pred = TRUE)
    )

## Warning: ! tune detected a parallel backend registered with foreach but no backend
##   registered with future.
## ℹ Support for parallel processing with foreach was soft-deprecated in tune
##   1.2.1.
## ℹ See ?parallelism (`?tune::parallelism()`) to learn more.

Model Evaluation

Identify Optimal Values for Hyperparameters

# Resampled performance of each candidate hyperparameter set
xgboost_tune %>% collect_metrics()

## # A tibble: 15 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1  1000     2          1    0.0750  accuracy    binary     0.624    10 0.0447 
##  2  1000     2          1    0.0750  brier_class binary     0.265    10 0.0327 
##  3  1000     2          1    0.0750  roc_auc     binary     0.703    10 0.0501 
##  4  1500    11         11    0.001   accuracy    binary     0.588    10 0.0531 
##  5  1500    11         11    0.001   brier_class binary     0.241    10 0.00621
##  6  1500    11         11    0.001   roc_auc     binary     0.628    10 0.0454 
##  7   500    21         15    0.316   accuracy    binary     0.5      10 0      
##  8   500    21         15    0.316   brier_class binary     0.25     10 0      
##  9   500    21         15    0.316   roc_auc     binary     0.5      10 0      
## 10     1    30          4    0.00422 accuracy    binary     0.5      10 0      
## 11     1    30          4    0.00422 brier_class binary     0.25     10 0      
## 12     1    30          4    0.00422 roc_auc     binary     0.5      10 0      
## 13  2000    40          8    0.0178  accuracy    binary     0.5      10 0      
## 14  2000    40          8    0.0178  brier_class binary     0.25     10 0      
## 15  2000    40          8    0.0178  roc_auc     binary     0.5      10 0      
## # ℹ 1 more variable: .config <chr>

# Per-fold ROC curves for the selected candidate only. collect_predictions()
# on a tuning result returns the held-out predictions of every candidate in
# the grid, so grouping by fold id alone would pool predictions from different
# hyperparameter settings into each curve. The candidate is chosen with the
# same metric used for the final fit.
collect_predictions(xgboost_tune) %>%
    semi_join(select_best(xgboost_tune, metric = "accuracy"), by = ".config") %>%
    group_by(id) %>%
    roc_curve(ceo_dismissal, .pred_dismissed) %>%
    autoplot()

Fit the Model for the Last Time

# Plug the best-by-accuracy hyperparameters into the workflow, then fit on the
# training set and evaluate once on the held-out test set
xgboost_last <- finalize_workflow(
    xgboost_workflow,
    select_best(xgboost_tune, metric = "accuracy")
) %>%
    last_fit(data_split)

# Test-set metrics of the final model
xgboost_last %>% collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.64  Preprocessor1_Model1
## 2 roc_auc     binary         0.653 Preprocessor1_Model1
## 3 brier_class binary         0.269 Preprocessor1_Model1

# Confusion matrix of the final model on the test set
xgboost_last %>%
    collect_predictions() %>%
    yardstick::conf_mat(truth = ceo_dismissal, estimate = .pred_class) %>%
    autoplot()

Variable Importance

library(vip)
# Variable-importance plot from the underlying fitted engine object
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

The previous model had an accuracy of 0.838 and an AUC of 0.856.

Feature transformation (date features extracted from the departure date): almost the same result, but slightly worse, with an accuracy of 0.837 and an AUC of 0.853. Feature transformation (removing the `group_by(ceo_dismissal) %>% sample_n(100) %>% ungroup` downsampling step): no improvement in accuracy, and worse results on the full data set. Feature selection (PCA): no improvement.

Comparison using h2o

Recipes

# Minimal recipe for the h2o comparison: all columns as predictors of
# ceo_dismissal, with zero-variance predictors removed
recipe_obj <- step_zv(
    recipe(ceo_dismissal ~ ., data = data_train),
    all_predictors()
)

Model

# Initialize h2o: connect R to an H2O cluster with default settings
# (the connection details are printed below)
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         6 minutes 44 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 17 days 
##     H2O cluster name:           H2O_started_from_R_rad1081_pja023 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.08 GB 
##     H2O cluster total cores:    20 
##     H2O cluster allowed cores:  20 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.1 (2024-06-14 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 17 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Upload the training data to H2O and split it 85/15 (seeded) into the
# frames used for training and validation
split.h2o <- h2o.splitFrame(
    data = as.h2o(data_train),
    ratios = 0.85,
    seed = 2345
)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# First piece of the split is the H2O training frame, second the validation
# frame; the tidymodels test set is uploaded separately as the H2O test frame
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(data_test)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Outcome column
y <- "ceo_dismissal"

# Predictor columns: everything except the outcome, plus two exclusions:
# - dismissal_dataset_id is a row identifier, not a feature (the xgboost
#   recipe likewise keeps it out of the predictors via the "ID" role)
# - notes is free text that H2O reports dropping from every model, so
#   listing it only produces warnings
x <- setdiff(names(data_train), c(y, "dismissal_dataset_id", "notes"))

# Run AutoML: up to 10 models (deep learning excluded), 5-fold
# cross-validation, ranked on the test frame. With cross-validation enabled
# the validation frame only provides informative metrics.
models_h2o <- h2o.automl(
    x = x,
    y = y, 
    training_frame    = train_h2o,
    validation_frame  = valid_h2o, 
    leaderboard_frame = test_h2o, 
    # max_runtime_secs  = 30, 
    max_models        = 10, 
    exclude_algos     = "DeepLearning",
    nfolds            = 5, 
    seed              = 3456   
)

##   |                                                                              |                                                                      |   0%  |                                                                              |==================================================                    |  71%
## 22:21:33.192: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 22:21:33.196: AutoML: XGBoost is not available; skipping it.
## 22:21:33.196: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.342: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.342: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 133.0.
## 22:21:33.342: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.509: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.622: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.743: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.848: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.940: _train param, Dropping bad and constant columns: [notes]
## 22:21:34.276: _train param, Dropping bad and constant columns: [notes]
## 22:21:34.425: _train param, Dropping unused columns: [notes]
## 22:21:34.666: _train param, Dropping unused columns: [notes]  |                                                                              |======================================================================| 100%

Examine the output of h2o.automl

# The AutoML result is an S4 object
typeof(models_h2o)

## [1] "S4"

# List the slots available on the AutoML result
slotNames(models_h2o)

## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"

# Ranking of the trained models, scored on the leaderboard (test) frame
models_h2o@leaderboard

##                                                           model_id    auc
## 1 GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model 0.6384
## 2          StackedEnsemble_BestOfFamily_1_AutoML_3_20250507_222133 0.6384
## 3                                   GLM_1_AutoML_3_20250507_222133 0.5632
## 4                                   XRT_1_AutoML_3_20250507_222133 0.5368
## 5                                   GBM_2_AutoML_3_20250507_222133 0.5128
## 6                      GBM_grid_1_AutoML_3_20250507_222133_model_3 0.4904
##     logloss     aucpr mean_per_class_error      rmse       mse
## 1 1.1890790 0.6733367                 0.32 0.6336895 0.4015624
## 2 0.6663851 0.6733367                 0.32 0.4879913 0.2381355
## 3 0.6920394 0.5765021                 0.50 0.4994458 0.2494461
## 4 0.7143198 0.5302203                 0.46 0.5091135 0.2591966
## 5 1.2738569 0.5653167                 0.44 0.6418286 0.4119440
## 6 0.9656350 0.5164876                 0.50 0.5908143 0.3490615
## 
## [11 rows x 7 columns]

# Details and metrics of the top-ranked model
models_h2o@leader

## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              20                       20                5828         3
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         5    4.10000          7          9     8.40000
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  0.007384357
## RMSE:  0.08593228
## LogLoss:  0.08953965
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## R^2:  0.9704609
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           dismissed not_dis    Error    Rate
## dismissed        67       0 0.000000   =0/67
## not_dis           0      66 0.000000   =0/66
## Totals           67      66 0.000000  =0/133
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1  0.892790  1.000000  64
## 2                       max f2  0.892790  1.000000  64
## 3                 max f0point5  0.892790  1.000000  64
## 4                 max accuracy  0.892790  1.000000  64
## 5                max precision  0.923658  1.000000   0
## 6                   max recall  0.892790  1.000000  64
## 7              max specificity  0.923658  1.000000   0
## 8             max absolute_mcc  0.892790  1.000000  64
## 9   max min_per_class_accuracy  0.892790  1.000000  64
## 10 max mean_per_class_accuracy  0.892790  1.000000  64
## 11                     max tns  0.923658 67.000000   0
## 12                     max fns  0.923658 65.000000   0
## 13                     max fps  0.077761 67.000000 127
## 14                     max tps  0.892790 66.000000  64
## 15                     max tnr  0.923658  1.000000   0
## 16                     max fnr  0.923658  0.984848   0
## 17                     max fpr  0.077761  1.000000 127
## 18                     max tpr  0.892790  1.000000  64
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  0.4410905
## RMSE:  0.6641464
## LogLoss:  1.314171
## Mean Per-Class Error:  0.375
## AUC:  0.5277778
## AUCPR:  0.5289719
## Gini:  0.05555556
## R^2:  -0.7704883
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           dismissed not_dis    Error   Rate
## dismissed         2       6 0.750000   =6/8
## not_dis           0       9 0.000000   =0/9
## Totals            2      15 0.352941  =6/17
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.078710 0.750000   9
## 2                       max f2  0.078710 0.882353   9
## 3                 max f0point5  0.078710 0.652174   9
## 4                 max accuracy  0.078710 0.647059   9
## 5                max precision  0.097067 0.600000   2
## 6                   max recall  0.078710 1.000000   9
## 7              max specificity  0.107839 0.875000   0
## 8             max absolute_mcc  0.078710 0.387298   9
## 9   max min_per_class_accuracy  0.092963 0.500000   4
## 10 max mean_per_class_accuracy  0.078710 0.625000   9
## 11                     max tns  0.107839 7.000000   0
## 12                     max fns  0.107839 8.000000   0
## 13                     max fps  0.076896 8.000000  11
## 14                     max tps  0.078710 9.000000   9
## 15                     max tnr  0.107839 0.875000   0
## 16                     max fnr  0.107839 0.888889   0
## 17                     max fpr  0.076896 1.000000  11
## 18                     max tpr  0.078710 1.000000   9
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.3925783
## RMSE:  0.6265607
## LogLoss:  1.171669
## Mean Per-Class Error:  0.5
## AUC:  0.5574401
## AUCPR:  0.595799
## Gini:  0.1148801
## R^2:  -0.5704022
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           dismissed not_dis    Error     Rate
## dismissed         0      67 1.000000   =67/67
## not_dis           0      66 0.000000    =0/66
## Totals            0     133 0.503759  =67/133
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1  0.072764  0.663317  81
## 2                       max f2  0.072764  0.831234  81
## 3                 max f0point5  0.083791  0.570776  58
## 4                 max accuracy  0.090045  0.571429  44
## 5                max precision  0.515654  1.000000   0
## 6                   max recall  0.072764  1.000000  81
## 7              max specificity  0.515654  1.000000   0
## 8             max absolute_mcc  0.128158  0.237481   6
## 9   max min_per_class_accuracy  0.090294  0.552239  43
## 10 max mean_per_class_accuracy  0.090045  0.571574  44
## 11                     max tns  0.515654 67.000000   0
## 12                     max fns  0.515654 65.000000   0
## 13                     max fps  0.074445 67.000000  79
## 14                     max tps  0.072764 66.000000  81
## 15                     max tnr  0.515654  1.000000   0
## 16                     max fnr  0.515654  0.984848   0
## 17                     max fpr  0.074445  1.000000  79
## 18                     max tpr  0.072764  1.000000  81
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                              mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                 0.577778 0.099208   0.555556   0.592593   0.740741
## auc                      0.569611 0.083104   0.516484   0.620879   0.684066
## err                      0.422222 0.099208   0.444444   0.407407   0.259259
## err_count               11.200000 2.489980  12.000000  11.000000   7.000000
## f0point5                 0.608048 0.085271   0.575221   0.596330   0.757576
## f1                       0.692197 0.030961   0.684211   0.702703   0.740741
## f2                       0.818145 0.053055   0.844156   0.855263   0.724638
## lift_top_group           2.016484 0.062406   2.076923   2.076923   1.928572
## logloss                  1.472991 0.066341   1.502453   1.362837   1.460304
## max_per_class_error      0.785714 0.294508   0.857143   0.785714   0.285714
## mcc                      0.365588 0.107662   0.272554   0.340693   0.483516
## mean_per_class_accuracy  0.584066 0.099616   0.571429   0.607143   0.741758
## mean_per_class_error     0.415934 0.099616   0.428571   0.392857   0.258242
## mse                      0.429751 0.018685   0.438094   0.396594   0.438892
## pr_auc                   0.624372 0.083814   0.565712   0.655953   0.749358
## precision                0.566179 0.114807   0.520000   0.541667   0.769231
## r2                      -0.720403 0.074348  -0.754784  -0.588554  -0.757979
## recall                   0.942857 0.127775   1.000000   1.000000   0.714286
## rmse                     0.655426 0.014460   0.661887   0.629757   0.662489
## specificity              0.225275 0.317923   0.142857   0.214286   0.769231
##                         cv_4_valid cv_5_valid
## accuracy                  0.500000   0.500000
## auc                       0.476331   0.550296
## err                       0.500000   0.500000
## err_count                13.000000  13.000000
## f0point5                  0.555556   0.555556
## f1                        0.666667   0.666667
## f2                        0.833333   0.833333
## lift_top_group            2.000000   2.000000
## logloss                   1.513267   1.526091
## max_per_class_error       1.000000   1.000000
## mcc                             NA         NA
## mean_per_class_accuracy   0.500000   0.500000
## mean_per_class_error      0.500000   0.500000
## mse                       0.434334   0.440840
## pr_auc                    0.535128   0.615709
## precision                 0.500000   0.500000
## r2                       -0.737335  -0.763362
## recall                    1.000000   1.000000
## rmse                      0.659040   0.663958
## specificity               0.000000   0.000000

Save and Load

# Open the help page for fetching a model by its ID.
help("h2o.getModel")

## starting httpd help server ... done

# Open the help pages for writing a model to disk and reading it back.
help("h2o.saveModel")
help("h2o.loadModel")

# Take the `leader` slot of `models_h2o` as the model used in the steps below.
best_model <- slot(models_h2o, "leader")

# best_model <- h2o.loadModel("h2o_models/GBM_lr_annealing_selection_AutoML_1_20250504_210132_select_model")
# having trouble making this one work
# NOTE(review): the ID in the path above comes from an AutoML run stamped
# 20250504, while the leader reported in the metrics output of this document is
# "GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model".
# Presumably the load fails because no file with the old ID exists for the
# current run -- confirm. No h2o.saveModel() call is made in this section;
# the path it returns is what h2o.loadModel() expects.

Make Predictions

# Score `test_h2o` with the selected model.
# NOTE(review): the warnings printed for this call report levels of `coname`,
# `exec_fullname` and `fyear_gone` that were not seen during training; check
# whether these identifier-like columns should be predictors at all.
predictions <- h2o.predict(
    best_model,
    newdata = test_h2o
)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["ADVANCE AUTO PARTS INC",
## "AK STEEL HOLDING CORP", "AMOCO CORP", "BISYS GROUP INC", "BOB EVANS FARMS",
## "BRUNSWICK CORP", "CACI INTL INC -CL A", "CALAMP CORP", "CATALINA MARKETING
## CORP", "CDW CORP", ...26 not listed..., "SYMANTEC CORP", "SYMBOL TECHNOLOGIES",
## "SYSCO CORP", "TECUMSEH PRODUCTS CO", "U S TRUST CORP", "U.S. STEEL", "US
## AIRWAYS GROUP INC-OLD", "VOLT INFO SCIENCES INC", "WET SEAL INC -CL A", "XPERI
## CORPORATION"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'exec_fullname' has levels not trained on: ["Bruce Karatz",
## "Bruce M. McWilliams", "Bruce R. Lakefield", "Charles R. Perrin", "David W.
## Fox", "Dennis C. Pence", "Eric Krasnoff", "Frank J. Hansen", "H. Laurance
## Fuller", "H. Marshall Schwarz", ...30 not listed..., "Steven D. Butler",
## "Thomas E. Richards", "Thomas J. Usher", "Thomas Kendall Hunt", "Timothy N.
## Jenson", "Tomo Razmilovic", "Volker Wypyszyk", "W. A. Griffin III", "William H.
## Swanson", "William L. Schrader"]

## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'fyear_gone' has levels not trained on: ["2020"]

# Convert the H2O prediction frame into a tibble.
predictions_tbl <- as_tibble(predictions)

# Print the predictions side by side with `data_test`
# (bind_cols pairs rows by position, so both must be in the same order).
bind_cols(predictions_tbl, data_test)

## # A tibble: 50 × 13
##    predict dismissed not_dis dismissal_dataset_id coname     fyear exec_fullname
##    <fct>       <dbl>   <dbl>                <dbl> <fct>      <dbl> <fct>        
##  1 not_dis     0.911  0.0893                 2853 POWELL IN…  2015 "Michael All…
##  2 not_dis     0.910  0.0897                 8320 FREEPORT …  2014 "James C. Fl…
##  3 not_dis     0.911  0.0895                 5771 BISYS GRO…  2005 "Russell P. …
##  4 not_dis     0.911  0.0892                 3296 STRIDE RI…  1999 "James A. Es…
##  5 not_dis     0.911  0.0895                 8250 ADVANCE A…  2006 "Michael N. …
##  6 not_dis     0.892  0.108                  5286 WET SEAL …  2004 "Peter D. Wh…
##  7 not_dis     0.893  0.107                  2208 MAXTOR CO…  1994 "Hyundai's c…
##  8 not_dis     0.903  0.0971                 3344 SYMBOL TE…  2002 "Tomo Razmil…
##  9 not_dis     0.911  0.0895                 7390 STARTEK I…  2006 "Steven D. B…
## 10 not_dis     0.910  0.0897                 4811 SYMANTEC …  2015 "Michael A. …
## # ℹ 40 more rows
## # ℹ 6 more variables: ceo_dismissal <fct>, tenure_no_ceodb <fct>,
## #   max_tenure_ceodb <fct>, fyear_gone <fct>, leftofc <dttm>, notes <chr>

Evaluate Model

# Open the help page for the performance function.
help("h2o.performance")

# Compute performance metrics for the selected model on `test_h2o`.
performance_h2o <- h2o.performance(
    best_model,
    newdata = test_h2o
)

# Derive the confusion matrix from those metrics and print it.
confusion_matrix <- performance_h2o %>%
    h2o.confusionMatrix()
print(confusion_matrix)

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.0926881801248008:
##           dismissed not_dis    Error    Rate
## dismissed        16       9 0.360000   =9/25
## not_dis           7      18 0.280000   =7/25
## Totals           23      27 0.320000  =16/50

# Disabled inspection helpers for the performance object:
# typeof(performance_h2o)
# slotNames(performance_h2o)
# performance_h2o@metrics

# Pull the `metrics` slot out of the performance object and print it in full.
metrics <- slot(performance_h2o, "metrics")
print(metrics)

## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model"
## 
## 
## $model_checksum
## [1] "-8767878325203396011"
## 
## $frame
## $frame$name
## [1] "data_test_sid_b620_3"
## 
## 
## $frame_checksum
## [1] "8908287964745272696"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.746671e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.4015624
## 
## $RMSE
## [1] 0.6336895
## 
## $nobs
## [1] 50
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] -0.6062494
## 
## $logloss
## [1] 1.189079
## 
## $AUC
## [1] 0.6384
## 
## $pr_auc
## [1] 0.6733367
## 
## $Gini
## [1] 0.2768
## 
## $mean_per_class_error
## [1] 0.32
## 
## $domain
## [1] "dismissed" "not_dis"  
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##           dismissed not_dis  Error      Rate
## dismissed        16       9 0.3600 =  9 / 25
## not_dis           7      18 0.2800 =  7 / 25
## Totals           23      27 0.3200 = 16 / 50
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.482191 0.076923 0.049505 0.172414 0.520000  1.000000 0.040000    1.000000
## 2  0.125795 0.148148 0.098039 0.303030 0.540000  1.000000 0.080000    1.000000
## 3  0.116661 0.214286 0.145631 0.405405 0.560000  1.000000 0.120000    1.000000
## 4  0.107860 0.206897 0.144231 0.365854 0.540000  0.750000 0.120000    0.960000
## 5  0.107856 0.375000 0.280374 0.566038 0.600000  0.857143 0.240000    0.960000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.142857               0.040000                0.520000  25  24   0   1
## 2     0.204124               0.080000                0.540000  25  23   0   2
## 3     0.252646               0.120000                0.560000  25  22   0   3
## 4     0.147442               0.120000                0.540000  24  22   1   3
## 5     0.288195               0.240000                0.600000  24  19   1   6
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.960000 0.000000 0.040000   0
## 2 1.000000 0.920000 0.000000 0.080000   1
## 3 1.000000 0.880000 0.000000 0.120000   2
## 4 0.960000 0.880000 0.040000 0.120000   3
## 5 0.960000 0.760000 0.040000 0.240000   4
## 
## ---
##    threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 28  0.087099 0.626866 0.739437 0.544041 0.500000  0.500000 0.840000    0.160000
## 29  0.079123 0.647059 0.769231 0.558376 0.520000  0.511628 0.880000    0.160000
## 30  0.079113 0.628571 0.758621 0.536585 0.480000  0.488889 0.880000    0.080000
## 31  0.078754 0.647887 0.787671 0.550239 0.500000  0.500000 0.920000    0.080000
## 32  0.077264 0.648649 0.805369 0.542986 0.480000  0.489796 0.960000    0.000000
## 33  0.077261 0.666667 0.833333 0.555556 0.500000  0.500000 1.000000    0.000000
##    absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 28     0.000000               0.160000                0.500000   4   4  21  21
## 29     0.057639               0.160000                0.520000   4   3  21  22
## 30     0.066667               0.080000                0.480000   2   3  23  22
## 31     0.000000               0.080000                0.500000   2   2  23  23
## 32     0.142857               0.000000                0.480000   0   1  25  24
## 33     0.000000               0.000000                0.500000   0   0  25  25
##         tnr      fnr      fpr      tpr idx
## 28 0.160000 0.160000 0.840000 0.840000  27
## 29 0.160000 0.120000 0.840000 0.880000  28
## 30 0.080000 0.120000 0.920000 0.880000  29
## 31 0.080000 0.080000 0.920000 0.920000  30
## 32 0.000000 0.040000 1.000000 0.960000  31
## 33 0.000000 0.000000 1.000000 1.000000  32
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1  0.092688  0.692308  17
## 2                       max f2  0.077261  0.833333  32
## 3                 max f0point5  0.093099  0.680000  15
## 4                 max accuracy  0.093099  0.680000  15
## 5                max precision  0.482191  1.000000   0
## 6                   max recall  0.077261  1.000000  32
## 7              max specificity  0.482191  1.000000   0
## 8             max absolute_mcc  0.092688  0.361158  17
## 9   max min_per_class_accuracy  0.093099  0.680000  15
## 10 max mean_per_class_accuracy  0.093099  0.680000  15
## 11                     max tns  0.482191 25.000000   0
## 12                     max fns  0.482191 24.000000   0
## 13                     max fps  0.077264 25.000000  31
## 14                     max tps  0.077261 25.000000  32
## 15                     max tnr  0.482191  1.000000   0
## 16                     max fnr  0.482191  0.960000   0
## 17                     max fpr  0.077264  1.000000  31
## 18                     max tpr  0.077261  1.000000  32
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 50.00 %, avg score: 10.19 %
##    group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1      1               0.02000000        0.307557 2.000000        2.000000
## 2      2               0.02000000        0.132923 0.000000        2.000000
## 3      3               0.04000000        0.121502 2.000000        2.000000
## 4      4               0.04000000        0.117027 0.000000        2.000000
## 5      5               0.06000000        0.112701 2.000000        2.000000
## 6      6               0.14000000        0.107856 1.500000        1.714286
## 7      7               0.16000000        0.107385 0.000000        1.500000
## 8      8               0.20000000        0.107373 1.000000        1.400000
## 9      9               0.30000000        0.097080 0.800000        1.200000
## 10    10               0.42000000        0.096646 1.666667        1.333333
## 11    11               0.50000000        0.092932 1.500000        1.360000
## 12    12               0.60000000        0.089567 0.400000        1.200000
## 13    13               0.72000000        0.089205 0.333333        1.055556
## 14    14               0.82000000        0.087104 0.800000        1.024390
## 15    15               0.90000000        0.079077 0.500000        0.977778
## 16    16               1.00000000        0.077261 1.200000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.482191                 1.000000         0.482191
## 2       0.000000 0.000000                 1.000000         0.482191
## 3       1.000000 0.125795                 1.000000         0.303993
## 4       0.000000 0.000000                 1.000000         0.303993
## 5       1.000000 0.116661                 1.000000         0.241549
## 6       0.750000 0.107857                 0.857143         0.165154
## 7       0.000000 0.107390                 0.750000         0.157933
## 8       0.500000 0.107376                 0.700000         0.147822
## 9       0.400000 0.102730                 0.600000         0.132791
## 10      0.833333 0.096899                 0.666667         0.122536
## 11      0.750000 0.093836                 0.680000         0.117944
## 12      0.200000 0.090893                 0.600000         0.113436
## 13      0.166667 0.089364                 0.527778         0.109424
## 14      0.400000 0.088289                 0.512195         0.106846
## 15      0.250000 0.081112                 0.488889         0.104559
## 16      0.600000 0.077561                 0.500000         0.101859
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      0.040000                0.040000  100.000000      100.000000
## 2      0.000000                0.040000 -100.000000      100.000000
## 3      0.040000                0.080000  100.000000      100.000000
## 4      0.000000                0.080000 -100.000000      100.000000
## 5      0.040000                0.120000  100.000000      100.000000
## 6      0.120000                0.240000   50.000000       71.428571
## 7      0.000000                0.240000 -100.000000       50.000000
## 8      0.040000                0.280000    0.000000       40.000000
## 9      0.080000                0.360000  -20.000000       20.000000
## 10     0.200000                0.560000   66.666667       33.333333
## 11     0.120000                0.680000   50.000000       36.000000
## 12     0.040000                0.720000  -60.000000       20.000000
## 13     0.040000                0.760000  -66.666667        5.555556
## 14     0.080000                0.840000  -20.000000        2.439024
## 15     0.040000                0.880000  -50.000000       -2.222222
## 16     0.120000                1.000000   20.000000        0.000000
##    kolmogorov_smirnov
## 1            0.040000
## 2            0.040000
## 3            0.080000
## 4            0.080000
## 5            0.120000
## 6            0.200000
## 7            0.160000
## 8            0.160000
## 9            0.120000
## 10           0.280000
## 11           0.360000
## 12           0.240000
## 13           0.080000
## 14           0.040000
## 15          -0.040000
## 16           0.000000

# Disabled scratch calls kept for reference: other ways to inspect
# `performance_h2o`. None of the lines below are executed.

# auc <- h2o.auc(performance_h2o)
# print(paste("AUC:", auc))

# typeof(performance_h2o)

# slotNames(performance_h2o)
# performance_h2o@metrics
# h2o.auc(performance_h2o)
# h2o.accuracy(performance_h2o)
# h2o.confusionMatrix(performance_h2o)
# h2o.metric(performance_h2o)

Apply 11

Ronja Dahlin

2025-05-04