Import Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.4.2

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

library(textrecipes)

## Warning: package 'textrecipes' was built under R version 4.4.2

## Loading required package: recipes
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step

# Download the TidyTuesday departures data set (2021-04-27 release) from GitHub.
departures <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv"
)

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

# Summarise every column (type, missing values, distribution) to spot the
# data problems that the cleaning step has to address.
skimr::skim(departures)

Data summary
Name	departures
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

Issues with Data

Missing Values
- interim_coceo, still_there, notes, sources, eight_ks, departure_code, ceo_dismissal, fyear_gone, cik, leftofc
Factors or numeric variables
- departure_code
Zero variance variables
- _merge
Character variables: convert them to numbers in the recipe steps
Unbalanced target variables: ceo_dismissal
ID variable: dismissal_dataset_id

# Names of the departure_code and ceo_dismissal columns.
# NOTE(review): factors_vec is not referenced again in the visible code.
factors_vec <- names(select(departures, departure_code, ceo_dismissal))

# Build the modelling table: label the target, drop unusable columns,
# de-duplicate on the ID, remove the implausible departure year, set types.
departure_clean <- departures %>%

    # Target: keep labelled rows only and recode it as a two-level factor
    filter(!is.na(ceo_dismissal)) %>%
    mutate(
        ceo_dismissal = as.factor(
            if_else(ceo_dismissal == 1, "dismissed", "not dis")
        )
    ) %>%

    # Drop, in one pass:
    #   - variables with too many missing values
    #   - irrelevant variables
    #   - departure_code, which only becomes known after the fact
    #   - redundant identifier / date variables
    select(
        -c(interim_coceo, still_there, eight_ks),
        -c(`_merge`, sources),
        -departure_code,
        -c(gvkey, cik, co_per_rol, leftofc, fyear)
    ) %>%

    # One row per dismissal_dataset_id (the first occurrence is kept)
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%

    # Excludes the erroneous year 2997; rows with a missing year fail the
    # comparison and are dropped as well
    filter(fyear_gone < 2025) %>%

    # Tenure counts and departure year are treated as categories; every
    # character column except the free-text notes becomes a factor
    mutate(
        across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor),
        across(where(is.character) & !notes, as.factor)
    ) %>%

    # Remove any row that still contains a missing value
    na.omit()

Explore Data

# Class balance of the target after cleaning.
count(departure_clean, ceo_dismissal)

## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <fct>         <int>
## 1 dismissed      1482
## 2 not dis        5976

# Bar chart of the number of rows in each target class.
ggplot(departure_clean, aes(x = ceo_dismissal)) +
    geom_bar()

ceo dismissal vs fiscal year of departure (fyear_gone)

# Point size shows how many rows share each (target class, departure year) pair.
ggplot(departure_clean, aes(x = ceo_dismissal, y = fyear_gone)) +
    geom_count()

correlation plot

departures_clean <- departure_clean

# step 1: binarize
# The ID and the free-text notes are left out before binarizing.
departure_binarized <- binarize(
    select(departures_clean, -c(dismissal_dataset_id, notes))
)

glimpse(departure_binarized)

## Rows: 7,458
## Columns: 40
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# step 2: data correlation
# Correlate every binarized column with the "dismissed" indicator.
departure_correlation <- correlate(
    departure_binarized,
    target = ceo_dismissal__dismissed
)

departure_correlation

## # A tibble: 40 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 ceo_dismissal    dismissed      1     
##  2 ceo_dismissal    not_dis       -1     
##  3 max_tenure_ceodb 1              0.0577
##  4 max_tenure_ceodb 2             -0.0533
##  5 fyear_gone       1999          -0.0390
##  6 fyear_gone       2002           0.0378
##  7 fyear_gone       2003           0.0303
##  8 fyear_gone       2009           0.0292
##  9 fyear_gone       2008           0.0261
## 10 fyear_gone       1997          -0.0255
## # ℹ 30 more rows

# step 3: plot
correlationfunnel::plot_correlation_funnel(departure_correlation)

## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split Data

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2

## Warning: package 'dials' was built under R version 4.4.2

## Warning: package 'infer' was built under R version 4.4.2

## Warning: package 'modeldata' was built under R version 4.4.2

## Warning: package 'parsnip' was built under R version 4.4.2

## Warning: package 'tune' was built under R version 4.4.2

## Warning: package 'workflows' was built under R version 4.4.2

## Warning: package 'workflowsets' was built under R version 4.4.2

## Warning: package 'yardstick' was built under R version 4.4.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

# Fix the RNG state so that the sub-sample, the train/test split and the
# cross-validation folds are identical on every run.
set.seed(1234)

# Down-sample to 50 rows per outcome class, which also balances the target.
# slice_sample() replaces the superseded sample_n(); note that it returns
# all rows of a group that has fewer than 50 instead of raising an error.
departure_clean <- departure_clean %>%
    group_by(ceo_dismissal) %>%
    slice_sample(n = 50) %>%
    ungroup()

# Train/test split and CV folds, both stratified on the target.
departure_split <- initial_split(departure_clean, strata = ceo_dismissal)
departure_train <- training(departure_split)
departure_test <- testing(departure_split)

departure_cv <- rsample::vfold_cv(departure_train, strata = ceo_dismissal)
departure_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [66/8]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [66/8]> Fold04
##  5 <split [66/8]> Fold05
##  6 <split [66/8]> Fold06
##  7 <split [66/8]> Fold07
##  8 <split [68/6]> Fold08
##  9 <split [68/6]> Fold09
## 10 <split [68/6]> Fold10

Preprocess Data

library(themis)

## Warning: package 'themis' was built under R version 4.4.3

# Preprocessing recipe: ceo_dismissal is the outcome, all other columns start
# out as predictors.
xgboost_rec <-
    recipes::recipe(ceo_dismissal ~ ., data = departure_train) %>%
    # Keep the row identifier in the data but out of the predictor set
    update_role(dismissal_dataset_id, new_role = "ID") %>%
    # Pool infrequent company / executive names (5% threshold)
    step_other(coname, exec_fullname, threshold = 0.05) %>%
    # Free-text notes -> tokens -> at most 100 tokens -> term-frequency columns
    step_tokenize(notes) %>%
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tf(notes) %>%
    # Normalize the numeric predictors, then dummy-code the nominal ones
    step_normalize(all_numeric_predictors()) %>%
    step_dummy(all_nominal_predictors())

# Estimate the recipe on its training data and inspect the processed result.
# bake(new_data = NULL) is the current replacement for the superseded juice().
xgboost_rec %>%
    prep() %>%
    bake(new_data = NULL) %>%
    glimpse()

## Rows: 74
## Columns: 142
## $ dismissal_dataset_id <dbl> 8512, 2839, 738, 4598, 7282, 8727, 3216, 5780, 52…
## $ ceo_dismissal        <fct> dismissed, dismissed, dismissed, dismissed, dismi…
## $ tf_notes_1           <dbl> -0.2798988, -0.2798988, -0.2798988, -0.2798988, -…
## $ tf_notes_2007        <dbl> -0.2374251, -0.2374251, -0.2374251, -0.2374251, -…
## $ tf_notes_2011        <dbl> -0.2548604, -0.2548604, -0.2548604, -0.2548604, -…
## $ tf_notes_2012        <dbl> -0.2786637, -0.2786637, 2.0125711, -0.2786637, -0…
## $ tf_notes_2013        <dbl> -0.2220223, -0.2220223, -0.2220223, -0.2220223, -…
## $ tf_notes_3           <dbl> -0.3457949, -0.3457949, -0.3457949, -0.3457949, -…
## $ tf_notes_a           <dbl> -0.7706829, -0.7706829, -0.1574513, -0.7706829, -…
## $ tf_notes_after       <dbl> -0.4290308, -0.4290308, -0.4290308, -0.4290308, 3…
## $ tf_notes_agreement   <dbl> -0.2513697, -0.2513697, -0.2513697, -0.2513697, -…
## $ tf_notes_an          <dbl> -0.5031928, -0.5031928, 1.1157753, -0.5031928, -0…
## $ tf_notes_and         <dbl> -0.96545380, -0.45876173, 0.55462239, 0.04793033,…
## $ tf_notes_announced   <dbl> -0.4950844, -0.4950844, 1.5402627, -0.4950844, -0…
## $ tf_notes_as          <dbl> -0.9121223, -0.9121223, 0.5552049, -0.9121223, -0…
## $ tf_notes_at          <dbl> -0.4664175, -0.4664175, -0.4664175, 1.3501559, -0…
## $ tf_notes_be          <dbl> -0.2829699, -0.2829699, -0.2829699, -0.2829699, -…
## $ tf_notes_been        <dbl> -0.4150225, -0.4150225, -0.4150225, -0.4150225, -…
## $ tf_notes_board       <dbl> -0.6174840, -0.6174840, 1.7874538, -0.6174840, -0…
## $ tf_notes_burlington  <dbl> -0.1162476, -0.1162476, -0.1162476, -0.1162476, -…
## $ tf_notes_business    <dbl> -0.3457949, -0.3457949, -0.3457949, -0.3457949, -…
## $ tf_notes_by          <dbl> -0.5825163, 1.1417320, -0.5825163, 1.1417320, 1.1…
## $ tf_notes_ceo         <dbl> -0.6755336, -0.6755336, 0.4870126, 0.4870126, -0.…
## $ tf_notes_chairman    <dbl> -0.7307354, -0.7307354, 1.5703037, -0.7307354, -0…
## $ tf_notes_change      <dbl> -0.2548604, -0.2548604, -0.2548604, -0.2548604, -…
## $ tf_notes_changed     <dbl> -0.3210386, -0.3210386, -0.3210386, -0.3210386, -…
## $ tf_notes_chief       <dbl> -0.6279093, -0.6279093, 0.2325590, -0.6279093, -0…
## $ tf_notes_co          <dbl> -0.3538201, -0.3538201, -0.3538201, -0.3538201, -…
## $ tf_notes_company     <dbl> -0.8812467, 0.0000000, 0.0000000, 0.0000000, 0.00…
## $ `tf_notes_company's` <dbl> -0.3538201, -0.3538201, -0.3538201, -0.3538201, -…
## $ tf_notes_corp        <dbl> -0.3056062, -0.3056062, -0.3056062, -0.3056062, -…
## $ tf_notes_corporation <dbl> -0.3093926, -0.3093926, -0.3093926, -0.3093926, -…
## $ tf_notes_december    <dbl> -0.3457949, -0.3457949, -0.3457949, -0.3457949, -…
## $ `tf_notes_denny's`   <dbl> -0.1162476, -0.1162476, -0.1162476, -0.1162476, -…
## $ tf_notes_director    <dbl> -0.2548604, -0.2548604, 2.1025986, -0.2548604, -0…
## $ tf_notes_directors   <dbl> -0.4466631, -0.4466631, 1.2929721, -0.4466631, -0…
## $ tf_notes_down        <dbl> -0.3926048, -0.3926048, -0.3926048, -0.3926048, -…
## $ tf_notes_during      <dbl> -0.2768037, -0.2768037, -0.2768037, -0.2768037, -…
## $ tf_notes_effective   <dbl> -0.4423796, -0.4423796, 1.8959128, -0.4423796, -0…
## $ tf_notes_energy      <dbl> -0.3056062, -0.3056062, -0.3056062, -0.3056062, -…
## $ tf_notes_executive   <dbl> -0.7246065, -0.7246065, 2.1479407, -0.7246065, -0…
## $ tf_notes_financial   <dbl> -0.3016692, -0.3016692, -0.3016692, -0.3016692, -…
## $ tf_notes_following   <dbl> -0.3056062, -0.3056062, -0.3056062, -0.3056062, -…
## $ tf_notes_for         <dbl> -0.4544814, -0.4544814, -0.4544814, -0.4544814, -…
## $ tf_notes_former      <dbl> -0.3016692, -0.3016692, -0.3016692, -0.3016692, -…
## $ tf_notes_from        <dbl> -0.7080384, -0.7080384, -0.7080384, 0.6354191, 0.…
## $ tf_notes_had         <dbl> -0.3876561, -0.3876561, -0.3876561, 1.5247805, -0…
## $ tf_notes_has         <dbl> -0.5580098, -0.5580098, 0.6217823, -0.5580098, -0…
## $ tf_notes_have        <dbl> -0.3210386, -0.3210386, -0.3210386, -0.3210386, 3…
## $ tf_notes_he          <dbl> -0.6030266, 0.7094430, -0.6030266, 0.7094430, -0.…
## $ tf_notes_his         <dbl> -0.6578082, 0.6231867, -0.6578082, -0.6578082, 0.…
## $ tf_notes_immediately <dbl> -0.3301688, 2.3845528, 2.3845528, -0.3301688, -0.…
## $ tf_notes_in          <dbl> -0.2059372, -0.7703577, -0.2059372, -0.7703577, -…
## $ tf_notes_inc         <dbl> -0.3302612, -0.3302612, 0.9560193, -0.3302612, -0…
## $ tf_notes_interim     <dbl> -0.3695814, -0.3695814, 2.6691992, -0.3695814, -0…
## $ tf_notes_is          <dbl> -0.4564251, -0.4564251, -0.4564251, -0.4564251, 1…
## $ tf_notes_it          <dbl> -0.3912686, -0.3912686, -0.3912686, -0.3912686, -…
## $ tf_notes_its         <dbl> -0.5222951, -0.5222951, 1.1581325, -0.5222951, -0…
## $ tf_notes_july        <dbl> -0.2768037, -0.2768037, -0.2768037, -0.2768037, -…
## $ tf_notes_june        <dbl> -0.2798988, -0.2798988, -0.2798988, -0.2798988, -…
## $ tf_notes_last        <dbl> -0.3210386, -0.3210386, -0.3210386, -0.3210386, -…
## $ tf_notes_long        <dbl> -0.2300956, -0.2300956, -0.2300956, -0.2300956, -…
## $ tf_notes_march       <dbl> -0.2768037, -0.2768037, -0.2768037, -0.2768037, -…
## $ tf_notes_million     <dbl> -0.3482133, -0.3482133, -0.3482133, -0.3482133, -…
## $ tf_notes_most        <dbl> -0.2786637, -0.2786637, -0.2786637, -0.2786637, -…
## $ tf_notes_mr          <dbl> -0.4138240, -0.4138240, -0.4138240, -0.4138240, -…
## $ tf_notes_named       <dbl> -0.3301688, -0.3301688, 2.3845528, -0.3301688, -0…
## $ tf_notes_new         <dbl> -0.3240097, -0.3240097, -0.3240097, -0.3240097, -…
## $ tf_notes_not         <dbl> -0.2786637, -0.2786637, -0.2786637, -0.2786637, -…
## $ tf_notes_nyse        <dbl> -0.2513697, -0.2513697, 2.4059669, -0.2513697, -0…
## $ tf_notes_october     <dbl> -0.2798988, -0.2798988, -0.2798988, -0.2798988, -…
## $ tf_notes_of          <dbl> -0.5279534, -0.5279534, 0.3032924, -0.5279534, -0…
## $ tf_notes_officer     <dbl> -0.5622627, -0.5622627, 0.3230020, -0.5622627, -0…
## $ tf_notes_on          <dbl> -0.6761113, -0.6761113, -0.6761113, 0.3049130, -0…
## $ tf_notes_operating   <dbl> -0.3210386, -0.3210386, -0.3210386, -0.3210386, -…
## $ tf_notes_our         <dbl> -0.1787886, -0.1787886, -0.1787886, -0.1787886, -…
## $ tf_notes_over        <dbl> -0.2605553, -0.2605553, -0.2605553, -0.2605553, -…
## $ tf_notes_performance <dbl> -0.3301688, -0.3301688, -0.3301688, -0.3301688, 2…
## $ tf_notes_president   <dbl> -0.7650788, -0.7650788, 1.9966689, 0.6157951, -0.…
## $ tf_notes_resigned    <dbl> -0.4950844, -0.4950844, -0.4950844, 1.5402627, -0…
## $ tf_notes_retire      <dbl> -0.3695814, -0.3695814, -0.3695814, -0.3695814, -…
## $ tf_notes_retired     <dbl> -0.3695814, -0.3695814, -0.3695814, -0.3695814, -…
## $ tf_notes_retirement  <dbl> -0.3056062, -0.3056062, 2.5212514, -0.3056062, -0…
## $ tf_notes_said        <dbl> -0.4290308, -0.4290308, -0.4290308, -0.4290308, -…
## $ tf_notes_serve       <dbl> -0.2786637, -0.2786637, -0.2786637, -0.2786637, -…
## $ tf_notes_served      <dbl> -0.3584205, -0.3584205, -0.3584205, -0.3584205, -…
## $ tf_notes_since       <dbl> -0.3695814, -0.3695814, -0.3695814, -0.3695814, -…
## $ tf_notes_that        <dbl> -0.5083637, 0.4816077, -0.5083637, -0.5083637, 0.…
## $ tf_notes_the         <dbl> -0.7005510, -0.7005510, 0.1830985, 0.1830985, -0.…
## $ tf_notes_this        <dbl> -0.3457949, -0.3457949, -0.3457949, -0.3457949, -…
## $ tf_notes_time        <dbl> -0.3301688, -0.3301688, -0.3301688, -0.3301688, -…
## $ tf_notes_to          <dbl> -0.8669537, -0.8669537, -0.8669537, -0.3323323, -…
## $ tf_notes_today       <dbl> -0.301162, -0.301162, 4.651279, -0.301162, -0.301…
## $ tf_notes_until       <dbl> -0.407609, -0.407609, -0.407609, -0.407609, -0.40…
## $ tf_notes_was         <dbl> -0.7876749, 1.1552565, -0.7876749, -0.7876749, -0…
## $ tf_notes_were        <dbl> -0.3301688, -0.3301688, -0.3301688, -0.3301688, 2…
## $ tf_notes_which       <dbl> -0.4125575, -0.4125575, -0.4125575, -0.4125575, -…
## $ tf_notes_who         <dbl> -0.3695814, -0.3695814, -0.3695814, -0.3695814, -…
## $ tf_notes_will        <dbl> -0.3463332, -0.3463332, 1.0025434, -0.3463332, -0…
## $ tf_notes_with        <dbl> -0.5269388, 0.6546815, 0.6546815, -0.5269388, -0.…
## $ tf_notes_year        <dbl> -0.3761136, -0.3761136, -0.3761136, -0.3761136, -…
## $ tf_notes_years       <dbl> -0.3767387, -0.3767387, -0.3767387, -0.3767387, 2…
## $ coname_other         <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1…
## $ exec_fullname_other  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb_X2   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1994     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1995     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1996     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1997     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1998     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1999     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
## $ fyear_gone_X2001     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2002     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ fyear_gone_X2004     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear_gone_X2005     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2006     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007     <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010     <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012     <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014     <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016     <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2019     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ fyear_gone_X2020     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Specify Model

# Boosted-tree classifier on the xgboost engine; the number of trees and the
# tree depth are marked for tuning.
xgboost_spec <-
  boost_tree(trees = tune(), tree_depth = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the recipe and the model specification into one workflow.
xgboost_workflow <- workflow(preprocessor = xgboost_rec, spec = xgboost_spec)

Tune Hyperparameters

# Regular grid with 5 levels each for the number of trees and the tree depth
# (25 candidate combinations).
tree_grid <- grid_regular(trees(),
                          tree_depth(),
                          levels = 5)

# Register a parallel backend before tuning.
doParallel::registerDoParallel()

# Evaluate every candidate in tree_grid on the CV folds. Previously grid = 5
# was passed, which made tune_grid() generate its own 5 candidates and left
# tree_grid unused. Predictions are saved for the ROC curves drawn later.
set.seed(65743)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = departure_cv,
            grid = tree_grid,
            control = control_grid(save_pred = TRUE))

Model Evaluation

Identify optimal values for hyperparameters

# Resampled performance of every tuned candidate.
xgboost_tune %>%
    collect_metrics()

## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator  mean     n std_err .config          
##    <int>      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
##  1  1741          3 accuracy    binary     0.629    10  0.0503 Preprocessor1_Mo…
##  2  1741          3 brier_class binary     0.293    10  0.0446 Preprocessor1_Mo…
##  3  1741          3 roc_auc     binary     0.738    10  0.0568 Preprocessor1_Mo…
##  4   885          5 accuracy    binary     0.612    10  0.0573 Preprocessor1_Mo…
##  5   885          5 brier_class binary     0.293    10  0.0474 Preprocessor1_Mo…
##  6   885          5 roc_auc     binary     0.774    10  0.0476 Preprocessor1_Mo…
##  7   325          7 accuracy    binary     0.642    10  0.0549 Preprocessor1_Mo…
##  8   325          7 brier_class binary     0.266    10  0.0410 Preprocessor1_Mo…
##  9   325          7 roc_auc     binary     0.760    10  0.0561 Preprocessor1_Mo…
## 10  1312         12 accuracy    binary     0.642    10  0.0482 Preprocessor1_Mo…
## 11  1312         12 brier_class binary     0.280    10  0.0427 Preprocessor1_Mo…
## 12  1312         12 roc_auc     binary     0.756    10  0.0588 Preprocessor1_Mo…
## 13   555         15 accuracy    binary     0.629    10  0.0567 Preprocessor1_Mo…
## 14   555         15 brier_class binary     0.274    10  0.0413 Preprocessor1_Mo…
## 15   555         15 roc_auc     binary     0.767    10  0.0555 Preprocessor1_Mo…

# Per-fold ROC curves for the single best candidate (by accuracy, the metric
# used below to finalise the workflow). Without the `parameters` filter each
# fold's curve pools the held-out predictions of every tuned candidate.
collect_predictions(
    xgboost_tune,
    parameters = select_best(xgboost_tune, metric = "accuracy")
) %>%
    group_by(id) %>%
    roc_curve(ceo_dismissal, .pred_dismissed) %>%
    autoplot()

Fit the model for the last time

# Plug the best hyperparameters (by accuracy) into the workflow, then fit on
# the training set and evaluate once on the test set.
xgboost_last <- last_fit(
    finalize_workflow(
        xgboost_workflow,
        select_best(xgboost_tune, metric = "accuracy")
    ),
    split = departure_split
)

## Warning: package 'xgboost' was built under R version 4.4.2

# Test-set metrics of the final fit.
xgboost_last %>%
    collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.577 Preprocessor1_Model1
## 2 roc_auc     binary         0.538 Preprocessor1_Model1
## 3 brier_class binary         0.348 Preprocessor1_Model1

# Confusion matrix of the test-set class predictions.
yardstick::conf_mat(
    collect_predictions(xgboost_last),
    truth = ceo_dismissal,
    estimate = .pred_class
) %>%
    autoplot()

Variable importance

library(vip)

## Warning: package 'vip' was built under R version 4.4.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Variable-importance plot from the fitted engine object of the final model.
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

The previous model had an accuracy of 0.615 and an AUC of 0.657.

Feature transformation: normalized numeric data. It did not result in an improvement: on the test set the accuracy is now 0.577 and the AUC is now 0.538, both below the previous model.
Feature transformation: PCA. No improvement.

Apply5

Paige Biester

2025-03-07