CEO Departures: The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. Build a classification model to predict CEO departure (ceo_dismissal). Use the departures dataset.

library(tidyverse)
library(tidyquant)

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')

departures %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

Notes about the data

ceo_dismissal (target) * imported as numeric: convert it to factor * missing values in the target variable: remove * unbalanced data in ceo_dismissal (one category is way bigger than the other): address it in the preprocessing step with themis::step_smote(ceo_dismissal)
too many missing values interim_coceo, still_there, _eight_ks: drop the variables
dismissal_dataset_id is imported as numeric but should be character: convert * It seems to be the unique id variable but has two values appearing twice: remove one of the duplicates
Some variables are imported as numeric but should be categorical variables with a few unique values: convert * departure_code, tenure_no_ceodb, max_tenure_ceodb, fyear_gone
data entering error in fyear_gone (the year of 2997 must be an error): delete
Some variables are redundant: remove * gvkey (its info captured in coname), co_per_rol (its info captured in exec_fullname and coname) * leftofc (its info captured in fyear) * cik (not sure what it is) * fyear (its info captured in fyear_gone)
Some variables don’t seem to add predictive power * _merge, sources
A few character variables: convert to factor * notes is a string, which shouldn’t be converted to factor
notes is a string: tokenize and use words as predictors
departure_code describes the departure reason and is not a predictor: remove
presence of high cardinality predictors - coname and exec_fullname: remove both. It takes forever to run.

data <- departures %>%
    
    # Clean ceo_dismissal
    filter(!is.na(ceo_dismissal)) %>%
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not dismissed")) %>%

    # Drop variables with too many missing values
    select(-interim_coceo, - still_there, - eight_ks) %>%
    
    # Treat dismissal_dataset_id
    mutate(dismissal_dataset_id = as.character(dismissal_dataset_id)) %>%
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
    
    # Delete year of 2997 in fyear_geon
    filter(fyear_gone < 2023) %>%
    
    # Drop redundant variables
    select(-departure_code, -fyear, -gvkey, - co_per_rol, - leftofc, - cik, - sources, - `_merge`) %>%
    
    # Drop high cardinality predictors
    select(-coname, -exec_fullname) %>%
    
    # Convert to factor the variables with a few unique values
    mutate(across(tenure_no_ceodb:fyear_gone, factor)) %>%
    
    # Convert to factor all character variables, except the string variable - notes
    mutate(across(where(is.character), factor)) %>%
    
    # Keep notes as character
    mutate(notes = as.character(notes)) %>%
    filter(!is.na(notes))

Explore data

*** Identify variables with correlation with the target variable.***

* Company names

top_companies <- data %>% 
    count(coname) %>% 
    slice_max(n, n = 10) %>% pull(coname)

data %>%
  filter(coname %in% top_companies) %>%
    ggplot(aes(coname, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Companies",
         x = NULL, y = "Proportion", fill = NULL)

* Executives

top_executives <- data %>% 
    count(exec_fullname) %>% 
    slice_max(n, n = 10) %>% pull(exec_fullname)

data %>%
  filter(exec_fullname %in% top_executives) %>%
    ggplot(aes(exec_fullname, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Executives",
         x = NULL, y = "Proportion", fill = NULL)

* Tenure Number

For CEOs who return, this value should capture whether this is the first or second time in office.

data %>%
    ggplot(aes(tenure_no_ceodb, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Tenure Number",
         x = NULL, y = "Proportion", fill = NULL)

* Max Tenure Number

For this CEO, how many times did s/he serve as CEO

data %>%
    ggplot(aes(max_tenure_ceodb, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Max Tenure Number",
         x = NULL, y = "Proportion", fill = NULL)

* Departure Year

fiscal year of the CEO’s effective departure date

data %>%
    ggplot(aes(fyear_gone, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Companies",
         x = NULL, y = "Proportion", fill = NULL)

* Notes

Long-form description and justification for the coding scheme assignment.

library(tidytext) # for tokenization
library(tidylo)   # for bind_log_odds

log_odds_weighted_tb <- data %>%
    
    # Extract most frequent words from notes
    unnest_tokens(word, notes) %>%
    anti_join(stop_words) %>%
    count(ceo_dismissal, word) %>%
    # slice_max(order_by = n, n = 100) %>%
    
    # Calculate how likely each word show up from dismissed CEO's notes
    bind_log_odds(ceo_dismissal, word, n) %>%
    arrange(-log_odds_weighted) 
    
# Select the category
log_odds_weighted_tb %>% 
    filter(ceo_dismissal == "dismissed") %>%
    slice_max(order_by = n, n = 20) %>%
    
    ggplot(aes(log_odds_weighted, fct_reorder(word, log_odds_weighted))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "The most likely word in the dismissed CEO's departure notes",
         y = "Words from Notes", x = "Log Odds Weighted")

log_odds_weighted_tb %>% 
    filter(ceo_dismissal == "not dismissed") %>%
    slice_max(order_by = n, n = 20) %>%
    
    ggplot(aes(log_odds_weighted, fct_reorder(word, log_odds_weighted))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "The most likely word in the dismissed CEO's departure notes",
         y = "Words from Notes", x = "Log Odds Weighted")

Feature engineering

# data <- sample_n(data, 100)

library(tidymodels)

set.seed(123)
departure_split <- initial_split(data, strata = ceo_dismissal)

departure_train <- training(departure_split)
departure_test <- testing(departure_split)

set.seed(234)
departure_folds <- vfold_cv(departure_train, strata = ceo_dismissal)
departure_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5032/561]> Fold01
##  2 <split [5033/560]> Fold02
##  3 <split [5034/559]> Fold03
##  4 <split [5034/559]> Fold04
##  5 <split [5034/559]> Fold05
##  6 <split [5034/559]> Fold06
##  7 <split [5034/559]> Fold07
##  8 <split [5034/559]> Fold08
##  9 <split [5034/559]> Fold09
## 10 <split [5034/559]> Fold10

library(embed)
library(textrecipes)

departures_rec <- 
    recipe(ceo_dismissal ~ ., data = departure_train) %>%
    update_role(dismissal_dataset_id, new_role = "id") %>%
    step_tokenize(notes) %>%
    step_stopwords(notes) %>%
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes) %>%
    # step_lencode_glm(coname, outcome = vars(ceo_dismissal)) %>%
    step_dummy(all_nominal_predictors())

departures_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 5,593
## Columns: 140
## $ dismissal_dataset_id    <fct> 13, 78, 85, 88, 119, 143, 162, 198, 243, 244, …
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_1995        <dbl> 0.0000000, 0.0000000, 0.1790195, 0.0000000, 0.…
## $ tfidf_notes_1996        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_1997        <dbl> 0.0000000, 0.0000000, 0.3477482, 0.0000000, 0.…
## $ tfidf_notes_1998        <dbl> 0.0000000, 0.0000000, 0.3414034, 0.0000000, 0.…
## $ tfidf_notes_1999        <dbl> 0.0000000, 0.2824006, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_2000        <dbl> 0.0000000, 0.1474853, 0.1735121, 0.0000000, 0.…
## $ tfidf_notes_2001        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_3           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_30          <dbl> 0.0000000, 0.1530249, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_31          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_acquired    <dbl> 0.0000000, 0.1395012, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_agreement   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_also        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_announced   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_appointed   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_april       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_august      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_bank        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_based       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.28791778…
## $ tfidf_notes_became      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_become      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_billion     <dbl> 0.0000000, 0.1262248, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_ceo         <dbl> 0.00000000, 0.12122123, 0.07130661, 0.00000000…
## $ tfidf_notes_chairman    <dbl> 0.00000000, 0.00000000, 0.06929626, 0.13089294…
## $ tfidf_notes_chief       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.13138957…
## $ tfidf_notes_co          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_companies   <dbl> 0.0000000, 0.0000000, 0.1844912, 0.0000000, 0.…
## $ tfidf_notes_company     <dbl> 0.21038038, 0.10519019, 0.06187658, 0.23375598…
## $ `tfidf_notes_company's` <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ `tfidf_notes_company’s` <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_continue    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corp        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corporation <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_december    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_departure   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_director    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_directors   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_effective   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_elected     <dbl> 0.0000000, 0.0000000, 0.1847131, 0.0000000, 0.…
## $ tfidf_notes_end         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_energy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_executive   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.12437997…
## $ tfidf_notes_february    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_financial   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_following   <dbl> 0.0000000, 0.1365265, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_former      <dbl> 0.0000000, 0.1525080, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_group       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.07713483, 0.09074686, 0.00000000…
## $ tfidf_notes_interim     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_j           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_january     <dbl> 0.0000000, 0.1319778, 0.1552680, 0.0000000, 0.…
## $ tfidf_notes_john        <dbl> 0.0000000, 0.0000000, 0.1812667, 0.0000000, 0.…
## $ tfidf_notes_july        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_june        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_left        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_management  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_march       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_may         <dbl> 0.5045218, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_member      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_merger      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_million     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_mr          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_named       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_new         <dbl> 0.4443834, 0.0000000, 0.1307010, 0.2468797, 0.…
## $ tfidf_notes_november    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_october     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.06493599, 0.00000000, 0.00000000…
## $ tfidf_notes_one         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_operating   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_position    <dbl> 0.0000000, 0.1137372, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.00000000, 0.07761948, 0.00000000…
## $ tfidf_notes_remain      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_resignation <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_resigned    <dbl> 0.4499866, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retire      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retired     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retirement  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_role        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_said        <dbl> 0.000000, 0.000000, 0.000000, 0.236835, 0.0000…
## $ tfidf_notes_september   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_serve       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_served      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_share       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_shares      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_since       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_step        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.3416076, 0.…
## $ tfidf_notes_stepped     <dbl> 0.0000000, 0.1318699, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_stock       <dbl> 0.0000000, 0.1271799, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_time        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_today       <dbl> 0.4586292, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_two         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_vice        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.10163478, 0.00000000, 0.00000000…
## $ tfidf_notes_years       <dbl> 0.00000000, 0.10036738, 0.11807927, 0.00000000…
## $ tenure_no_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0…
## $ fyear_gone_X1994        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1995        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1996        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X1998        <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ fyear_gone_X2001        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear_gone_X2002        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X2004        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2019        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

prep(departures_rec) %>%
  tidy(number = 1)

prep(departures_rec) %>%
  tidy(number = 1) %>%
  filter(level == "..new")

Build a model

xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(departures_rec, xgb_spec)

library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- tune_grid(
  xgb_wf,
  resamples = departure_folds,
  grid = 5,
  control = control_grid(verbose = TRUE, save_pred = TRUE)
)

xgb_rs

## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .metrics          .notes           .predictions
##    <list>             <chr>  <list>            <list>           <list>      
##  1 <split [5032/561]> Fold01 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  2 <split [5033/560]> Fold02 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  3 <split [5034/559]> Fold03 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  4 <split [5034/559]> Fold04 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  5 <split [5034/559]> Fold05 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  6 <split [5034/559]> Fold06 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  7 <split [5034/559]> Fold07 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  8 <split [5034/559]> Fold08 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  9 <split [5034/559]> Fold09 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
## 10 <split [5034/559]> Fold10 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>

Model Evaluation

* Show Performance

collect_metrics(xgb_rs)

## # A tibble: 10 × 9
##     mtry trees min_n .metric  .estimator  mean     n std_err .config            
##    <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>              
##  1    28   606    28 accuracy binary     0.837    10 0.00407 Preprocessor1_Mode…
##  2    28   606    28 roc_auc  binary     0.854    10 0.00665 Preprocessor1_Mode…
##  3    32  1196    24 accuracy binary     0.844    10 0.00351 Preprocessor1_Mode…
##  4    32  1196    24 roc_auc  binary     0.861    10 0.00645 Preprocessor1_Mode…
##  5    56  1331    37 accuracy binary     0.842    10 0.00319 Preprocessor1_Mode…
##  6    56  1331    37 roc_auc  binary     0.852    10 0.00647 Preprocessor1_Mode…
##  7    91  1641     7 accuracy binary     0.853    10 0.00443 Preprocessor1_Mode…
##  8    91  1641     7 roc_auc  binary     0.868    10 0.00639 Preprocessor1_Mode…
##  9   128   194    13 accuracy binary     0.826    10 0.00422 Preprocessor1_Mode…
## 10   128   194    13 roc_auc  binary     0.824    10 0.00945 Preprocessor1_Mode…

# collect_predictions(xgb_rs) %>%
#     group_by(id) %>%
#     roc_curve(ceo_dismissal, .pred_class) %>%
#     autoplot()

# conf_mat_resampled(xgb_rs, tidy = FALSE) %>%
    # autoplot()

* Finalize workflow (last_fit)

xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
  last_fit(departure_split)

xgb_last

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [5593/1865]> train/test split <tibble> <tibble> <tibble>     <workflow>

collect_metrics(xgb_last)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.852 Preprocessor1_Model1
## 2 roc_auc  binary         0.868 Preprocessor1_Model1

collect_predictions(xgb_last) %>%
    conf_mat(ceo_dismissal, .pred_class) %>%
    autoplot()

library(vip)
xgb_last %>%
  extract_fit_engine() %>%
  vip()

Apply 6: CEO Departures