CEO Departures: The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. Build a classification model to predict CEO departure (ceo_dismissal). Use the departures dataset.

library(tidyverse)
library(tidyquant)

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')

departures %>% skimr::skim()
Data summary
Name Piped data
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627

Notes about the data

data <- departures %>%
    
    # Clean ceo_dismissal
    filter(!is.na(ceo_dismissal)) %>%
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not dismissed")) %>%

    # Drop variables with too many missing values
    select(-interim_coceo, - still_there, - eight_ks) %>%
    
    # Treat dismissal_dataset_id
    mutate(dismissal_dataset_id = as.character(dismissal_dataset_id)) %>%
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
    
    # Delete year of 2997 in fyear_geon
    filter(fyear_gone < 2023) %>%
    
    # Drop redundant variables
    select(-departure_code, -fyear, -gvkey, - co_per_rol, - leftofc, - cik, - sources, - `_merge`) %>%
    
    # Drop high cardinality predictors
    select(-coname, -exec_fullname) %>%
    
    # Convert to factor the variables with a few unique values
    mutate(across(tenure_no_ceodb:fyear_gone, factor)) %>%
    
    # Convert to factor all character variables, except the string variable - notes
    mutate(across(where(is.character), factor)) %>%
    
    # Keep notes as character
    mutate(notes = as.character(notes)) %>%
    filter(!is.na(notes))

Explore data

*** Identify variables with correlation with the target variable.***

* Company names

top_companies <- data %>% 
    count(coname) %>% 
    slice_max(n, n = 10) %>% pull(coname)

data %>%
  filter(coname %in% top_companies) %>%
    ggplot(aes(coname, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Companies",
         x = NULL, y = "Proportion", fill = NULL)

* Executives

top_executives <- data %>% 
    count(exec_fullname) %>% 
    slice_max(n, n = 10) %>% pull(exec_fullname)

data %>%
  filter(exec_fullname %in% top_executives) %>%
    ggplot(aes(exec_fullname, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Executives",
         x = NULL, y = "Proportion", fill = NULL)

* Tenure Number

For CEOs who return, this value should capture whether this is the first or second time in office.

data %>%
    ggplot(aes(tenure_no_ceodb, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Tenure Number",
         x = NULL, y = "Proportion", fill = NULL)

* Max Tenure Number

For this CEO, how many times did s/he serve as CEO

data %>%
    ggplot(aes(max_tenure_ceodb, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Max Tenure Number",
         x = NULL, y = "Proportion", fill = NULL)

* Departure Year

fiscal year of the CEO’s effective departure date

data %>%
    ggplot(aes(fyear_gone, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Companies",
         x = NULL, y = "Proportion", fill = NULL)

* Notes

Long-form description and justification for the coding scheme assignment.

library(tidytext) # for tokenization
library(tidylo)   # for bind_log_odds

log_odds_weighted_tb <- data %>%
    
    # Extract most frequent words from notes
    unnest_tokens(word, notes) %>%
    anti_join(stop_words) %>%
    count(ceo_dismissal, word) %>%
    # slice_max(order_by = n, n = 100) %>%
    
    # Calculate how likely each word show up from dismissed CEO's notes
    bind_log_odds(ceo_dismissal, word, n) %>%
    arrange(-log_odds_weighted) 
    
# Select the category
log_odds_weighted_tb %>% 
    filter(ceo_dismissal == "dismissed") %>%
    slice_max(order_by = n, n = 20) %>%
    
    ggplot(aes(log_odds_weighted, fct_reorder(word, log_odds_weighted))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "The most likely word in the dismissed CEO's departure notes",
         y = "Words from Notes", x = "Log Odds Weighted")

log_odds_weighted_tb %>% 
    filter(ceo_dismissal == "not dismissed") %>%
    slice_max(order_by = n, n = 20) %>%
    
    ggplot(aes(log_odds_weighted, fct_reorder(word, log_odds_weighted))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "The most likely word in the dismissed CEO's departure notes",
         y = "Words from Notes", x = "Log Odds Weighted")

Feature engineering

# data <- sample_n(data, 100)

library(tidymodels)

set.seed(123)
departure_split <- initial_split(data, strata = ceo_dismissal)

departure_train <- training(departure_split)
departure_test <- testing(departure_split)

set.seed(234)
departure_folds <- vfold_cv(departure_train, strata = ceo_dismissal)
departure_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5032/561]> Fold01
##  2 <split [5033/560]> Fold02
##  3 <split [5034/559]> Fold03
##  4 <split [5034/559]> Fold04
##  5 <split [5034/559]> Fold05
##  6 <split [5034/559]> Fold06
##  7 <split [5034/559]> Fold07
##  8 <split [5034/559]> Fold08
##  9 <split [5034/559]> Fold09
## 10 <split [5034/559]> Fold10
library(embed)
library(textrecipes)

departures_rec <- 
    recipe(ceo_dismissal ~ ., data = departure_train) %>%
    update_role(dismissal_dataset_id, new_role = "id") %>%
    step_tokenize(notes) %>%
    step_stopwords(notes) %>%
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes) %>%
    # step_lencode_glm(coname, outcome = vars(ceo_dismissal)) %>%
    step_dummy(all_nominal_predictors())

departures_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 5,593
## Columns: 140
## $ dismissal_dataset_id    <fct> 13, 78, 85, 88, 119, 143, 162, 198, 243, 244, …
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_1995        <dbl> 0.0000000, 0.0000000, 0.1790195, 0.0000000, 0.…
## $ tfidf_notes_1996        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_1997        <dbl> 0.0000000, 0.0000000, 0.3477482, 0.0000000, 0.…
## $ tfidf_notes_1998        <dbl> 0.0000000, 0.0000000, 0.3414034, 0.0000000, 0.…
## $ tfidf_notes_1999        <dbl> 0.0000000, 0.2824006, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_2000        <dbl> 0.0000000, 0.1474853, 0.1735121, 0.0000000, 0.…
## $ tfidf_notes_2001        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_3           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_30          <dbl> 0.0000000, 0.1530249, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_31          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_acquired    <dbl> 0.0000000, 0.1395012, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_agreement   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_also        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_announced   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_appointed   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_april       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_august      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_bank        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_based       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.28791778…
## $ tfidf_notes_became      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_become      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_billion     <dbl> 0.0000000, 0.1262248, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_ceo         <dbl> 0.00000000, 0.12122123, 0.07130661, 0.00000000…
## $ tfidf_notes_chairman    <dbl> 0.00000000, 0.00000000, 0.06929626, 0.13089294…
## $ tfidf_notes_chief       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.13138957…
## $ tfidf_notes_co          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_companies   <dbl> 0.0000000, 0.0000000, 0.1844912, 0.0000000, 0.…
## $ tfidf_notes_company     <dbl> 0.21038038, 0.10519019, 0.06187658, 0.23375598…
## $ `tfidf_notes_company's` <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ `tfidf_notes_company’s` <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_continue    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corp        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corporation <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_december    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_departure   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_director    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_directors   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_effective   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_elected     <dbl> 0.0000000, 0.0000000, 0.1847131, 0.0000000, 0.…
## $ tfidf_notes_end         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_energy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_executive   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.12437997…
## $ tfidf_notes_february    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_financial   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_following   <dbl> 0.0000000, 0.1365265, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_former      <dbl> 0.0000000, 0.1525080, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_group       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.07713483, 0.09074686, 0.00000000…
## $ tfidf_notes_interim     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_j           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_january     <dbl> 0.0000000, 0.1319778, 0.1552680, 0.0000000, 0.…
## $ tfidf_notes_john        <dbl> 0.0000000, 0.0000000, 0.1812667, 0.0000000, 0.…
## $ tfidf_notes_july        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_june        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_left        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_management  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_march       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_may         <dbl> 0.5045218, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_member      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_merger      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_million     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_mr          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_named       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_new         <dbl> 0.4443834, 0.0000000, 0.1307010, 0.2468797, 0.…
## $ tfidf_notes_november    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_october     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.06493599, 0.00000000, 0.00000000…
## $ tfidf_notes_one         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_operating   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_position    <dbl> 0.0000000, 0.1137372, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.00000000, 0.07761948, 0.00000000…
## $ tfidf_notes_remain      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_resignation <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_resigned    <dbl> 0.4499866, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retire      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retired     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retirement  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_role        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_said        <dbl> 0.000000, 0.000000, 0.000000, 0.236835, 0.0000…
## $ tfidf_notes_september   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_serve       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_served      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_share       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_shares      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_since       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_step        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.3416076, 0.…
## $ tfidf_notes_stepped     <dbl> 0.0000000, 0.1318699, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_stock       <dbl> 0.0000000, 0.1271799, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_time        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_today       <dbl> 0.4586292, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_two         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_vice        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.10163478, 0.00000000, 0.00000000…
## $ tfidf_notes_years       <dbl> 0.00000000, 0.10036738, 0.11807927, 0.00000000…
## $ tenure_no_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0…
## $ fyear_gone_X1994        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1995        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1996        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X1998        <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ fyear_gone_X2001        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear_gone_X2002        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X2004        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2019        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
prep(departures_rec) %>%
  tidy(number = 1)
prep(departures_rec) %>%
  tidy(number = 1) %>%
  filter(level == "..new")

Build a model

xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(departures_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- tune_grid(
  xgb_wf,
  resamples = departure_folds,
  grid = 5,
  control = control_grid(verbose = TRUE, save_pred = TRUE)
)

xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .metrics          .notes           .predictions
##    <list>             <chr>  <list>            <list>           <list>      
##  1 <split [5032/561]> Fold01 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  2 <split [5033/560]> Fold02 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  3 <split [5034/559]> Fold03 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  4 <split [5034/559]> Fold04 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  5 <split [5034/559]> Fold05 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  6 <split [5034/559]> Fold06 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  7 <split [5034/559]> Fold07 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  8 <split [5034/559]> Fold08 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  9 <split [5034/559]> Fold09 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
## 10 <split [5034/559]> Fold10 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>

Model Evaluation

* Show Performance

collect_metrics(xgb_rs)
## # A tibble: 10 × 9
##     mtry trees min_n .metric  .estimator  mean     n std_err .config            
##    <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>              
##  1    28   606    28 accuracy binary     0.837    10 0.00407 Preprocessor1_Mode…
##  2    28   606    28 roc_auc  binary     0.854    10 0.00665 Preprocessor1_Mode…
##  3    32  1196    24 accuracy binary     0.844    10 0.00351 Preprocessor1_Mode…
##  4    32  1196    24 roc_auc  binary     0.861    10 0.00645 Preprocessor1_Mode…
##  5    56  1331    37 accuracy binary     0.842    10 0.00319 Preprocessor1_Mode…
##  6    56  1331    37 roc_auc  binary     0.852    10 0.00647 Preprocessor1_Mode…
##  7    91  1641     7 accuracy binary     0.853    10 0.00443 Preprocessor1_Mode…
##  8    91  1641     7 roc_auc  binary     0.868    10 0.00639 Preprocessor1_Mode…
##  9   128   194    13 accuracy binary     0.826    10 0.00422 Preprocessor1_Mode…
## 10   128   194    13 roc_auc  binary     0.824    10 0.00945 Preprocessor1_Mode…
# collect_predictions(xgb_rs) %>%
#     group_by(id) %>%
#     roc_curve(ceo_dismissal, .pred_class) %>%
#     autoplot()

# conf_mat_resampled(xgb_rs, tidy = FALSE) %>%
    # autoplot()

* Finalize workflow (last_fit)

xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
  last_fit(departure_split)

xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [5593/1865]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(xgb_last)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.852 Preprocessor1_Model1
## 2 roc_auc  binary         0.868 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
    conf_mat(ceo_dismissal, .pred_class) %>%
    autoplot()

library(vip)
xgb_last %>%
  extract_fit_engine() %>%
  vip()