The dataset documents the reasons for CEO departures in S&P 1500 firms from 2000 through 2018. The goal is to predict whether a departure was a dismissal (ceo_dismissal) using the departures dataset.

Import Data

# Load the TidyTuesday CEO departures table straight from GitHub
data <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv"
)

Explore Data

# Per-column overview of the raw data: types, missingness and distributions
skimr::skim(data)

Data summary
Name	data
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	9
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
leftofc	1802	0.81	20	20	3627
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
X_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Issues With The Data

Missing Values

interim_coceo (97% missing)
still_there (78% missing)
eight_ks (48% missing)

Factors or Numeric Variables

departure_code (categorical but currently numeric)
interim_coceo (needs to be a factor)
leftofc (needs to be a factor)
still_there (needs to be a factor)

0 Variance Variables

X_merge

Character Names

coname
exec_fullname
sources

Unbalanced Target Variable

ceo_dismissal

ID Variable

dismissal_dataset_id
gvkey
cik

Data cleaning

# Build the modelling table: labelled departures only, unusable columns
# dropped, categorical columns stored as factors, `notes` kept as free text.
data_clean <- data %>%
  # Keep rows with a known outcome and recode 0/1 as a two-level factor
  filter(!is.na(ceo_dismissal)) %>%
  mutate(
    ceo_dismissal = as.factor(
      if_else(ceo_dismissal == 1, "dismissed", "not_dis")
    )
  ) %>%
  # Drop in one pass:
  #   - columns that are mostly missing
  #   - the constant merge flag and the source citations
  #   - departure_code, which is only known after the fact
  #   - redundant identifier / date columns
  select(
    -c(
      interim_coceo, still_there, eight_ks,
      X_merge, sources,
      departure_code,
      gvkey, cik, co_per_rol, leftofc, fyear
    )
  ) %>%
  # One row per id (first occurrence wins)
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  # Discard the impossible departure year (2997)
  filter(fyear_gone < 2025) %>%
  mutate(
    # Numeric codes that are really categories
    across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor),
    # Every remaining text column except the free-text notes
    across(c(where(is.character), -notes), as.factor)
  ) %>%
  # Drop any row that still has a missing value
  na.omit()

# Re-check types and missingness after cleaning
skimr::skim(data_clean)

Data summary
Name	data_clean
Number of rows	7458
Number of columns	8
_______________________
Column type frequency:
character	1
factor	6
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
notes	0	1	5	3117	0	7448	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
coname	1	FALSE	3427	BAR: 8, CLA: 8, FED: 8, NTN: 8
exec_fullname	1	FALSE	6961	Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal	1	FALSE	2	not: 5976, dis: 1482
tenure_no_ceodb	1	FALSE	3	1: 7274, 2: 177, 3: 7
max_tenure_ceodb	1	FALSE	4	1: 7123, 2: 317, 3: 15, 4: 3
fyear_gone	1	FALSE	34	200: 378, 199: 350, 200: 332, 200: 320

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1	5570.24	25786.43	1	2170.25	4321.5	6575.75	559044	▇▁▁▁▁

Data Exploration

# Class balance of the outcome: number of departures per dismissal status
ggplot(data_clean, aes(x = ceo_dismissal)) +
  geom_bar() +
  labs(x = "CEO Dismissal", y = "Count", title = "CEO Dismissal Count")

# Box plot of tenure by dismissal outcome.
# tenure_no_ceodb was converted to a factor during cleaning, but a box plot
# needs a continuous y. Recover the numeric values for plotting only; going
# through as.character() uses the level labels, not the internal level codes.
data_clean %>%
  mutate(tenure_no_ceodb = as.numeric(as.character(tenure_no_ceodb))) %>%
  ggplot(aes(x = ceo_dismissal, y = tenure_no_ceodb)) +
  geom_boxplot() +
  labs(
    title = "CEO Dismissal vs. Tenure",
    x = "CEO Dismissal",
    y = "CEO Tenure"
  )

# One-hot / binned version of the predictors for the correlation funnel.
# The id column and the free-text notes are left out before binarizing.
data_binarized <- data_clean %>%
  select(-c(dismissal_dataset_id, notes)) %>%
  binarize()

glimpse(data_binarized)

## Rows: 7,458
## Columns: 40
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Correlate every binarized feature with the "not dismissed" indicator
# (the "dismissed" indicator is its exact complement, so signs simply flip)
correlation_results <- correlate(
  data_binarized,
  target = ceo_dismissal__not_dis
)

correlation_results

## # A tibble: 40 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 ceo_dismissal    dismissed     -1     
##  2 ceo_dismissal    not_dis        1     
##  3 max_tenure_ceodb 1             -0.0577
##  4 max_tenure_ceodb 2              0.0533
##  5 fyear_gone       1999           0.0390
##  6 fyear_gone       2002          -0.0378
##  7 fyear_gone       2003          -0.0303
##  8 fyear_gone       2009          -0.0292
##  9 fyear_gone       2008          -0.0261
## 10 fyear_gone       1997           0.0255
## # ℹ 30 more rows

# Visualise the feature/target correlations as a funnel
correlationfunnel::plot_correlation_funnel(correlation_results) +
  labs(title = "Correlation Funnel for CEO Dismissal")

## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Sample to smaller size

# set.seed(2025)
# data_clean <- data_clean %>%
#   group_by(ceo_dismissal) %>%
#   sample_n(50) %>%
#   ungroup()

Split Data

# Fix the RNG so the partition and the folds can be reproduced
set.seed(1234)

# Train/test partition, stratified on the outcome
data_split <- data_clean %>% initial_split(strata = ceo_dismissal)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Stratified cross-validation folds drawn from the training set only
data_cv <- data_train %>% vfold_cv(strata = ceo_dismissal)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5032/561]> Fold01
##  2 <split [5033/560]> Fold02
##  3 <split [5034/559]> Fold03
##  4 <split [5034/559]> Fold04
##  5 <split [5034/559]> Fold05
##  6 <split [5034/559]> Fold06
##  7 <split [5034/559]> Fold07
##  8 <split [5034/559]> Fold08
##  9 <split [5034/559]> Fold09
## 10 <split [5034/559]> Fold10

Preprocess Data

# Preprocessing recipe for the xgboost model.
# NOTE(review): `notes` and `fyear_gone` describe the departure itself --
# confirm they would be available at prediction time (departure_code was
# dropped during cleaning for exactly that reason).
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  # Keep the row id in the data but out of the predictor set
  update_role(dismissal_dataset_id, new_role = "id") %>%
  # Free text -> tf-idf weights, keeping at most 100 tokens
  step_tokenize(notes) %>%
  step_tokenfilter(notes, max_tokens = 100) %>%
  step_tfidf(notes) %>%
  # Pool infrequent company / executive names into an "other" level
  step_other(coname, exec_fullname) %>%
  step_dummy(all_nominal_predictors()) %>%
  # At the default step_other() threshold both name columns collapse entirely
  # into "other", leaving constant indicator columns; drop those (and any
  # dummy column for a level that never occurs in the training data)
  step_zv(all_predictors()) %>%
  # Rebalance the outcome classes with synthetic minority samples
  step_smote(ceo_dismissal)

# Preview the processed training data.
# bake(new_data = NULL) is the current replacement for the superseded juice().
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 8,964
## Columns: 142
## $ dismissal_dataset_id    <int> 84, 85, 119, 162, 243, 244, 263, 280, 300, 346…
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_1997        <dbl> 0.00000000, 0.21964232, 0.00000000, 0.00000000…
## $ tfidf_notes_1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_a           <dbl> 0.08168614, 0.07260990, 0.00000000, 0.07172441…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_after       <dbl> 0.04790855, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_agreement   <dbl> 0.07417579, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_also        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_an          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_and         <dbl> 0.11579704, 0.15439605, 0.08932914, 0.08134036…
## $ tfidf_notes_announced   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_april       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_as          <dbl> 0.05278867, 0.07038489, 0.00000000, 0.02317551…
## $ tfidf_notes_at          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_based       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_be          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_been        <dbl> 0.00000000, 0.00000000, 0.07469936, 0.05101420…
## $ tfidf_notes_billion     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business    <dbl> 0.07538640, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_but         <dbl> 0.00000000, 0.09612422, 0.00000000, 0.00000000…
## $ tfidf_notes_by          <dbl> 0.04792738, 0.06390318, 0.00000000, 0.04208258…
## $ tfidf_notes_ceo         <dbl> 0.03350873, 0.04467830, 0.04308265, 0.00000000…
## $ tfidf_notes_chairman    <dbl> 0.06558589, 0.04372393, 0.00000000, 0.00000000…
## $ tfidf_notes_chief       <dbl> 0.00000000, 0.00000000, 0.04239335, 0.00000000…
## $ tfidf_notes_co          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_company     <dbl> 0.00000000, 0.03895933, 0.00000000, 0.02565614…
## $ `tfidf_notes_company's` <dbl> 0.06653157, 0.00000000, 0.00000000, 0.00000000…
## $ `tfidf_notes_company’s` <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_corp        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corporation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_december    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_director    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_directors   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_down        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_effective   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_executive   <dbl> 0.00000000, 0.00000000, 0.03997041, 0.00000000…
## $ tfidf_notes_financial   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06671555…
## $ tfidf_notes_for         <dbl> 0.00000000, 0.00000000, 0.04795045, 0.03274665…
## $ tfidf_notes_from        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03283809…
## $ tfidf_notes_group       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_had         <dbl> 0.00000000, 0.00000000, 0.08040328, 0.05490956…
## $ tfidf_notes_has         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03817834…
## $ tfidf_notes_have        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06830103…
## $ tfidf_notes_he          <dbl> 0.00000000, 0.00000000, 0.04663180, 0.03184611…
## $ tfidf_notes_his         <dbl> 0.07867326, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_in          <dbl> 0.10983246, 0.03661082, 0.03530329, 0.04821913…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.05656937, 0.00000000, 0.00000000…
## $ tfidf_notes_into        <dbl> 0.07092530, 0.00000000, 0.00000000, 0.06227587…
## $ tfidf_notes_is          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_it          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04600021…
## $ tfidf_notes_its         <dbl> 0.00000000, 0.06430819, 0.00000000, 0.00000000…
## $ tfidf_notes_january     <dbl> 0.00000000, 0.09673804, 0.00000000, 0.00000000…
## $ tfidf_notes_july        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_june        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_march       <dbl> 0.00000000, 0.00000000, 0.09829902, 0.00000000…
## $ tfidf_notes_may         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0623714, 0.…
## $ tfidf_notes_member      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_merger      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_million     <dbl> 0.00000000, 0.00000000, 0.17602458, 0.00000000…
## $ tfidf_notes_mr          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.07460464…
## $ tfidf_notes_named       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_new         <dbl> 0.00000000, 0.08200327, 0.00000000, 0.00000000…
## $ tfidf_notes_not         <dbl> 0.06976042, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_of          <dbl> 0.02240717, 0.00000000, 0.08642765, 0.03934917…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.00000000, 0.04680788, 0.00000000…
## $ tfidf_notes_on          <dbl> 0.00000000, 0.04922023, 0.09492472, 0.00000000…
## $ tfidf_notes_operating   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_over        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_position    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.04908544, 0.09466477, 0.00000000…
## $ tfidf_notes_resigned    <dbl> 0.00000000, 0.00000000, 0.07944799, 0.00000000…
## $ tfidf_notes_retire      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retired     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retirement  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_role        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_said        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_served      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_share       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_since       <dbl> 0.0000000, 0.0000000, 0.1493987, 0.0000000, 0.…
## $ tfidf_notes_stepped     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_stock       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_that        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03297636…
## $ tfidf_notes_the         <dbl> 0.04244354, 0.02829569, 0.02728513, 0.11180250…
## $ tfidf_notes_this        <dbl> 0.06393836, 0.00000000, 0.00000000, 0.05614100…
## $ tfidf_notes_time        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_to          <dbl> 0.02574055, 0.06864146, 0.00000000, 0.04520291…
## $ tfidf_notes_today       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_two         <dbl> 0.15557457, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_until       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_vice        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_was         <dbl> 0.03305141, 0.00000000, 0.00000000, 0.02902075…
## $ tfidf_notes_when        <dbl> 0.06960593, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_which       <dbl> 0.00000000, 0.00000000, 0.08236189, 0.11249429…
## $ tfidf_notes_who         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_will        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_with        <dbl> 0.04083248, 0.00000000, 0.05249891, 0.00000000…
## $ tfidf_notes_would       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04960310…
## $ tfidf_notes_years       <dbl> 0.00000000, 0.07387007, 0.00000000, 0.00000000…
## $ coname_other            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ exec_fullname_other     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993        <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X1994        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ fyear_gone_X1995        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear_gone_X1996        <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ fyear_gone_X1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ fyear_gone_X1998        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2001        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2002        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X2004        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2019        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Specify Model

# Boosted-tree classifier; the four marked hyperparameters are left for tuning
xgboost_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle preprocessing and model so they are always fitted together
xgboost_workflow <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_rec)

Tune hyperparameters

# Register a parallel backend for the resampling loop
# NOTE(review): recent tune releases favour the future framework over
# foreach backends -- confirm this still has an effect on the installed version
doParallel::registerDoParallel()

set.seed(65743)

# Evaluate 5 automatically generated candidates on every fold, keeping the
# out-of-fold predictions for later diagnostics
xgboost_tune <- xgboost_workflow %>%
  tune_grid(
    resamples = data_cv,
    grid = 5,
    control = control_grid(save_pred = TRUE)
  )

Model Evaluation

Identify Optimal Values for Hyperparameters

# Resampled performance of every candidate, averaged over the folds
xgboost_tune %>% collect_metrics()

## # A tibble: 15 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1   885     4          1    0.0670  accuracy    binary     0.813    10 0.00458
##  2   885     4          1    0.0670  brier_class binary     0.131    10 0.00228
##  3   885     4          1    0.0670  roc_auc     binary     0.830    10 0.00417
##  4   541    16          9    0.0266  accuracy    binary     0.831    10 0.00532
##  5   541    16          9    0.0266  brier_class binary     0.118    10 0.00254
##  6   541    16          9    0.0266  roc_auc     binary     0.854    10 0.00405
##  7   325    18         10    0.00276 accuracy    binary     0.784    10 0.00637
##  8   325    18         10    0.00276 brier_class binary     0.169    10 0.00192
##  9   325    18         10    0.00276 roc_auc     binary     0.777    10 0.00648
## 10  1754    32         13    0.00495 accuracy    binary     0.826    10 0.00460
## 11  1754    32         13    0.00495 brier_class binary     0.120    10 0.00216
## 12  1754    32         13    0.00495 roc_auc     binary     0.849    10 0.00374
## 13  1312    38          7    0.141   accuracy    binary     0.825    10 0.00396
## 14  1312    38          7    0.141   brier_class binary     0.129    10 0.00284
## 15  1312    38          7    0.141   roc_auc     binary     0.837    10 0.00501
## # ℹ 1 more variable: .config <chr>

# Per-fold ROC curves for the best candidate only.
# Without `parameters`, collect_predictions() returns the out-of-fold
# predictions of every tuning candidate, so each fold's curve would mix
# several different models.
collect_predictions(
  xgboost_tune,
  parameters = select_best(xgboost_tune, metric = "roc_auc")
) %>%
  group_by(id) %>%
  roc_curve(ceo_dismissal, .pred_dismissed) %>%
  autoplot()

Fit the Model for the Last Time

# Finalise the workflow with the best candidate, refit it on the full
# training set and evaluate once on the held-out test set.
# The outcome is imbalanced, so accuracy is dominated by the majority class;
# select on ROC AUC instead. In the tuning results the same candidate wins on
# both metrics, so the chosen model does not change.
xgboost_last <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "roc_auc")) %>%
  last_fit(data_split)

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.834 Preprocessor1_Model1
## 2 roc_auc     binary         0.856 Preprocessor1_Model1
## 3 brier_class binary         0.117 Preprocessor1_Model1

# Confusion matrix of the final model on the test set
xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = ceo_dismissal, estimate = .pred_class) %>%
  autoplot()

Variable Importance

# Importance scores taken from the underlying fitted xgboost object
vip(workflows::extract_fit_engine(xgboost_last))

Apply it to your data 7

Nils Skogestig

2025-03-06