The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. The goal is to predict whether a CEO was dismissed (ceo_dismissal) using the departures dataset.

Import Data

# Read the TidyTuesday (2021-04-27) CEO departures CSV directly from GitHub.
data <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv"
)

Explore Data

# Per-column summary of the raw data: types, missingness and distributions.
skimr::skim(data)
Data summary
Name data
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 9
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
leftofc 1802 0.81 20 20 0 3627 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
X_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Issues With The Data

Missing Values

  • interim_coceo (97% missing)
  • still_there (78% missing)
  • eight_ks (48% missing)

Factors or Numeric Variables

  • departure_code (categorical but currently numeric)
  • interim_coceo (needs to be a factor)
  • leftofc (needs to be a factor)
  • still_there (needs to be a factor)

Zero-Variance Variables

  • X_merge

Character Names

  • coname
  • exec_fullname
  • sources

Unbalanced Target Variable

  • ceo_dismissal

ID Variables

  • dismissal_dataset_id
  • gvkey
  • cik

Data cleaning

# Build the modelling table: keep labelled rows, drop unusable columns,
# de-duplicate on the id, and coerce the remaining predictors to factors
# (notes stays as free text for later tokenisation).
data_clean <- data %>%
  # Rows without a ceo_dismissal label cannot be used
  filter(!is.na(ceo_dismissal)) %>%
  # Recode the 0/1 target as a two-level factor
  mutate(
    ceo_dismissal = as.factor(
      if_else(ceo_dismissal == 1, "dismissed", "not_dis")
    )
  ) %>%
  select(
    # Columns that are mostly missing
    -c(interim_coceo, still_there, eight_ks),
    # Columns that don't seem to have predictive power
    -c(X_merge, sources),
    # Information that only becomes available after the fact
    -departure_code,
    # Redundant variables
    -c(gvkey, cik, co_per_rol, leftofc, fyear)
  ) %>%
  # dismissal_dataset_id is the id variable: keep one row per id
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  # Drops the implausible year 2997; filter() also drops rows where
  # fyear_gone is NA
  filter(fyear_gone < 2025) %>%
  # Numeric columns that are treated as categorical
  mutate(across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor)) %>%
  # Every character column except notes becomes a factor
  mutate(across(where(is.character) & !notes, as.factor)) %>%
  # Remove any rows that still contain missing values
  na.omit()

# Re-run the column summary to check the result of the cleaning step.
skimr::skim(data_clean)
Data summary
Name data_clean
Number of rows 7458
Number of columns 8
_______________________
Column type frequency:
character 1
factor 6
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
notes 0 1 5 3117 0 7448 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
coname 0 1 FALSE 3427 BAR: 8, CLA: 8, FED: 8, NTN: 8
exec_fullname 0 1 FALSE 6961 Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal 0 1 FALSE 2 not: 5976, dis: 1482
tenure_no_ceodb 0 1 FALSE 3 1: 7274, 2: 177, 3: 7
max_tenure_ceodb 0 1 FALSE 4 1: 7123, 2: 317, 3: 15, 4: 3
fyear_gone 0 1 FALSE 34 200: 378, 199: 350, 200: 332, 200: 320

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1 5570.24 25786.43 1 2170.25 4321.5 6575.75 559044 ▇▁▁▁▁

Data Exploration

# Class balance of the target: number of rows per ceo_dismissal level
ggplot(data_clean, aes(x = ceo_dismissal)) +
  geom_bar() +
  labs(
    title = "CEO Dismissal Count",
    x = "CEO Dismissal",
    y = "Count"
  )

# Distribution of tenure_no_ceodb for each dismissal outcome.
# tenure_no_ceodb was converted to a factor during cleaning, and a boxplot
# needs a numeric y to compute quartiles, so the numbers are recovered from
# the factor labels for this plot only (data_clean itself is not modified).
data_clean %>%
  mutate(tenure_no_ceodb = as.numeric(as.character(tenure_no_ceodb))) %>%
  ggplot(aes(x = ceo_dismissal, y = tenure_no_ceodb)) +
  geom_boxplot() +
  labs(title = "CEO Dismissal vs. Tenure", x = "CEO Dismissal", y = "CEO Tenure")

# Turn every remaining column into 0/1 indicator columns; the id and the
# free-text notes are left out before binarizing.
data_binarized <- data_clean %>%
  select(-c(dismissal_dataset_id, notes)) %>%
  binarize()

glimpse(data_binarized)
## Rows: 7,458
## Columns: 40
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Correlate every indicator column with the "not dismissed" target indicator

correlation_results <- correlate(data_binarized, ceo_dismissal__not_dis)
correlation_results
## # A tibble: 40 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 ceo_dismissal    dismissed     -1     
##  2 ceo_dismissal    not_dis        1     
##  3 max_tenure_ceodb 1             -0.0577
##  4 max_tenure_ceodb 2              0.0533
##  5 fyear_gone       1999           0.0390
##  6 fyear_gone       2002          -0.0378
##  7 fyear_gone       2003          -0.0303
##  8 fyear_gone       2009          -0.0292
##  9 fyear_gone       2008          -0.0261
## 10 fyear_gone       1997           0.0255
## # ℹ 30 more rows
# Plot the correlation funnel for the correlations computed above
correlationfunnel::plot_correlation_funnel(correlation_results) +
  labs(title = "Correlation Funnel for CEO Dismissal")
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Sample to smaller size

# Down-sample to 50 rows per outcome class to keep tuning fast.
# The draw is random, so the seed is set first; otherwise every run would
# model a different 100-row sample and no result could be reproduced.
set.seed(1234)

data_clean <- data_clean %>%
  group_by(ceo_dismissal) %>%
  # slice_sample() replaces the superseded sample_n(); note that it returns
  # fewer rows instead of erroring if a class has fewer than 50 rows.
  slice_sample(n = 50) %>%
  ungroup()

Split Data

# Fix the RNG state so the split and the folds are reproducible
set.seed(1234)

# Train/test partition, stratified on the outcome
data_split <- data_clean %>% initial_split(strata = ceo_dismissal)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Cross-validation folds of the training set, also stratified on the outcome
data_cv <- data_train %>% vfold_cv(strata = ceo_dismissal)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [66/8]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [66/8]> Fold04
##  5 <split [66/8]> Fold05
##  6 <split [66/8]> Fold06
##  7 <split [66/8]> Fold07
##  8 <split [68/6]> Fold08
##  9 <split [68/6]> Fold09
## 10 <split [68/6]> Fold10

Preprocess Data

# Preprocessing recipe for the xgboost model. ceo_dismissal is the outcome and
# every other column of data_train starts out as a predictor. The steps are
# applied in the order written.
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  # Keep the id column in the data but give it the "id" role instead of
  # leaving it as a predictor
  update_role(dismissal_dataset_id, new_role = "id") %>%
  # Free-text notes: tokenize, keep at most 100 tokens, then compute tf-idf
  # features (one tfidf_notes_<token> column per kept token)
  step_tokenize(notes) %>% 
  step_tokenfilter(notes, max_tokens = 100) %>%
  step_tfidf(notes) %>%
  # Pool rare company and executive names into an "other" level
  step_other(coname, exec_fullname) %>%
  # Dummy-encode the remaining nominal predictors
  step_dummy(all_nominal_predictors()) %>%  
  # SMOTE over-sampling on the outcome; placed after the encoding steps,
  # presumably because SMOTE needs numeric predictors (confirm in themis docs)
  step_smote(ceo_dismissal)

# Preview the preprocessed training data. juice() is superseded in recipes;
# bake(new_data = NULL) returns the same processed training set.
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 74
## Columns: 142
## $ dismissal_dataset_id     <int> 8884, 1741, 3459, 830, 7874, 7747, 3138, 235,…
## $ ceo_dismissal            <fct> dismissed, dismissed, dismissed, dismissed, d…
## $ tfidf_notes_1            <dbl> 0.00000000, 0.06154502, 0.00000000, 0.0000000…
## $ tfidf_notes_11           <dbl> 0.00000000, 0.00000000, 0.14526368, 0.0000000…
## $ tfidf_notes_1994         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_1997         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_1999         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_2012         <dbl> 0.00000000, 0.13923818, 0.00000000, 0.0000000…
## $ tfidf_notes_3            <dbl> 0.00000000, 0.03636371, 0.00000000, 0.0000000…
## $ tfidf_notes_a            <dbl> 0.00000000, 0.04909308, 0.00000000, 0.0000000…
## $ tfidf_notes_acquired     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_acquisition  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_after        <dbl> 0.11514806, 0.00000000, 0.09090637, 0.0000000…
## $ tfidf_notes_also         <dbl> 0.00000000, 0.03325362, 0.00000000, 0.0000000…
## $ tfidf_notes_an           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_and          <dbl> 0.05825968, 0.02730922, 0.04599448, 0.0397225…
## $ tfidf_notes_announced    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_appointed    <dbl> 0.00000000, 0.04312516, 0.00000000, 0.0000000…
## $ tfidf_notes_are          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_as           <dbl> 0.000000000, 0.046920648, 0.000000000, 0.0000…
## $ tfidf_notes_at           <dbl> 0.00000000, 0.02481501, 0.00000000, 0.0721891…
## $ tfidf_notes_axa          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_based        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_notes_be           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_been         <dbl> 0.00000000, 0.08094585, 0.00000000, 0.0000000…
## $ tfidf_notes_before       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_board        <dbl> 0.00000000, 0.03884980, 0.00000000, 0.0000000…
## $ tfidf_notes_by           <dbl> 0.00000000, 0.02549089, 0.08586404, 0.0741553…
## $ tfidf_notes_ceo          <dbl> 0.15684284, 0.01838002, 0.06191165, 0.0000000…
## $ tfidf_notes_chairman     <dbl> 0.00000000, 0.03676004, 0.06191165, 0.0000000…
## $ tfidf_notes_changed      <dbl> 0.00000000, 0.03636371, 0.00000000, 0.1057853…
## $ tfidf_notes_chief        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_co           <dbl> 0.00000000, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_company      <dbl> 0.14892819, 0.00000000, 0.00000000, 0.0000000…
## $ `tfidf_notes_company’s`  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_continue     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_corporation  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_director     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_directors    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1443782…
## $ tfidf_notes_down         <dbl> 0.11184308, 0.05242644, 0.00000000, 0.0000000…
## $ tfidf_notes_due          <dbl> 0.16323593, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_effective    <dbl> 0.00000000, 0.03194931, 0.00000000, 0.0000000…
## $ tfidf_notes_energy       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_executive    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_financial    <dbl> 0.00000000, 0.00000000, 0.12248830, 0.0000000…
## $ tfidf_notes_for          <dbl> 0.00000000, 0.02150381, 0.00000000, 0.0000000…
## $ tfidf_notes_from         <dbl> 0.00000000, 0.01980364, 0.06670701, 0.0000000…
## $ tfidf_notes_had          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1057853…
## $ tfidf_notes_has          <dbl> 0.00000000, 0.05744623, 0.00000000, 0.0000000…
## $ tfidf_notes_he           <dbl> 0.00000000, 0.00000000, 0.07943971, 0.0686070…
## $ tfidf_notes_his          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1372140…
## $ tfidf_notes_illinois     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_in           <dbl> 0.00000000, 0.06077881, 0.00000000, 0.0884055…
## $ tfidf_notes_inc          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_iowa         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_is           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0809357…
## $ tfidf_notes_it           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_its          <dbl> 0.00000000, 0.05564332, 0.00000000, 0.0000000…
## $ tfidf_notes_j            <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_management   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_march        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_may          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_mcmahon      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_member       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_million      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_mr           <dbl> 0.00000000, 0.00000000, 0.08586404, 0.0000000…
## $ tfidf_notes_named        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_net          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_not          <dbl> 0.17268448, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_of           <dbl> 0.11229103, 0.07895463, 0.08865082, 0.0000000…
## $ tfidf_notes_officer      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_on           <dbl> 0.00000000, 0.00000000, 0.07574825, 0.0000000…
## $ tfidf_notes_over         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_performance  <dbl> 0.16323593, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_position     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_president    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_resignation  <dbl> 0.00000000, 0.00000000, 0.11201220, 0.0000000…
## $ tfidf_notes_resigned     <dbl> 0.00000000, 0.00000000, 0.12248830, 0.0000000…
## $ tfidf_notes_retired      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_retirement   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_notes_said         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_served       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_notes_share        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_shareholders <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_since        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_step         <dbl> 0.00000000, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_stepped      <dbl> 0.16323593, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_than         <dbl> 0.00000000, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_that         <dbl> 0.00000000, 0.03611783, 0.00000000, 0.1576051…
## $ tfidf_notes_the          <dbl> 0.05236803, 0.06136879, 0.12402955, 0.0357054…
## $ tfidf_notes_this         <dbl> 0.00000000, 0.03194931, 0.00000000, 0.0929434…
## $ tfidf_notes_three        <dbl> 0.00000000, 0.04047292, 0.00000000, 0.0000000…
## $ tfidf_notes_to           <dbl> 0.06483073, 0.03038941, 0.00000000, 0.0884055…
## $ tfidf_notes_until        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1057853…
## $ tfidf_notes_vice         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_was          <dbl> 0.00000000, 0.01942490, 0.06543124, 0.0565088…
## $ tfidf_notes_which        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_who          <dbl> 0.00000000, 0.03636371, 0.00000000, 0.0000000…
## $ tfidf_notes_will         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_with         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_year         <dbl> 0.00000000, 0.02872312, 0.00000000, 0.0000000…
## $ tfidf_notes_years        <dbl> 0.00000000, 0.02872312, 0.00000000, 0.0000000…
## $ coname_other             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname_other      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb_X2       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tenure_no_ceodb_X3       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X4      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1988         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1990         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1991         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1992         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1993         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1994         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone_X1995         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1996         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1997         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1998         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone_X1999         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ fyear_gone_X2000         <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2001         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2002         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2003         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2004         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2005         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2006         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2007         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2008         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2009         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2010         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone_X2011         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2012         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2013         <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2014         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2015         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2016         <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2017         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2018         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2019         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ fyear_gone_X2020         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2021         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Specify Model

# Boosted-tree classifier on the xgboost engine; the four hyperparameters
# marked with tune() are left open for the tuning grid.
xgboost_spec <- boost_tree(
  learn_rate = tune(),
  min_n = tune(),
  tree_depth = tune(),
  trees = tune()
) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Bundle the preprocessing recipe and the model specification
xgboost_workflow <- workflow() %>%
  add_recipe(recipe = xgboost_rec) %>%
  add_model(spec = xgboost_spec)

Tune hyperparameters

# Register a parallel backend before tuning
doParallel::registerDoParallel()

# Seed the RNG before the tuning run
set.seed(65743)

# Evaluate a grid of 10 candidate hyperparameter sets on the CV folds
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 10)

collect_metrics(xgboost_tune)
## # A tibble: 30 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1  1870     5          9    0.0481  accuracy    binary     0.546    10  0.0662
##  2  1870     5          9    0.0481  brier_class binary     0.299    10  0.0422
##  3  1870     5          9    0.0481  roc_auc     binary     0.568    10  0.0762
##  4   963     7         14    0.291   accuracy    binary     0.592    10  0.0515
##  5   963     7         14    0.291   brier_class binary     0.273    10  0.0277
##  6   963     7         14    0.291   roc_auc     binary     0.521    10  0.0858
##  7  1077    11         10    0.00312 accuracy    binary     0.5      10  0     
##  8  1077    11         10    0.00312 brier_class binary     0.25     10  0     
##  9  1077    11         10    0.00312 roc_auc     binary     0.5      10  0     
## 10   643    14         11    0.00413 accuracy    binary     0.5      10  0     
## # ℹ 20 more rows
## # ℹ 1 more variable: .config <chr>