The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. Goal is to predict CEO departure (ceo_dismissal) by using the departures dataset.

Import Data

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'scales' was built under R version 4.3.1
## Warning: package 'infer' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.3.1
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.1
library(correlationfunnel)
## ══ correlationfunnel Tip #3 ════════════════════════════════════════════════════
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)
library(usemodels)

data <- read.csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv")

Explore Data

skimr::skim(data)
Data summary
Name data
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 9
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
leftofc 1802 0.81 20 20 0 3627 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
X_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Issues with the data:

Missing Values:

* interim_coceo (91% missing)
* eight_ks (52% missing)

Factors or Numeric Variables:

* departure_code (categorical but currently numeric)
* interim_coceo (needs to be a factor)
* leftofc (needs to be a factor)
* still_there (needs to be a factor)

Zero Variance Variables:

* X-merge

Character Variables:

* coname 
* exec_fullname
* sources

Unbalanced Target Variable:

* ceo_dismissal

ID Variable:

* dismissal_dataset_id
* gvkey
* cik

Clean Data

# Clean the data and ensure ceo_dismissal is a factor
data_clean <- data %>%
    mutate(leftofc = as.Date(leftofc, format = "%Y-%m-%dT"))

# Identify factor variables
factors_vec <- c("departure_code", "leftofc")

data_clean <- data_clean %>%
   
    # Convert ceo_dismissal and factors to proper types
    mutate(ceo_dismissal = as.factor(ceo_dismissal),
           across(all_of(factors_vec), as.factor)) %>%
    
    # Remove rows with missing values in key columns
    drop_na(ceo_dismissal, tenure_no_ceodb, fyear_gone, departure_code, leftofc) %>%
    
    # Drop zero-variance and irrelevant columns
    select(-c(X_merge, interim_coceo, eight_ks, dismissal_dataset_id, gvkey, cik, coname, exec_fullname, sources, still_there, leftofc))

skimr::skim(data_clean)
Data summary
Name data_clean
Number of rows 7478
Number of columns 8
_______________________
Column type frequency:
character 1
factor 2
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
notes 17 1 5 3117 0 7451 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
departure_code 0 1 FALSE 7 5: 3577, 7: 2030, 3: 1316, 4: 194
ceo_dismissal 0 1 FALSE 2 0: 5994, 1: 1484

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
fyear 0 1 2005.61 7.45 1987 1999.0 2006.0 2012.00 2020 ▁▇▆▇▆
co_per_rol 0 1 21446.28 16354.08 -1 6970.5 18267.5 33422.75 64601 ▇▅▅▂▁
tenure_no_ceodb 0 1 1.03 0.16 1 1.0 1.0 1.00 3 ▇▁▁▁▁
max_tenure_ceodb 0 1 1.05 0.23 1 1.0 1.0 1.00 4 ▇▁▁▁▁
fyear_gone 0 1 2006.54 13.69 1980 2000.0 2006.0 2013.00 2997 ▇▁▁▁▁

Explore Data

# Bar plot for CEO Dismissal
data_clean %>%
    ggplot(aes(ceo_dismissal)) + 
    geom_bar() +
    labs(title = "CEO Dismissal Count", x = "CEO Dismissal", y = "Count")

data_clean %>%    
    ggplot(aes(x = ceo_dismissal, y = tenure_no_ceodb)) + 
    geom_boxplot() +
    labs(title = "CEO Dismissal vs. Tenure", x = "CEO Dismissal", y = "CEO Tenure")

data_clean <- data_clean %>%
    drop_na(notes)

# Step 1: Binarize the data
data_binarized <- data_clean %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 7,461
## Columns: 29
## $ `fyear__-Inf_1999`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ fyear__1999_2006                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ fyear__2006_2012                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ fyear__2012_Inf                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …
## $ `co_per_rol__-Inf_6978`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ co_per_rol__6978_18259                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  <dbl> …
## $ co_per_rol__18259_33390                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ co_per_rol__33390_Inf                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ departure_code__1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ departure_code__2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ departure_code__3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ departure_code__4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ departure_code__5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ departure_code__6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ departure_code__7                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ ceo_dismissal__0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ ceo_dismissal__1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ tenure_no_ceodb__1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ tenure_no_ceodb__2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ `tenure_no_ceodb__-OTHER`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ max_tenure_ceodb__1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ max_tenure_ceodb__2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ `max_tenure_ceodb__-OTHER`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              <dbl> …
## $ `fyear_gone__-Inf_2000`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ fyear_gone__2000_2006                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ fyear_gone__2006_2013                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ fyear_gone__2013_Inf                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <dbl> …
## $ `notes__Constantine_S._Macricostas_is_Chairman_of_the_Board_and_founder_of_the_Company._Mr._Macricostas_was_Executive_Chairman_of_the_Company_until_January_20,_2018._Mr._Macricostas_previously_served_as_Chief_Executive_Officer_of_the_Company_on_three_different_occasions_from_1974_until_August_1997,_from_February_2004_to_June_2005,_and_from_April_2009_until_May_2015._Mr._Macricostas_is_a_former_director_of_RagingWire_Data_Centers,_Inc.,_(“RagingWire”)._Mr._Macricostas_is_the_father_of_George_Macricostas._Mr._Macricostas’_knowledge_of_the_Company_and_its_operations,_as_well_as,_the_industry_is_invaluable_to_the_Board_of_Directors_in_evaluating_and_directing_the_Company’s_future._Through_his_long_service_to_the_Company_and_experience_in_the_photomask_industry,_he_has_developed_extensive_knowledge_in_the_areas_of_leadership,_safety,_risk_oversight,_management,_and_corporate_governance,_each_of_which_provides_great_value_to_the_Board_of_Directors._Mr._Macricostas_is_a_member_of_the_Cyber_Security_Committee_of_the_Board.` <dbl> …
## $ `notes__-OTHER`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …
# Correlation for both categories of ceo_dismissal
data_correlation_0 <- data_binarized %>%
    correlate(`ceo_dismissal__0`)  # Correlation for one class

data_correlation_other <- data_binarized %>%
    correlate(`ceo_dismissal__1`)  # Correlation for the other class

# Display the correlation results
data_correlation_0
## # A tibble: 29 × 3
##    feature        bin       correlation
##    <fct>          <chr>           <dbl>
##  1 ceo_dismissal  0              1     
##  2 ceo_dismissal  1             -1     
##  3 departure_code 3             -0.929 
##  4 departure_code 5              0.477 
##  5 departure_code 7              0.304 
##  6 departure_code 4             -0.273 
##  7 departure_code 6              0.0788
##  8 fyear          -Inf_1999      0.0782
##  9 co_per_rol     -Inf_6978      0.0596
## 10 fyear_gone     -Inf_2000      0.0591
## # ℹ 19 more rows
data_correlation_other
## # A tibble: 29 × 3
##    feature        bin       correlation
##    <fct>          <chr>           <dbl>
##  1 ceo_dismissal  0             -1     
##  2 ceo_dismissal  1              1     
##  3 departure_code 3              0.929 
##  4 departure_code 5             -0.477 
##  5 departure_code 7             -0.304 
##  6 departure_code 4              0.273 
##  7 departure_code 6             -0.0788
##  8 fyear          -Inf_1999     -0.0782
##  9 co_per_rol     -Inf_6978     -0.0596
## 10 fyear_gone     -Inf_2000     -0.0591
## # ℹ 19 more rows
# Step 3: Plot the correlation funnel
data_correlation_0 %>% 
    correlationfunnel::plot_correlation_funnel() +
    labs(title = "Correlation Funnel for CEO Dismissal (Class 0)")

data_correlation_other %>% 
    correlationfunnel::plot_correlation_funnel() +
    labs(title = "Correlation Funnel for CEO Dismissal (Class Other)")

Model Building

Split Data

library(tidymodels)

# Set seed for reproducibility
set.seed(1234)

# Split the data into training and testing sets
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

# Create cross-validation sets for the training data
data_cv <- vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5034/561]> Fold01
##  2 <split [5034/561]> Fold02
##  3 <split [5035/560]> Fold03
##  4 <split [5036/559]> Fold04
##  5 <split [5036/559]> Fold05
##  6 <split [5036/559]> Fold06
##  7 <split [5036/559]> Fold07
##  8 <split [5036/559]> Fold08
##  9 <split [5036/559]> Fold09
## 10 <split [5036/559]> Fold10

Preprocess data

library(themis)

#data_train <- data_train %>%
    #mutate(leftofc = as.Date(as.character(leftofc), format = "%Y-%m-%d"))

xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
    step_dummy(all_nominal_predictors(), -notes) %>%
    step_tokenize(notes) %>% 
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes) %>%  
    #step_date(leftofc, features = c("year", "month", "doy")) %>%
    #step_mutate(leftofc_month = as.numeric(leftofc_month)) %>%
    #step_select(-leftofc) %>%
    step_smote(ceo_dismissal)

xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 8,966
## Columns: 114
## $ fyear                   <dbl> 1997, 1998, 2001, 1997, 1997, 1993, 2007, 1993…
## $ co_per_rol              <dbl> 1, 6, 11, 16, 22, 33, 34, 43, 60, 66, 68, 71, …
## $ tenure_no_ceodb         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ max_tenure_ceodb        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ fyear_gone              <dbl> 1998, 1998, 2002, 1997, 1998, 1995, 2007, 1993…
## $ departure_code_X2       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X3       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X4       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X5       <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1…
## $ departure_code_X6       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ departure_code_X7       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ departure_code_X8       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X9       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_1           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_1998        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.19367862…
## $ tfidf_notes_1999        <dbl> 0.0000000, 0.1188082, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_a           <dbl> 0.00000000, 0.04092227, 0.00000000, 0.00000000…
## $ tfidf_notes_acquisition <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_after       <dbl> 0.07456141, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_agreement   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_also        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_an          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_and         <dbl> 0.03621827, 0.10412753, 0.10412753, 0.05553468…
## $ tfidf_notes_announced   <dbl> 0.07328246, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_april       <dbl> 0.00000000, 0.00000000, 0.16949346, 0.00000000…
## $ tfidf_notes_as          <dbl> 0.00000000, 0.00000000, 0.05955360, 0.12704769…
## $ tfidf_notes_at          <dbl> 0.00000000, 0.07159414, 0.00000000, 0.00000000…
## $ tfidf_notes_based       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_be          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_been        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_billion     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_but         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_by          <dbl> 0.00000000, 0.07115023, 0.00000000, 0.00000000…
## $ tfidf_notes_ceo         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_chairman    <dbl> 0.05129030, 0.04915320, 0.14745960, 0.07864512…
## $ tfidf_notes_chief       <dbl> 0.05185503, 0.04969441, 0.07454161, 0.07951105…
## $ tfidf_notes_co          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_company     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ `tfidf_notes_company's` <dbl> 0.10256568, 0.09829211, 0.00000000, 0.00000000…
## $ tfidf_notes_corp        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corporation <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_december    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_director    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_directors   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_down        <dbl> 0.00000000, 0.00000000, 0.12388487, 0.00000000…
## $ tfidf_notes_effective   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_executive   <dbl> 0.04908427, 0.04703909, 0.07055863, 0.07526254…
## $ tfidf_notes_financial   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_following   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_for         <dbl> 0.00000000, 0.05607810, 0.00000000, 0.08972496…
## $ tfidf_notes_from        <dbl> 0.00000000, 0.05609376, 0.00000000, 0.00000000…
## $ tfidf_notes_group       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_had         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_has         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_have        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_he          <dbl> 0.05600000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_his         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.09445415…
## $ tfidf_notes_in          <dbl> 0.04290549, 0.04111776, 0.12335328, 0.00000000…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_into        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_is          <dbl> 0.00000000, 0.00000000, 0.10635525, 0.00000000…
## $ tfidf_notes_it          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_its         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_january     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_july        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_june        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_march       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_may         <dbl> 0.1110278, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_merger      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_million     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_more        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_mr          <dbl> 0.06692910, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_named       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_new         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_not         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_of          <dbl> 0.07027907, 0.06735077, 0.05051308, 0.00000000…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.00000000, 0.08210740, 0.00000000…
## $ tfidf_notes_on          <dbl> 0.05791901, 0.00000000, 0.00000000, 0.08880914…
## $ tfidf_notes_operating   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_over        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_position    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.00000000, 0.08309839, 0.08863828…
## $ tfidf_notes_resigned    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.1501982, 0.…
## $ tfidf_notes_retire      <dbl> 0.09754320, 0.09347890, 0.00000000, 0.00000000…
## $ tfidf_notes_retired     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retirement  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.15789109…
## $ tfidf_notes_role        <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0…
## $ tfidf_notes_said        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_served      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_share       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_since       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_stepped     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_stock       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_that        <dbl> 0.05968588, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_the         <dbl> 0.16607733, 0.15915744, 0.00000000, 0.00000000…
## $ tfidf_notes_this        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_time        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_to          <dbl> 0.00000000, 0.00000000, 0.05843937, 0.00000000…
## $ tfidf_notes_today       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_until       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.14938704…
## $ tfidf_notes_vice        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_was         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_when        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_which       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_who         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_will        <dbl> 0.00000000, 0.06472499, 0.00000000, 0.10355998…
## $ tfidf_notes_with        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_would       <dbl> 0.1229135, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_years       <dbl> 0.08768202, 0.08402860, 0.00000000, 0.00000000…
## $ ceo_dismissal           <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Specify Model

xgboost_spec <- 
  boost_tree(trees = tune(),
             tree_depth = tune(),
             min_n = tune(),
             learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec) 

Tune hyperparameters

doParallel::registerDoParallel()

set.seed(65743)

xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 10)
## Warning: package 'xgboost' was built under R version 4.3.3
xgboost_tune %>% collect_metrics()
## # A tibble: 30 × 10
##    trees min_n tree_depth learn_rate .metric    .estimator    mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>      <chr>        <dbl> <int>   <dbl>
##  1  1870     5          9    0.0481  accuracy   binary     0.997      10 7.75e-4
##  2  1870     5          9    0.0481  brier_cla… binary     0.00276    10 5.83e-4
##  3  1870     5          9    0.0481  roc_auc    binary     1.00       10 4.55e-5
##  4   963     7         14    0.291   accuracy   binary     0.997      10 6.78e-4
##  5   963     7         14    0.291   brier_cla… binary     0.00270    10 5.70e-4
##  6   963     7         14    0.291   roc_auc    binary     1.00       10 6.12e-5
##  7  1077    11         10    0.00312 accuracy   binary     0.996      10 6.95e-4
##  8  1077    11         10    0.00312 brier_cla… binary     0.00339    10 5.52e-4
##  9  1077    11         10    0.00312 roc_auc    binary     1.00       10 4.93e-5
## 10   643    14         11    0.00413 accuracy   binary     0.996      10 6.95e-4
## # ℹ 20 more rows
## # ℹ 1 more variable: .config <chr>