The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. The goal is to predict whether a departure was a dismissal (ceo_dismissal) using the departures dataset.

Import Data

# Load the TidyTuesday CEO departures CSV straight from GitHub
data <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv"
)

Explore Data

# Per-column overview of the raw data: types, missingness and distributions
skimr::skim(data)

Data summary
Name	data
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	9
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
leftofc	1802	0.81	20	20	3627
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
X_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Issues With The Data

Missing Values

interim_coceo (97% missing)
still_there (78% missing)
eight_ks (48% missing)

Factors or Numeric Variables

departure_code (categorical but currently numeric)
interim_coceo (needs to be a factor)
leftofc (needs to be a factor)
still_there (needs to be a factor)

0 Variance Variables

X_merge

Character Names

coname
exec_fullname
sources

Unbalanced Target Variable

ceo_dismissal

ID Variable

dismissal_dataset_id
gvkey
cik

Data cleaning

# Build the modelling table: keep rows with a known outcome, recode the
# target, drop unusable columns, and fix column types.
data_clean <- data %>%
  # Rows without a recorded outcome cannot be used for supervised learning
  filter(!is.na(ceo_dismissal)) %>%

  # Recode the 1/0 target as a labelled factor
  mutate(
    ceo_dismissal = as.factor(
      if_else(ceo_dismissal == 1, "dismissed", "not_dis")
    )
  ) %>%

  select(
    # Mostly-missing columns
    -c(interim_coceo, still_there, eight_ks),
    # Columns judged to carry no predictive information
    -c(X_merge, sources),
    # Only known after the departure has been classified
    -departure_code,
    # Redundant identifiers and date fields
    -c(gvkey, cik, co_per_rol, leftofc, fyear)
  ) %>%

  # dismissal_dataset_id is the id variable: keep one row per id
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%

  # Drop the implausible departure year (2997)
  filter(fyear_gone < 2025) %>%

  mutate(
    # Numeric columns that are treated as categories
    across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor),
    # Character columns become factors ...
    across(where(is.character), as.factor),
    # ... except the free-text notes, which stay character
    notes = as.character(notes)
  ) %>%

  # Drop any row that still has a missing value
  na.omit()

# Re-run the per-column overview on the cleaned data to confirm the result
skimr::skim(data_clean)

Data summary
Name	data_clean
Number of rows	7458
Number of columns	8
_______________________
Column type frequency:
character	1
factor	6
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
notes	0	1	5	3117	0	7448	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
coname	1	FALSE	3427	BAR: 8, CLA: 8, FED: 8, NTN: 8
exec_fullname	1	FALSE	6961	Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal	1	FALSE	2	not: 5976, dis: 1482
tenure_no_ceodb	1	FALSE	3	1: 7274, 2: 177, 3: 7
max_tenure_ceodb	1	FALSE	4	1: 7123, 2: 317, 3: 15, 4: 3
fyear_gone	1	FALSE	34	200: 378, 199: 350, 200: 332, 200: 320

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1	5570.24	25786.43	1	2170.25	4321.5	6575.75	559044	▇▁▁▁▁

Data Exploration

# Class balance of the target: number of rows per dismissal outcome
ggplot(data_clean, aes(x = ceo_dismissal)) +
  geom_bar() +
  labs(
    title = "CEO Dismissal Count",
    x = "CEO Dismissal",
    y = "Count"
  )

# Box plot of tenure_no_ceodb by dismissal outcome.
# tenure_no_ceodb is stored as a factor in data_clean, but a box plot needs a
# continuous y. Convert the factor labels back to numbers for this plot only;
# as.character() first so the labels are used, not the internal level codes.
data_clean %>%
  mutate(tenure_no_ceodb = as.numeric(as.character(tenure_no_ceodb))) %>%
  ggplot(aes(x = ceo_dismissal, y = tenure_no_ceodb)) +
  geom_boxplot() +
  labs(title = "CEO Dismissal vs. Tenure", x = "CEO Dismissal", y = "CEO Tenure")

# One-hot encode every remaining column for the correlation funnel.
# The id and the free-text notes are left out.
data_binarized <- data_clean %>%
  select(-c(dismissal_dataset_id, notes)) %>%
  binarize()

glimpse(data_binarized)

## Rows: 7,458
## Columns: 40
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Correlate every binarized feature with the "not dismissed" indicator

correlation_results <- correlate(
  data_binarized,
  target = ceo_dismissal__not_dis
)

correlation_results

## # A tibble: 40 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 ceo_dismissal    dismissed     -1     
##  2 ceo_dismissal    not_dis        1     
##  3 max_tenure_ceodb 1             -0.0577
##  4 max_tenure_ceodb 2              0.0533
##  5 fyear_gone       1999           0.0390
##  6 fyear_gone       2002          -0.0378
##  7 fyear_gone       2003          -0.0303
##  8 fyear_gone       2009          -0.0292
##  9 fyear_gone       2008          -0.0261
## 10 fyear_gone       1997           0.0255
## # ℹ 30 more rows

# Visualise the correlations as a funnel
correlationfunnel::plot_correlation_funnel(correlation_results) +
  labs(title = "Correlation Funnel for CEO Dismissal")

## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Sample to smaller size

# Down-sample to 50 rows per outcome class to keep tuning fast.
# Seed the RNG first: without it a different sample is drawn on every run,
# so nothing downstream would be reproducible.
set.seed(1234)

# slice_sample() replaces the superseded sample_n(); on grouped data it draws
# n rows from each group.
data_clean <- data_clean %>%
  group_by(ceo_dismissal) %>%
  slice_sample(n = 50) %>%
  ungroup()

Split Data

# Fix the RNG state so the split and the folds below are reproducible
set.seed(1234)

# Hold out a test set, stratified on the outcome
data_split <- data_clean %>%
  initial_split(strata = ceo_dismissal)

data_train <- training(data_split)
data_test <- testing(data_split)

# Stratified cross-validation folds built from the training portion only
data_cv <- data_train %>%
  vfold_cv(strata = ceo_dismissal)

data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [66/8]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [66/8]> Fold04
##  5 <split [66/8]> Fold05
##  6 <split [66/8]> Fold06
##  7 <split [66/8]> Fold07
##  8 <split [68/6]> Fold08
##  9 <split [68/6]> Fold09
## 10 <split [68/6]> Fold10

Preprocess Data

# Preprocessing recipe: ceo_dismissal is the outcome, every other column of
# data_train starts out as a predictor.
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  # Give the row identifier the "id" role instead of the predictor role
  update_role(dismissal_dataset_id, new_role = "id") %>%
  # Text features: tokenize the notes, keep at most 100 tokens, then replace
  # them with tf-idf columns (named tfidf_notes_*)
  step_tokenize(notes) %>% 
  step_tokenfilter(notes, max_tokens = 100) %>%
  step_tfidf(notes) %>%
  # Pool infrequent company / executive names into an "other" level
  step_other(coname, exec_fullname) %>%
  # Turn the remaining nominal predictors into indicator columns
  step_dummy(all_nominal_predictors()) %>%  
  # Apply SMOTE on the outcome to balance the classes; kept after step_dummy
  # so it sees the indicator columns rather than factors
  step_smote(ceo_dismissal)

# Estimate the recipe on the training data and inspect the processed result.
# bake(new_data = NULL) returns the processed training set and replaces the
# superseded juice().
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 74
## Columns: 142
## $ dismissal_dataset_id     <int> 8884, 1741, 3459, 830, 7874, 7747, 3138, 235,…
## $ ceo_dismissal            <fct> dismissed, dismissed, dismissed, dismissed, d…
## $ tfidf_notes_1            <dbl> 0.00000000, 0.06154502, 0.00000000, 0.0000000…
## $ tfidf_notes_11           <dbl> 0.00000000, 0.00000000, 0.14526368, 0.0000000…
## $ tfidf_notes_1994         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_1997         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_1999         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_2012         <dbl> 0.00000000, 0.13923818, 0.00000000, 0.0000000…
## $ tfidf_notes_3            <dbl> 0.00000000, 0.03636371, 0.00000000, 0.0000000…
## $ tfidf_notes_a            <dbl> 0.00000000, 0.04909308, 0.00000000, 0.0000000…
## $ tfidf_notes_acquired     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_acquisition  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_after        <dbl> 0.11514806, 0.00000000, 0.09090637, 0.0000000…
## $ tfidf_notes_also         <dbl> 0.00000000, 0.03325362, 0.00000000, 0.0000000…
## $ tfidf_notes_an           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_and          <dbl> 0.05825968, 0.02730922, 0.04599448, 0.0397225…
## $ tfidf_notes_announced    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_appointed    <dbl> 0.00000000, 0.04312516, 0.00000000, 0.0000000…
## $ tfidf_notes_are          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_as           <dbl> 0.000000000, 0.046920648, 0.000000000, 0.0000…
## $ tfidf_notes_at           <dbl> 0.00000000, 0.02481501, 0.00000000, 0.0721891…
## $ tfidf_notes_axa          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_based        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_notes_be           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_been         <dbl> 0.00000000, 0.08094585, 0.00000000, 0.0000000…
## $ tfidf_notes_before       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_board        <dbl> 0.00000000, 0.03884980, 0.00000000, 0.0000000…
## $ tfidf_notes_by           <dbl> 0.00000000, 0.02549089, 0.08586404, 0.0741553…
## $ tfidf_notes_ceo          <dbl> 0.15684284, 0.01838002, 0.06191165, 0.0000000…
## $ tfidf_notes_chairman     <dbl> 0.00000000, 0.03676004, 0.06191165, 0.0000000…
## $ tfidf_notes_changed      <dbl> 0.00000000, 0.03636371, 0.00000000, 0.1057853…
## $ tfidf_notes_chief        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_co           <dbl> 0.00000000, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_company      <dbl> 0.14892819, 0.00000000, 0.00000000, 0.0000000…
## $ `tfidf_notes_company’s`  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_continue     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_corporation  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_director     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_directors    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1443782…
## $ tfidf_notes_down         <dbl> 0.11184308, 0.05242644, 0.00000000, 0.0000000…
## $ tfidf_notes_due          <dbl> 0.16323593, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_effective    <dbl> 0.00000000, 0.03194931, 0.00000000, 0.0000000…
## $ tfidf_notes_energy       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_executive    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_financial    <dbl> 0.00000000, 0.00000000, 0.12248830, 0.0000000…
## $ tfidf_notes_for          <dbl> 0.00000000, 0.02150381, 0.00000000, 0.0000000…
## $ tfidf_notes_from         <dbl> 0.00000000, 0.01980364, 0.06670701, 0.0000000…
## $ tfidf_notes_had          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1057853…
## $ tfidf_notes_has          <dbl> 0.00000000, 0.05744623, 0.00000000, 0.0000000…
## $ tfidf_notes_he           <dbl> 0.00000000, 0.00000000, 0.07943971, 0.0686070…
## $ tfidf_notes_his          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1372140…
## $ tfidf_notes_illinois     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_in           <dbl> 0.00000000, 0.06077881, 0.00000000, 0.0884055…
## $ tfidf_notes_inc          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_iowa         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_is           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0809357…
## $ tfidf_notes_it           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_its          <dbl> 0.00000000, 0.05564332, 0.00000000, 0.0000000…
## $ tfidf_notes_j            <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_management   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_march        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_may          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_mcmahon      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_member       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_million      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_mr           <dbl> 0.00000000, 0.00000000, 0.08586404, 0.0000000…
## $ tfidf_notes_named        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_net          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_not          <dbl> 0.17268448, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_of           <dbl> 0.11229103, 0.07895463, 0.08865082, 0.0000000…
## $ tfidf_notes_officer      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_on           <dbl> 0.00000000, 0.00000000, 0.07574825, 0.0000000…
## $ tfidf_notes_over         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_performance  <dbl> 0.16323593, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_position     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_president    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_resignation  <dbl> 0.00000000, 0.00000000, 0.11201220, 0.0000000…
## $ tfidf_notes_resigned     <dbl> 0.00000000, 0.00000000, 0.12248830, 0.0000000…
## $ tfidf_notes_retired      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_retirement   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_notes_said         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_served       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_notes_share        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_shareholders <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_since        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_step         <dbl> 0.00000000, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_stepped      <dbl> 0.16323593, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_than         <dbl> 0.00000000, 0.03825842, 0.00000000, 0.0000000…
## $ tfidf_notes_that         <dbl> 0.00000000, 0.03611783, 0.00000000, 0.1576051…
## $ tfidf_notes_the          <dbl> 0.05236803, 0.06136879, 0.12402955, 0.0357054…
## $ tfidf_notes_this         <dbl> 0.00000000, 0.03194931, 0.00000000, 0.0929434…
## $ tfidf_notes_three        <dbl> 0.00000000, 0.04047292, 0.00000000, 0.0000000…
## $ tfidf_notes_to           <dbl> 0.06483073, 0.03038941, 0.00000000, 0.0884055…
## $ tfidf_notes_until        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.1057853…
## $ tfidf_notes_vice         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_notes_was          <dbl> 0.00000000, 0.01942490, 0.06543124, 0.0565088…
## $ tfidf_notes_which        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_who          <dbl> 0.00000000, 0.03636371, 0.00000000, 0.0000000…
## $ tfidf_notes_will         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_with         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_notes_year         <dbl> 0.00000000, 0.02872312, 0.00000000, 0.0000000…
## $ tfidf_notes_years        <dbl> 0.00000000, 0.02872312, 0.00000000, 0.0000000…
## $ coname_other             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname_other      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb_X2       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tenure_no_ceodb_X3       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X4      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1988         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1990         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1991         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1992         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1993         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1994         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone_X1995         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1996         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1997         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X1998         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone_X1999         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ fyear_gone_X2000         <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2001         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2002         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2003         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2004         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2005         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2006         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2007         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2008         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2009         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2010         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone_X2011         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2012         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2013         <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2014         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2015         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2016         <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2017         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2018         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2019         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ fyear_gone_X2020         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone_X2021         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Specify Model

# Boosted-tree classifier on the xgboost engine; the four hyperparameters
# below are left as tune() placeholders.
xgboost_spec <- boost_tree(
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the preprocessing recipe and the model specification into one workflow
xgboost_workflow <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_rec)

Tune hyperparameters

# Register a parallel backend before tuning
doParallel::registerDoParallel()

# Seed the RNG before tuning
set.seed(65743)

# Evaluate 10 candidate hyperparameter combinations on the CV folds
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 10)

# Resampled performance for each candidate
collect_metrics(xgboost_tune)

## # A tibble: 30 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1  1870     5          9    0.0481  accuracy    binary     0.546    10  0.0662
##  2  1870     5          9    0.0481  brier_class binary     0.299    10  0.0422
##  3  1870     5          9    0.0481  roc_auc     binary     0.568    10  0.0762
##  4   963     7         14    0.291   accuracy    binary     0.592    10  0.0515
##  5   963     7         14    0.291   brier_class binary     0.273    10  0.0277
##  6   963     7         14    0.291   roc_auc     binary     0.521    10  0.0858
##  7  1077    11         10    0.00312 accuracy    binary     0.5      10  0     
##  8  1077    11         10    0.00312 brier_class binary     0.25     10  0     
##  9  1077    11         10    0.00312 roc_auc     binary     0.5      10  0     
## 10   643    14         11    0.00413 accuracy    binary     0.5      10  0     
## # ℹ 20 more rows
## # ℹ 1 more variable: .config <chr>

Apply it to your data 6

Nils Skogestig

2025-03-06