The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. Goal is to predict CEO departure (ceo_dismissal) by using the departures dataset.

Import Data

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.1

## Warning: package 'tidyr' was built under R version 4.3.1

## Warning: package 'dplyr' was built under R version 4.3.1

## Warning: package 'stringr' was built under R version 4.3.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.3.1

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10

## Warning: package 'broom' was built under R version 4.3.3

## Warning: package 'dials' was built under R version 4.3.1

## Warning: package 'scales' was built under R version 4.3.1

## Warning: package 'infer' was built under R version 4.3.1

## Warning: package 'modeldata' was built under R version 4.3.3

## Warning: package 'parsnip' was built under R version 4.3.1

## Warning: package 'recipes' was built under R version 4.3.1

## Warning: package 'rsample' was built under R version 4.3.1

## Warning: package 'tune' was built under R version 4.3.1

## Warning: package 'workflows' was built under R version 4.3.1

## Warning: package 'workflowsets' was built under R version 4.3.1

## Warning: package 'yardstick' was built under R version 4.3.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

library(textrecipes)

## Warning: package 'textrecipes' was built under R version 4.3.1

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.1

library(correlationfunnel)

## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)

library(usemodels)

data <- read.csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv")

Explore Data

skimr::skim(data)

Data summary
Name	data
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	9
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
leftofc	1802	0.81	20	20	3627
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
X_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Issues with the data:

Missing Values:

* interim_coceo (91% missing)
* eight_ks (52% missing)

Factors or Numeric Variables:

* departure_code (categorical but currently numeric)
* interim_coceo (needs to be a factor)
* leftofc (needs to be a factor)
* still_there (needs to be a factor)

Zero Variance Variables:

* X-merge

Character Variables:

* coname 
* exec_fullname
* sources

Unbalanced Target Variable:

* ceo_dismissal

ID Variable:

* dismissal_dataset_id
* gvkey
* cik

Clean Data

# Clean the data and ensure ceo_dismissal is a factor
data_clean <- data %>%
    mutate(leftofc = as.Date(leftofc, format = "%Y-%m-%dT"))

# Identify factor variables
factors_vec <- c("departure_code", "leftofc")

data_clean <- data_clean %>%
   
    # Convert ceo_dismissal and factors to proper types
    mutate(ceo_dismissal = as.factor(ceo_dismissal),
           across(all_of(factors_vec), as.factor)) %>%
    
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1,
                                   "dismissed", "not dismissed"))%>%
    
    # Remove rows with missing values in key columns
    drop_na(ceo_dismissal, tenure_no_ceodb, fyear_gone, departure_code, leftofc) %>%
    
    # Drop zero-variance and irrelevant columns
    select(-c(X_merge, interim_coceo, eight_ks, dismissal_dataset_id, gvkey, cik, coname, exec_fullname, sources, still_there, leftofc, departure_code))

skimr::skim(data_clean)

Data summary
Name	data_clean
Number of rows	7478
Number of columns	7
_______________________
Column type frequency:
character	2
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
ceo_dismissal	0	1	9	13	0	2	0
notes	17	1	5	3117	0	7451	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
fyear	1	2005.61	7.45	1987	1999.0	2006.0	2012.00	2020	▁▇▆▇▆
co_per_rol	1	21446.28	16354.08	-1	6970.5	18267.5	33422.75	64601	▇▅▅▂▁
tenure_no_ceodb	1	1.03	0.16	1	1.0	1.0	1.00	3	▇▁▁▁▁
max_tenure_ceodb	1	1.05	0.23	1	1.0	1.0	1.00	4	▇▁▁▁▁
fyear_gone	1	2006.54	13.69	1980	2000.0	2006.0	2013.00	2997	▇▁▁▁▁

Explore Data

# Bar plot for CEO Dismissal
data_clean %>%
    ggplot(aes(ceo_dismissal)) + 
    geom_bar() +
    labs(title = "CEO Dismissal Count", x = "CEO Dismissal", y = "Count")

data_clean %>%    
    ggplot(aes(x = ceo_dismissal, y = tenure_no_ceodb)) + 
    geom_boxplot() +
    labs(title = "CEO Dismissal vs. Tenure", x = "CEO Dismissal", y = "CEO Tenure")

data_clean <- data_clean %>%
    drop_na(notes)

# Step 1: Binarize the data
data_binarized <- data_clean %>%
    binarize()

data_binarized %>% glimpse()

## Rows: 7,461
## Columns: 22
## $ `fyear__-Inf_1999`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ fyear__1999_2006                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ fyear__2006_2012                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ fyear__2012_Inf                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …
## $ `co_per_rol__-Inf_6978`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ co_per_rol__6978_18259                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  <dbl> …
## $ co_per_rol__18259_33390                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ co_per_rol__33390_Inf                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ ceo_dismissal__dismissed                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <dbl> …
## $ ceo_dismissal__not_dismissed                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <dbl> …
## $ tenure_no_ceodb__1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ tenure_no_ceodb__2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ `tenure_no_ceodb__-OTHER`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ max_tenure_ceodb__1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ max_tenure_ceodb__2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ `max_tenure_ceodb__-OTHER`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              <dbl> …
## $ `fyear_gone__-Inf_2000`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ fyear_gone__2000_2006                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ fyear_gone__2006_2013                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ fyear_gone__2013_Inf                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <dbl> …
## $ `notes__Constantine_S._Macricostas_is_Chairman_of_the_Board_and_founder_of_the_Company._Mr._Macricostas_was_Executive_Chairman_of_the_Company_until_January_20,_2018._Mr._Macricostas_previously_served_as_Chief_Executive_Officer_of_the_Company_on_three_different_occasions_from_1974_until_August_1997,_from_February_2004_to_June_2005,_and_from_April_2009_until_May_2015._Mr._Macricostas_is_a_former_director_of_RagingWire_Data_Centers,_Inc.,_(“RagingWire”)._Mr._Macricostas_is_the_father_of_George_Macricostas._Mr._Macricostas’_knowledge_of_the_Company_and_its_operations,_as_well_as,_the_industry_is_invaluable_to_the_Board_of_Directors_in_evaluating_and_directing_the_Company’s_future._Through_his_long_service_to_the_Company_and_experience_in_the_photomask_industry,_he_has_developed_extensive_knowledge_in_the_areas_of_leadership,_safety,_risk_oversight,_management,_and_corporate_governance,_each_of_which_provides_great_value_to_the_Board_of_Directors._Mr._Macricostas_is_a_member_of_the_Cyber_Security_Committee_of_the_Board.` <dbl> …
## $ `notes__-OTHER`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …

# Correlation for both categories of ceo_dismissal
data_correlation_dismissed <- data_binarized %>%
    correlate(`ceo_dismissal__dismissed`)  # Correlation for one class

data_correlation_other <- data_binarized %>%
    correlate(`ceo_dismissal__not_dismissed`)  # Correlation for the other class

# Display the correlation results
data_correlation_dismissed

## # A tibble: 22 × 3
##    feature          bin           correlation
##    <fct>            <chr>               <dbl>
##  1 ceo_dismissal    dismissed          1     
##  2 ceo_dismissal    not_dismissed     -1     
##  3 fyear            -Inf_1999         -0.0782
##  4 co_per_rol       -Inf_6978         -0.0596
##  5 fyear_gone       -Inf_2000         -0.0591
##  6 max_tenure_ceodb 1                  0.0580
##  7 co_per_rol       33390_Inf          0.0561
##  8 max_tenure_ceodb 2                 -0.0535
##  9 fyear            1999_2006          0.0349
## 10 fyear            2006_2012          0.0304
## # ℹ 12 more rows

data_correlation_other

## # A tibble: 22 × 3
##    feature          bin           correlation
##    <fct>            <chr>               <dbl>
##  1 ceo_dismissal    dismissed         -1     
##  2 ceo_dismissal    not_dismissed      1     
##  3 fyear            -Inf_1999          0.0782
##  4 co_per_rol       -Inf_6978          0.0596
##  5 fyear_gone       -Inf_2000          0.0591
##  6 max_tenure_ceodb 1                 -0.0580
##  7 co_per_rol       33390_Inf         -0.0561
##  8 max_tenure_ceodb 2                  0.0535
##  9 fyear            1999_2006         -0.0349
## 10 fyear            2006_2012         -0.0304
## # ℹ 12 more rows

# Step 3: Plot the correlation funnel
data_correlation_dismissed %>% 
    correlationfunnel::plot_correlation_funnel() +
    labs(title = "Correlation Funnel for CEO Dismissal (Class 0)")

data_correlation_other %>% 
    correlationfunnel::plot_correlation_funnel() +
    labs(title = "Correlation Funnel for CEO Dismissal (Class Other)")

Model Building

Split Data

library(tidymodels)

# Set seed for reproducibility
set.seed(1234)

# Split the data into training and testing sets
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

# Create cross-validation sets for the training data
data_cv <- vfold_cv(data_train, strata = ceo_dismissal)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5034/561]> Fold01
##  2 <split [5034/561]> Fold02
##  3 <split [5035/560]> Fold03
##  4 <split [5036/559]> Fold04
##  5 <split [5036/559]> Fold05
##  6 <split [5036/559]> Fold06
##  7 <split [5036/559]> Fold07
##  8 <split [5036/559]> Fold08
##  9 <split [5036/559]> Fold09
## 10 <split [5036/559]> Fold10

Preprocess data

library(themis)

#data_train <- data_train %>%
    #mutate(leftofc = as.Date(as.character(leftofc), format = "%Y-%m-%d"))

xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
    step_dummy(all_nominal_predictors(), -notes) %>%
    step_tokenize(notes) %>% 
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes) %>%  
    #step_date(leftofc, features = c("year", "month", "doy")) %>%
    #step_mutate(leftofc_month = as.numeric(leftofc_month)) %>%
    #step_select(-leftofc) %>%
    step_smote(ceo_dismissal)

xgboost_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 8,966
## Columns: 106
## $ fyear                   <dbl> 1993, 1998, 1995, 2004, 1993, 1995, 1993, 2002…
## $ co_per_rol              <dbl> 50, 51, 82, 134, 176, 177, 190, 212, 226, 258,…
## $ tenure_no_ceodb         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ max_tenure_ceodb        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ fyear_gone              <dbl> 1993, 1998, 1996, 2005, 1993, 1996, 1993, 2003…
## $ tfidf_notes_1           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_1997        <dbl> 0.00000000, 0.20198814, 0.00000000, 0.00000000…
## $ tfidf_notes_1998        <dbl> 0.0000000, 0.2024027, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_a           <dbl> 0.08641255, 0.06754085, 0.00000000, 0.07165919…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_after       <dbl> 0.05079521, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_agreement   <dbl> 0.07868081, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_also        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_an          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_and         <dbl> 0.12254163, 0.14366950, 0.08928033, 0.08129591…
## $ tfidf_notes_announced   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_april       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_as          <dbl> 0.055999051, 0.065654060, 0.000000000, 0.02321…
## $ tfidf_notes_at          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_based       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_be          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_been        <dbl> 0.00000000, 0.00000000, 0.07447320, 0.05085975…
## $ tfidf_notes_billion     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business    <dbl> 0.08031956, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_but         <dbl> 0.00000000, 0.08860053, 0.00000000, 0.00000000…
## $ tfidf_notes_by          <dbl> 0.05067565, 0.05941283, 0.00000000, 0.04202371…
## $ tfidf_notes_ceo         <dbl> 0.03545272, 0.04156526, 0.04304974, 0.00000000…
## $ tfidf_notes_chairman    <dbl> 0.06965640, 0.04083306, 0.00000000, 0.00000000…
## $ tfidf_notes_chief       <dbl> 0.00000000, 0.00000000, 0.04263608, 0.00000000…
## $ tfidf_notes_co          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_company     <dbl> 0.00000000, 0.03647601, 0.00000000, 0.02580010…
## $ `tfidf_notes_company's` <dbl> 0.06942831, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corp        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corporation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_december    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_director    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_directors   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_down        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_effective   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_executive   <dbl> 0.00000000, 0.00000000, 0.04022907, 0.00000000…
## $ tfidf_notes_financial   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06631612…
## $ tfidf_notes_following   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_for         <dbl> 0.00000000, 0.00000000, 0.04822856, 0.03293658…
## $ tfidf_notes_from        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03301992…
## $ tfidf_notes_group       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_had         <dbl> 0.00000000, 0.00000000, 0.08036618, 0.05488422…
## $ tfidf_notes_has         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03796486…
## $ tfidf_notes_have        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06818303…
## $ tfidf_notes_he          <dbl> 0.00000000, 0.00000000, 0.04624420, 0.03158140…
## $ tfidf_notes_his         <dbl> 0.08277418, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_in          <dbl> 0.11708268, 0.03431734, 0.03554296, 0.04854648…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.05311718, 0.00000000, 0.00000000…
## $ tfidf_notes_into        <dbl> 0.07604371, 0.00000000, 0.00000000, 0.06306063…
## $ tfidf_notes_is          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_it          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04617391…
## $ tfidf_notes_its         <dbl> 0.00000000, 0.06050891, 0.00000000, 0.00000000…
## $ tfidf_notes_january     <dbl> 0.00000000, 0.09103061, 0.00000000, 0.00000000…
## $ tfidf_notes_july        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_june        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_march       <dbl> 0.00000000, 0.00000000, 0.09938214, 0.00000000…
## $ tfidf_notes_may         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06209416…
## $ tfidf_notes_member      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_merger      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_million     <dbl> 0.00000000, 0.00000000, 0.17592239, 0.00000000…
## $ tfidf_notes_mr          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.07469250…
## $ tfidf_notes_named       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_new         <dbl> 0.00000000, 0.07618033, 0.00000000, 0.00000000…
## $ tfidf_notes_not         <dbl> 0.07376446, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_of          <dbl> 0.02379990, 0.00000000, 0.08669962, 0.03947300…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.00000000, 0.04681721, 0.00000000…
## $ tfidf_notes_on          <dbl> 0.00000000, 0.04588521, 0.09504794, 0.00000000…
## $ tfidf_notes_operating   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_over        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_position    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.04606270, 0.09541558, 0.00000000…
## $ tfidf_notes_resigned    <dbl> 0.00000000, 0.00000000, 0.07945938, 0.00000000…
## $ tfidf_notes_retire      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retired     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retirement  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_role        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_said        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_served      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_share       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_since       <dbl> 0.0000000, 0.0000000, 0.1481641, 0.0000000, 0.…
## $ tfidf_notes_stepped     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_stock       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_that        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03332022…
## $ tfidf_notes_the         <dbl> 0.04504186, 0.02640385, 0.02734685, 0.11205537…
## $ tfidf_notes_this        <dbl> 0.06762394, 0.00000000, 0.00000000, 0.05607839…
## $ tfidf_notes_time        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_to          <dbl> 0.02737796, 0.06419659, 0.00000000, 0.04540734…
## $ tfidf_notes_today       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_until       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_vice        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_was         <dbl> 0.03507000, 0.00000000, 0.00000000, 0.02908244…
## $ tfidf_notes_when        <dbl> 0.07376446, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_which       <dbl> 0.00000000, 0.00000000, 0.08211479, 0.11215678…
## $ tfidf_notes_who         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_will        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_with        <dbl> 0.04351577, 0.00000000, 0.05284058, 0.00000000…
## $ tfidf_notes_would       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04966110…
## $ tfidf_notes_years       <dbl> 0.00000000, 0.06882007, 0.00000000, 0.00000000…
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…

Specify Model

xgboost_spec <- 
  boost_tree(trees = tune(),
             tree_depth = tune(),
             min_n = tune(),
             learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec)

Tune hyperparameters

doParallel::registerDoParallel()

set.seed(65743)

xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 5, 
            control = control_grid(save_pred = TRUE))

## Warning: package 'xgboost' was built under R version 4.3.3

Model Evaluation

Identify Optimal Values for Hyperparameters

collect_metrics(xgboost_tune)

## # A tibble: 15 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1   885     4          1    0.0670  accuracy    binary     0.804    10 0.00373
##  2   885     4          1    0.0670  brier_class binary     0.135    10 0.00208
##  3   885     4          1    0.0670  roc_auc     binary     0.828    10 0.00507
##  4   541    16          9    0.0266  accuracy    binary     0.830    10 0.00333
##  5   541    16          9    0.0266  brier_class binary     0.119    10 0.00178
##  6   541    16          9    0.0266  roc_auc     binary     0.852    10 0.00403
##  7   325    18         10    0.00276 accuracy    binary     0.767    10 0.00403
##  8   325    18         10    0.00276 brier_class binary     0.172    10 0.00126
##  9   325    18         10    0.00276 roc_auc     binary     0.776    10 0.00759
## 10  1754    32         13    0.00495 accuracy    binary     0.825    10 0.00227
## 11  1754    32         13    0.00495 brier_class binary     0.121    10 0.00168
## 12  1754    32         13    0.00495 roc_auc     binary     0.850    10 0.00364
## 13  1312    38          7    0.141   accuracy    binary     0.830    10 0.00375
## 14  1312    38          7    0.141   brier_class binary     0.126    10 0.00254
## 15  1312    38          7    0.141   roc_auc     binary     0.843    10 0.00571
## # ℹ 1 more variable: .config <chr>

collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(ceo_dismissal, .pred_dismissed) %>%
    autoplot()

Fit the Model for the Last Time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.838 Preprocessor1_Model1
## 2 roc_auc     binary         0.860 Preprocessor1_Model1
## 3 brier_class binary         0.117 Preprocessor1_Model1

collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
    autoplot()

Variable Importance

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Apply it to Your Data 7: CEO Departures

Job Boonstoppel

2024-10-31

Import Data

Explore Data

Issues with the data:

Missing Values:

Factors or Numeric Variables:

Zero Variance Variables:

Character Variables:

Unbalanced Target Variable:

ID Variable:

Clean Data

Explore Data

Model Building

Split Data

Preprocess data

Specify Model

Tune hyperparameters

Model Evaluation

Identify Optimal Values for Hyperparameters

Fit the Model for the Last Time

Variable Importance