Import data

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore Data

skimr::skim(departures)
Data summary
Name departures
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627

Issues with the data

Missing Values *interim_coceo, still_there, notes (string variable), sources, eight_ks, departure_code, ceo_dismissal, fyear_gone, cik

Converting Numeric Variables to Factors * tenure_no_ceodb, max_tenure_ceodb, fyear_gone

Zero variance variables * _merge, gvkey, dismissal_dataset_id

Character Variables * coname, exec_fullname, sources, eight_ks, notes (string variable and not factor)

Unbalanced Target Variable ceo_dismissal #adress in recipe section step_smoke function

Handling ID Variables * dismissal_dataset_id #dataset primary key

Explore data

# Clean data
departures_clean <- departures %>%
    
    # Clean the target variable
    filter(!is.na(ceo_dismissal)) %>%
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not_dis")) %>%
    mutate(ceo_dismissal = as.factor(ceo_dismissal)) %>%
    
    # Remove variables with too many missing values
    select(-c(interim_coceo, still_there, eight_ks))%>%
    
    # Remove irrelevant variables
    select(-`_merge`, -sources) %>%
               
    # Remove variables with info that only becomes 
    select(-departure_code) %>%
    
    # Remove redundant variables 
    select(-c(gvkey, cik, co_per_rol)) %>% #need leftofc as date variable later
    
    #Remove duplicated in dismissal_dataset_id our id variable
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
    
    #Remove 2997 in fyear_gone
    filter(fyear_gone < 2025) %>%
    
    # Convert factors that are incorrectly imported as numeric variables
    mutate(across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor)) %>%
    mutate(across(where(is.character), as.factor)) %>%
    
    mutate(notes = as.character(notes))
    
skimr::skim(departures_clean) 
Data summary
Name departures_clean
Number of rows 7475
Number of columns 10
_______________________
Column type frequency:
character 1
factor 6
numeric 2
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
notes 17 1 5 3117 0 7448 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
coname 0 1 FALSE 3427 BAR: 8, CLA: 8, FED: 8, GRE: 8
exec_fullname 0 1 FALSE 6975 Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal 0 1 FALSE 2 not: 5992, dis: 1483
tenure_no_ceodb 0 1 FALSE 3 1: 7289, 2: 179, 3: 7
max_tenure_ceodb 0 1 FALSE 4 1: 7138, 2: 319, 3: 15, 4: 3
fyear_gone 0 1 FALSE 34 200: 379, 199: 351, 200: 334, 200: 321

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1 5570.32 25757.33 1 2175.5 4326 6579.5 559044 ▇▁▁▁▁
fyear 0 1 2005.61 7.45 1987 1999.0 2006 2012.0 2020 ▁▇▆▇▆

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 0 1 1981-01-01 2021-12-01 2006-11-15 3576

Explore data

departures_clean %>% count(ceo_dismissal)
## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <fct>         <int>
## 1 dismissed      1483
## 2 not_dis        5992
departures_clean %>%
    ggplot(aes(fyear)) + 
    geom_bar() +
    labs(title = "CEO Dismissal Count", x = "CEO Dismissal", y = "Count")

CEO Dismissal or Tenure

departures_clean %>%
  count(ceo_dismissal, tenure_no_ceodb) %>%
  ggplot(mapping = aes(x = ceo_dismissal, y = tenure_no_ceodb)) +
  geom_tile(mapping = aes(fill = n)) +
  labs(title = "CEO Dismissal or Tenure", x = "CEO Dismissal", y = "CEO Tenure")

Correlation Plot

 departures_clean <- departures_clean 

# Step 1: Binarize the data
data_binarized <- departures_clean %>%
    select(-notes, -dismissal_dataset_id, -leftofc) %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 7,475
## Columns: 44
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `fyear__-Inf_1999`          <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ fyear__1999_2006            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear__2006_2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear__2012_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlation
data_correlation_dismissed <- data_binarized %>%
    correlate(ceo_dismissal__dismissed)

data_correlation_notdis <- data_binarized %>%
    correlate(ceo_dismissal__not_dis)

data_correlation_dismissed
## # A tibble: 44 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 ceo_dismissal    dismissed      1     
##  2 ceo_dismissal    not_dis       -1     
##  3 fyear            -Inf_1999     -0.0774
##  4 max_tenure_ceodb 1              0.0580
##  5 max_tenure_ceodb 2             -0.0536
##  6 fyear_gone       1999          -0.0391
##  7 fyear_gone       2002           0.0374
##  8 fyear            1999_2006      0.0345
##  9 fyear            2006_2012      0.0301
## 10 fyear_gone       2003           0.0296
## # ℹ 34 more rows
# Step 3: Plot
data_correlation_dismissed %>% 
    correlationfunnel::plot_correlation_funnel() +
    labs(title = "Correlation Funnel for CEO Dismissal")
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model building

Split data

library(tidymodels)

# Set seed for reproducibility
set.seed(1234)
data_clean <- departures_clean #%>% group_by(ceo_dismissal)%>% sample_n(100) %>% ungroup

# Split the data into training and testing sets
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

# Create cross-validation sets for the training data
data_cv <- vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5044/562]> Fold01
##  2 <split [5044/562]> Fold02
##  3 <split [5045/561]> Fold03
##  4 <split [5045/561]> Fold04
##  5 <split [5046/560]> Fold05
##  6 <split [5046/560]> Fold06
##  7 <split [5046/560]> Fold07
##  8 <split [5046/560]> Fold08
##  9 <split [5046/560]> Fold09
## 10 <split [5046/560]> Fold10

Preprocess data

#departures_clean <- departures_clean %>%
#mutate(leftofc = as.Date(leftofc, format = "%Y-%m-%d"))
    
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
    update_role(dismissal_dataset_id, new_role = "ID")  %>%
    step_other(coname, exec_fullname, threshold = 0.05) %>%
    step_tokenize(notes) %>%
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes)  %>%
    step_date(leftofc, features = c("year", "month", "doy"), keep_original_cols = FALSE) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_smote(ceo_dismissal)

xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 8,988
## Columns: 156
## $ dismissal_dataset_id    <dbl> 84, 85, 119, 162, 243, 244, 263, 280, 300, 346…
## $ fyear                   <dbl> 1993, 1998, 1995, 2004, 1993, 1995, 1993, 2002…
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_1998        <dbl> 0.0000000, 0.2084846, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2000        <dbl> 0.00000000, 0.10688168, 0.00000000, 0.07125445…
## $ tfidf_notes_a           <dbl> 0.08640627, 0.06994794, 0.00000000, 0.06994794…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_after       <dbl> 0.05080277, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_agreement   <dbl> 0.07913465, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_also        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_an          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_and         <dbl> 0.12289881, 0.14923426, 0.08954056, 0.07959161…
## $ tfidf_notes_announced   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_april       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_as          <dbl> 0.05605974, 0.06807255, 0.00000000, 0.02269085…
## $ tfidf_notes_at          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_based       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_be          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_been        <dbl> 0.00000000, 0.00000000, 0.07477204, 0.04984803…
## $ tfidf_notes_billion     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business    <dbl> 0.07880085, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_but         <dbl> 0.00000000, 0.09133601, 0.00000000, 0.00000000…
## $ tfidf_notes_by          <dbl> 0.05068340, 0.06154413, 0.00000000, 0.04102942…
## $ tfidf_notes_ceo         <dbl> 0.03545019, 0.04304666, 0.04304666, 0.00000000…
## $ tfidf_notes_chairman    <dbl> 0.06958794, 0.04224982, 0.00000000, 0.00000000…
## $ tfidf_notes_chief       <dbl> 0.00000000, 0.00000000, 0.04244088, 0.00000000…
## $ tfidf_notes_co          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_company     <dbl> 0.00000000, 0.03753689, 0.00000000, 0.02502459…
## $ `tfidf_notes_company's` <dbl> 0.06893837, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corp        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corporation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_december    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_director    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_directors   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_down        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_effective   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_executive   <dbl> 0.00000000, 0.00000000, 0.04024952, 0.00000000…
## $ tfidf_notes_financial   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06587907…
## $ tfidf_notes_following   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_for         <dbl> 0.00000000, 0.00000000, 0.04797170, 0.03198113…
## $ tfidf_notes_from        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03216000…
## $ tfidf_notes_group       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_had         <dbl> 0.00000000, 0.00000000, 0.07994803, 0.05329868…
## $ tfidf_notes_has         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03766112…
## $ tfidf_notes_have        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06773914…
## $ tfidf_notes_he          <dbl> 0.00000000, 0.00000000, 0.04622153, 0.03081435…
## $ tfidf_notes_his         <dbl> 0.08380476, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_in          <dbl> 0.11624220, 0.03528781, 0.03528781, 0.04705041…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.05536558, 0.00000000, 0.00000000…
## $ tfidf_notes_into        <dbl> 0.07539130, 0.00000000, 0.00000000, 0.06103106…
## $ tfidf_notes_is          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_it          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04521652…
## $ tfidf_notes_its         <dbl> 0.00000000, 0.06183488, 0.00000000, 0.00000000…
## $ tfidf_notes_january     <dbl> 0.00000000, 0.09481317, 0.00000000, 0.00000000…
## $ tfidf_notes_july        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_june        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_march       <dbl> 0.0000000, 0.0000000, 0.0998133, 0.0000000, 0.…
## $ tfidf_notes_may         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06038407…
## $ tfidf_notes_merger      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_million     <dbl> 0.00000000, 0.00000000, 0.17517900, 0.00000000…
## $ tfidf_notes_mr          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.07298742…
## $ tfidf_notes_named       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_new         <dbl> 0.00000000, 0.07841573, 0.00000000, 0.00000000…
## $ tfidf_notes_not         <dbl> 0.07338546, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_of          <dbl> 0.02374139, 0.00000000, 0.08648649, 0.03843844…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.00000000, 0.04670480, 0.00000000…
## $ tfidf_notes_on          <dbl> 0.00000000, 0.04741923, 0.09483847, 0.00000000…
## $ tfidf_notes_operating   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_or          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_over        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_position    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.04747122, 0.09494245, 0.00000000…
## $ tfidf_notes_resigned    <dbl> 0.00000000, 0.00000000, 0.08023561, 0.00000000…
## $ tfidf_notes_retire      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retired     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retirement  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_role        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_said        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_served      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_share       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_since       <dbl> 0.0000000, 0.0000000, 0.1489905, 0.0000000, 0.…
## $ tfidf_notes_stepped     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_stock       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_that        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03241431…
## $ tfidf_notes_the         <dbl> 0.04501321, 0.02732945, 0.02732945, 0.10931780…
## $ tfidf_notes_this        <dbl> 0.06810383, 0.00000000, 0.00000000, 0.05513167…
## $ tfidf_notes_time        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_to          <dbl> 0.02731539, 0.06633738, 0.00000000, 0.04422492…
## $ tfidf_notes_today       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_until       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_vice        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_was         <dbl> 0.03493467, 0.00000000, 0.00000000, 0.02828045…
## $ tfidf_notes_when        <dbl> 0.07311918, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_which       <dbl> 0.00000000, 0.00000000, 0.08259285, 0.11012380…
## $ tfidf_notes_who         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_will        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_with        <dbl> 0.04367090, 0.00000000, 0.05302895, 0.00000000…
## $ tfidf_notes_would       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04842102…
## $ tfidf_notes_years       <dbl> 0.00000000, 0.07172717, 0.00000000, 0.00000000…
## $ leftofc_year            <dbl> 1993, 1998, 1996, 2005, 1993, 1996, 1993, 2002…
## $ leftofc_doy             <dbl> 212, 189, 74, 73, 152, 33, 181, 273, 182, 304,…
## $ coname_other            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ exec_fullname_other     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993        <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X1994        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ fyear_gone_X1995        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear_gone_X1996        <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ fyear_gone_X1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ fyear_gone_X1998        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2001        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2002        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X2004        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2019        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Feb       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Mar       <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Apr       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_May       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Jun       <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Jul       <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ leftofc_month_Aug       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Sep       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ leftofc_month_Oct       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1…
## $ leftofc_month_Nov       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Dec       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…

Specify model

xgboost_spec <- 
  boost_tree(trees = tune(), tree_depth = tune(), min_n = tune(),   learn_rate = tune()) %>%  
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec) 

Tune Hyperparameters

doParallel::registerDoParallel()

set.seed(65743)
xgboost_tune <- 
    tune_grid(xgboost_workflow,
              resamples = data_cv,
              grid = 5,
              control = control_grid(save_pred = TRUE))

Model Evaluation

Identify Optimal Values for Hyperparameters

collect_metrics(xgboost_tune)
## # A tibble: 15 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1   885     4          1    0.0670  accuracy    binary     0.811    10 0.00591
##  2   885     4          1    0.0670  brier_class binary     0.133    10 0.00307
##  3   885     4          1    0.0670  roc_auc     binary     0.829    10 0.00883
##  4   541    16          9    0.0266  accuracy    binary     0.836    10 0.00375
##  5   541    16          9    0.0266  brier_class binary     0.117    10 0.00284
##  6   541    16          9    0.0266  roc_auc     binary     0.855    10 0.00799
##  7   325    18         10    0.00276 accuracy    binary     0.780    10 0.00633
##  8   325    18         10    0.00276 brier_class binary     0.170    10 0.00191
##  9   325    18         10    0.00276 roc_auc     binary     0.786    10 0.0112 
## 10  1754    32         13    0.00495 accuracy    binary     0.829    10 0.00379
## 11  1754    32         13    0.00495 brier_class binary     0.121    10 0.00279
## 12  1754    32         13    0.00495 roc_auc     binary     0.848    10 0.00766
## 13  1312    38          7    0.141   accuracy    binary     0.826    10 0.00425
## 14  1312    38          7    0.141   brier_class binary     0.129    10 0.00343
## 15  1312    38          7    0.141   roc_auc     binary     0.832    10 0.00847
## # ℹ 1 more variable: .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(ceo_dismissal, .pred_dismissed) %>%
    autoplot()

Fit the Model for the Last Time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.837 Preprocessor1_Model1
## 2 roc_auc     binary         0.853 Preprocessor1_Model1
## 3 brier_class binary         0.120 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
    autoplot()

Variable Importance

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Conclusion

The previous model had accuracy of 0.838 and AUC of 0.856

*Feature slection: PCA didn’t make an improvement.