departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
skimr::skim(departures)
Data summary
Name departures
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627
# Select relevant columns
factors_vec <- departures %>% 
    select(departure_code, co_per_rol, fyear, tenure_no_ceodb, max_tenure_ceodb, fyear_gone) %>% 
    names()

library(dplyr)
library(lubridate)

data_clean <- departures %>% 
    select(-c(interim_coceo, still_there, eight_ks, gvkey, co_per_rol, cik, fyear, '_merge', notes, sources)) %>%
    filter(fyear_gone != "2997") %>%
    filter(!is.na(ceo_dismissal)) %>%
    mutate(
        departure_code = factor(departure_code),
        tenure_no_ceodb = factor(tenure_no_ceodb),
        max_tenure_ceodb = factor(max_tenure_ceodb),
        ceo_dismissal = factor(ceo_dismissal),
        leftofc = as.Date(leftofc),  # Ensure leftofc is a Date
        year = year(leftofc),         # Create year directly
        doy = yday(leftofc),          # Create day of the year directly
        month = month(leftofc)        # Create month directly
    ) %>%
    select(-leftofc) %>%            # Remove leftofc as it's no longer needed
    # Drop zero-variance variables
    select(-c(tenure_no_ceodb, max_tenure_ceodb)) %>%
    # Ensure ceo_dismissal is character, then recode
    mutate(ceo_dismissal = if_else(ceo_dismissal == "1", "dismissed",
                             if_else(ceo_dismissal == "0", "not dismissed",
                             as.character(ceo_dismissal))))  # Handle NA implicitly

 data_clean <- data_clean %>% sample_n(100) 

Explore data

skimr::skim(departures)
Data summary
Name departures
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627
data_clean %>% count(ceo_dismissal)
## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <chr>         <int>
## 1 dismissed        20
## 2 not dismissed    80
data_clean %>%
    ggplot(aes(ceo_dismissal)) +
    geom_bar()

ceo_dismissal vs. max tenure

#data_clean %>%
    #ggplot(aes(max_tenure_ceodb)) +
    #geom_boxplot()
# Doesn't represent the data well in my case

correlation plot

# Step 1: binarize
data_binarized <- data_clean %>%
    select(-exec_fullname, -coname) %>%
    binarize()

data_binarized %>% glimpse
## Rows: 100
## Columns: 28
## $ `dismissal_dataset_id__-Inf_3067.5` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ dismissal_dataset_id__3067.5_4914   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0…
## $ dismissal_dataset_id__4914_7066     <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ dismissal_dataset_id__7066_Inf      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1…
## $ departure_code__1                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code__3                   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ departure_code__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code__5                   <dbl> 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0…
## $ departure_code__6                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ departure_code__7                   <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ ceo_dismissal__dismissed            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ ceo_dismissal__not_dismissed        <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1…
## $ `fyear_gone__-Inf_2001`             <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ fyear_gone__2001_2006               <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ fyear_gone__2006_2013.25            <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ fyear_gone__2013.25_Inf             <dbl> 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ `year__-Inf_2000.75`                <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ year__2000.75_2006                  <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ year__2006_2013.25                  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ year__2013.25_Inf                   <dbl> 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ `doy__-Inf_61.5`                    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ doy__61.5_151                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1…
## $ doy__151_263.75                     <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ doy__263.75_Inf                     <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ `month__-Inf_3`                     <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1…
## $ month__3_5                          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0…
## $ month__5_9                          <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ month__9_Inf                        <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
# Step 2: correlation
data_correlation <- data_binarized %>%
    correlate(ceo_dismissal__dismissed)

data_correlation
## # A tibble: 28 × 3
##    feature        bin           correlation
##    <fct>          <chr>               <dbl>
##  1 ceo_dismissal  dismissed           1    
##  2 ceo_dismissal  not_dismissed      -1    
##  3 departure_code 3                   0.873
##  4 departure_code 5                  -0.480
##  5 departure_code 4                   0.408
##  6 departure_code 7                  -0.312
##  7 fyear_gone     2006_2013.25        0.246
##  8 year           2006_2013.25        0.246
##  9 fyear_gone     -Inf_2001          -0.200
## 10 year           -Inf_2000.75       -0.173
## # ℹ 18 more rows
# Step 3: plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()

There is a moderate correlation between departure codes and ceo dismissals so some departures codes are more indicative of ceo dismissals than others.

Model Building

Split Data

library(dplyr)
library(rsample)

set.seed(1234) 
#data_clean <- data_clean %>% sample_n(100)  
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
data_train <- data_train %>%
  mutate(unique_id = paste(dismissal_dataset_id, exec_fullname, year, sep = "_")) %>%
  group_by(unique_id) %>%
  summarize(across(everything(), first))

Preprocess Data

library(themis)
library(recipes)

# Remove unique_id from data_train before creating the recipe
data_train_cleaned <- data_train %>%
  select(-unique_id, -departure_code)

# Create the recipe using the cleaned dataset
xgboost_rec <- recipe(ceo_dismissal ~ ., data = data_train_cleaned) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes()) %>%
  step_smote(ceo_dismissal)

# Prepare and check the recipe
xgboost_rec_prep <- xgboost_rec %>% prep()
data_prepped <- xgboost_rec_prep %>% juice() %>% glimpse()
## Rows: 120
## Columns: 152
## $ dismissal_dataset_id                    <dbl> 1120, 12, 1666, 1687, 194, 202…
## $ fyear_gone                              <dbl> 2017, 1998, 2005, 2017, 2000, …
## $ year                                    <dbl> 2017, 1998, 2005, 2017, 2000, …
## $ doy                                     <dbl> 243, 140, 149, 352, 122, 278, …
## $ month                                   <dbl> 8, 5, 5, 12, 5, 10, 1, 2, 10, …
## $ coname_ADV.NEUROMODULATION.SYS.INC      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AHMANSON..H.F....CO              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AIRGAS.INC                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ALBERTO.CULVER.CO                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AMERICAN.AIRLINES.GROUP.INC      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AMGEN.INC                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ coname_ANGIODYNAMICS.INC                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_APOGENT.TECHNOLOGIES.INC         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AVAYA.INC                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BEAR.STEARNS.COMPANIES.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BERKLEY..W.R..CORP               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BJ.S.RESTAURANTS.INC             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BOSTON.PROPERTIES.INC            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_C.COR.INC                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHRISTOPHER...BANKS.CORP         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CME.GROUP.INC                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CNX.RESOURCES.CORPORATION        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COOPER.COMPANIES.INC             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CORAM.HEALTHCARE.CORP            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CORVEL.CORP                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CRACKER.BARREL.OLD.CTRY.STOR     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DEKALB.GENETICS.CORP..CL.B       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DOWDUPONT.INC                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_EARTHLINK.HOLDINGS.CORP          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_EDISON.INTERNATIONAL             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ENERGY.FUTURE.HOLDINGS.CORP      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENWORTH.FINANCIAL.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GOTTSCHALKS.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GRANITE.CONSTRUCTION.INC         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GREEN.PLAINS.INC                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HP.INC                           <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ICU.MEDICAL.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_INFOSEEK.CORP                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_INGRAM.MICRO.INC                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_INTERSIL.CORP..CL.A              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_IXIA                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KBR.INC                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KEMET.CORP                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KIMCO.REALTY.CORP                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KULICKE...SOFFA.INDUSTRIES       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ coname_LILLIAN.VERNON.CORP              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LOWE.S.COMPANIES.INC             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ coname_MATTEL.INC                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ coname_MERRILL.LYNCH...CO.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ coname_NORTEL.NETWORKS.CORP             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OLIN.CORP                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OREGON.STEEL.MILLS.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OVERSEAS.SHIPHOLDING.GROUP       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PENNEY..J.C..CO                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PEOPLESOFT.INC                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PERFORMANCE.FOOD.GROUP.CO        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PINNACLE.ENTERTAINMENT.INC       <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ coname_QUANTUM.HEALTH.RESOURCES.INC     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RADIAN.GROUP.INC                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ROCKWELL.COLLINS                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SENSORMATIC.ELECTRONICS          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SHUTTERFLY.INC                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SILICON.LABORATORIES.INC         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SM.ENERGY.CO                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SPRINT.PCS.GROUP                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STANCORP.FINANCIAL.GROUP.INC     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STEEL.EXCEL.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STONE.ENERGY.CORP                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SUNDSTRAND.CORP                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SUPER.FOOD.SERVICES.INC          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SYMANTEC.CORP                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TALK.AMERICA.HOLDINGS.INC        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TBC.CORP                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TELE.COMM.TCI.GROUP..SER.A       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_THIRD.POINT.REINSURANCE.LTD      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ULTRATECH.INC                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WINN.DIXIE.STORES.INC            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WOLVERINE.WORLD.WIDE             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Alan.C..Greenberg         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Andrew.N..Liveris.A.O..AO <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Anthony.J..Best           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Anthony.M..Sanfilippo     <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Arthur.W..Zafiropoulo     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Barry.A..Ellsworth        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bruce.P..Bickner          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bruno.Guilmart            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ exec_fullname_Charles.R..Rinehart       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Christopher.G..Chavez     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Christopher.North         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Clayton.M..Jones          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Craig.A..Conway           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dan.W..Evins              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Daniel.D..Crowley         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.B..Henry            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.E..Maguire          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.H..Watts            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.H..Welch            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Don.R..O.Hare             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.K..Peterson        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Douglas.H..Stickney       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Eamonn.P..Hobbs           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Edward.H..Linde           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Enrique.T..Salem          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ernest.Stanley.O.Neal     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ exec_fullname_Frank.Lazaran             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Frank.P..Filipps          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gabriel.Battista          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gary.D..Forsee            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Geoffrey.B..Bloom         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_George.A..Lopez           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gerald.W..Deitchle        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gordon.M..Binder          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ exec_fullname_J..Brett.Harvey           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jack.Lawrence.Howard      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jack.Twyman               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jerry.S..Farrington       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jill.E..Barad             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ exec_fullname_Joe.Levy                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.C..Malone..Ph.D.     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.E..Bryson            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.R..Berger            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Joseph.D..Rupp            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Joseph.E..Pennington      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Joseph.F..Eazor           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kenneth.F..Yontz          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Lillian.Vernon            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Livio.D..DeSimone         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ exec_fullname_Louis.S..DiPasqua         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Martin.Philip.Klein       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.A..Woodhouse      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.L..Molinini       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Morton.P..Hyman           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Necip.Sayiner             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Paviter.Singh.Binning     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Phupinder.S..Gill         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Richard.E..Perry          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.A..Vanourek        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.C..Sledd           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.E.L..Johnson.III   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.L..Crandall        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.L..Tillman         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ exec_fullname_Robert.Paul.Wayman        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.S..Weiss           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ronald.B..Johnson         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ronald.E..Timpe           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.B..Boklund         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Victor.Alston             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Victor.Gordon.Clemons.Sr. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Vincent.James.Marino      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.P..Utt            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.Robert.Berkley    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ceo_dismissal                           <fct> not dismissed, not dismissed, …

Specify Model

library(usemodels)
usemodels::use_xgboost(ceo_dismissal ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = ceo_dismissal ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(67594)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(workflows)
library(parsnip)

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec) 

Tune hyperparameters

library(tune)

doParallel::registerDoParallel()

set.seed(17375)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 5,
            control = control_grid(save_pred = TRUE))
## Warning: package 'xgboost' was built under R version 4.3.3

Model Evaluation

Identify Optimal Values for Hyperparameters

library(yardstick)
collect_metrics(xgboost_tune)
## # A tibble: 15 × 12
##    trees min_n tree_depth learn_rate loss_reduction sample_size .metric    
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>      
##  1   677     7         14    0.00195       5.18e- 4       0.477 accuracy   
##  2   677     7         14    0.00195       5.18e- 4       0.477 brier_class
##  3   677     7         14    0.00195       5.18e- 4       0.477 roc_auc    
##  4  1016    11          4    0.00394       3.13e-10       0.723 accuracy   
##  5  1016    11          4    0.00394       3.13e-10       0.723 brier_class
##  6  1016    11          4    0.00394       3.13e-10       0.723 roc_auc    
##  7  1626    19          8    0.0202        2.06e- 7       0.257 accuracy   
##  8  1626    19          8    0.0202        2.06e- 7       0.257 brier_class
##  9  1626    19          8    0.0202        2.06e- 7       0.257 roc_auc    
## 10  1483    30          5    0.0873        5.18e- 3       0.401 accuracy   
## 11  1483    30          5    0.0873        5.18e- 3       0.401 brier_class
## 12  1483    30          5    0.0873        5.18e- 3       0.401 roc_auc    
## 13   111    39         12    0.238         4.91e- 1       0.850 accuracy   
## 14   111    39         12    0.238         4.91e- 1       0.850 brier_class
## 15   111    39         12    0.238         4.91e- 1       0.850 roc_auc    
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(ceo_dismissal, .pred_dismissed) %>%
    autoplot()

Fit the Model for the Last Time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.48  Preprocessor1_Model1
## 2 roc_auc     binary         0.585 Preprocessor1_Model1
## 3 brier_class binary         0.250 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(ceo_dismissal, .pred_class)
##                Truth
## Prediction      dismissed not dismissed
##   dismissed             3            11
##   not dismissed         2             9

Variable Importance

library(vip)

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()