# Download the TidyTuesday (2021-04-27) CEO departures table from GitHub.
departures <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv"
)
# Print a per-column summary of the raw table (types, missingness, spread).
skimr::skim(departures)
Data summary
Name departures
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627
# Collect the names of a hand-picked subset of columns from the raw table.
factors_vec <- names(
  select(
    departures,
    departure_code, co_per_rol, fyear, tenure_no_ceodb, max_tenure_ceodb,
    fyear_gone
  )
)

library(dplyr)
library(lubridate)

# Build the modelling table from the raw departures data.
data_clean <- departures %>%
  # Drop the columns that are not used downstream. The two tenure counters are
  # removed here as well, so they are no longer converted to factors first.
  select(
    -c(
      interim_coceo, still_there, eight_ks, gvkey, co_per_rol, cik, fyear,
      "_merge", notes, sources, tenure_no_ceodb, max_tenure_ceodb
    )
  ) %>%
  # fyear_gone is numeric, so compare it with a number rather than a string.
  # The comparison yields NA for a missing fyear_gone, so those rows are
  # dropped too; rows without a dismissal outcome are removed explicitly.
  filter(fyear_gone != 2997, !is.na(ceo_dismissal)) %>%
  mutate(
    departure_code = factor(departure_code),
    # Label the outcome as text. Any value other than 0/1 keeps its own
    # text form instead of being silently turned into NA.
    ceo_dismissal = case_when(
      ceo_dismissal == 1 ~ "dismissed",
      ceo_dismissal == 0 ~ "not dismissed",
      TRUE ~ as.character(ceo_dismissal)
    ),
    # Calendar features derived from the departure date.
    leftofc = as.Date(leftofc),
    year = lubridate::year(leftofc),
    doy = lubridate::yday(leftofc),
    month = lubridate::month(leftofc)
  ) %>%
  # The raw date is no longer needed once the features exist.
  select(-leftofc)

# Keep a 100-row random subsample. Seeding first makes the subsample, and
# everything computed from it, reproducible between runs.
set.seed(1234)
data_clean <- data_clean %>% slice_sample(n = 100)

Explore data

# Print the per-column summary of the raw departures table again for the
# exploration section (same call as right after loading the data).
skimr::skim(departures)
Data summary
Name departures
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627
# Number of sampled rows in each outcome class.
count(data_clean, ceo_dismissal)
## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <chr>         <int>
## 1 dismissed        25
## 2 not dismissed    75
# Bar chart of the number of rows in each outcome class.
ggplot(data_clean, aes(x = ceo_dismissal)) +
  geom_bar()

ceo_dismissal vs. max tenure

#data_clean %>%
    #ggplot(aes(max_tenure_ceodb)) +
    #geom_boxplot()
# Doesn't represent the data well in my case

correlation plot

# Step 1: binarize
# Leave out the company-name and executive-name columns, then turn every
# remaining column into 0/1 indicator columns.
data_binarized <- binarize(select(data_clean, -c(exec_fullname, coname)))

glimpse(data_binarized)
## Rows: 100
## Columns: 29
## $ `dismissal_dataset_id__-Inf_1616.75` <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, …
## $ dismissal_dataset_id__1616.75_3793.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ dismissal_dataset_id__3793.5_5810.75 <dbl> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ dismissal_dataset_id__5810.75_Inf    <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ departure_code__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__2                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ departure_code__3                    <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ departure_code__4                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__5                    <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ departure_code__6                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__7                    <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ ceo_dismissal__dismissed             <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ ceo_dismissal__not_dismissed         <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ `fyear_gone__-Inf_1999`              <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, …
## $ fyear_gone__1999_2004                <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ fyear_gone__2004_2010                <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010_Inf                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_1999`                    <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, …
## $ year__1999_2004                      <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, …
## $ year__2004_2010                      <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_Inf                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `doy__-Inf_94`                       <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ doy__94_167.5                        <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ doy__167.5_274.5                     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ doy__274.5_Inf                       <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ `month__-Inf_4`                      <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ month__4_6                           <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ month__6_9.25                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ month__9.25_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
# Step 2: correlation
# Correlate every indicator column with the "dismissed" indicator.
data_correlation <- correlate(data_binarized, ceo_dismissal__dismissed)

data_correlation
## # A tibble: 29 × 3
##    feature              bin            correlation
##    <fct>                <chr>                <dbl>
##  1 ceo_dismissal        dismissed            1    
##  2 ceo_dismissal        not_dismissed       -1    
##  3 departure_code       3                    0.947
##  4 departure_code       5                   -0.522
##  5 departure_code       7                   -0.307
##  6 dismissal_dataset_id 5810.75_Inf         -0.173
##  7 month                4_6                 -0.173
##  8 dismissal_dataset_id -Inf_1616.75         0.147
##  9 dismissal_dataset_id 3793.5_5810.75       0.147
## 10 doy                  274.5_Inf            0.147
## # ℹ 19 more rows
# Step 3: plot
# Draw the correlation funnel for the correlations computed in step 2.
correlationfunnel::plot_correlation_funnel(data_correlation)

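Departure codes are correlated with CEO dismissal, and the strength varies by code: code 3 correlates strongly with dismissal (0.947), while codes 5 and 7 correlate negatively (-0.522 and -0.307). Some departure codes are therefore much more indicative of a CEO dismissal than others.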

Model Building

Split Data

library(dplyr)
library(rsample)

# Fixed seed so the split and the folds are reproducible.
set.seed(1234)

# Train/test split stratified on the outcome.
data_split <- data_clean %>% initial_split(strata = ceo_dismissal)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Cross-validation folds of the training set, also stratified on the outcome.
data_cv <- data_train %>% rsample::vfold_cv(strata = ceo_dismissal)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [66/8]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [66/8]> Fold04
##  5 <split [66/8]> Fold05
##  6 <split [66/8]> Fold06
##  7 <split [67/7]> Fold07
##  8 <split [67/7]> Fold08
##  9 <split [68/6]> Fold09
## 10 <split [68/6]> Fold10
# Collapse the training rows to one record per (dataset id, executive, year)
# key, keeping the first value of every other column within each key.
# NOTE(review): data_split and data_cv were created before this step, so they
# still hold the un-collapsed rows - confirm that is intended.
data_train <- data_train %>%
  group_by(
    unique_id = paste(dismissal_dataset_id, exec_fullname, year, sep = "_")
  ) %>%
  summarise(across(everything(), ~ first(.x)), .groups = "drop")

Preprocess Data

library(themis)
library(recipes)

# Drop the helper key and departure_code so neither becomes a predictor.
data_train_cleaned <- data_train %>%
  select(-c(unique_id, departure_code))

# Preprocessing recipe: dummy-encode the nominal predictors, centre and scale
# the numeric predictors, then oversample the minority outcome class with
# SMOTE. all_nominal_predictors() never selects the outcome, so it needs no
# explicit exclusion.
# NOTE(review): coname and exec_fullname are still nominal predictors here,
# so step_dummy() expands them into one indicator column per company and per
# executive - consider giving them a non-predictor role instead.
xgboost_rec <- recipe(ceo_dismissal ~ ., data = data_train_cleaned) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(ceo_dismissal)

# Estimate the recipe and inspect the processed training set.
# bake(new_data = NULL) is the current replacement for the superseded juice().
xgboost_rec_prep <- prep(xgboost_rec)
data_prepped <- xgboost_rec_prep %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 112
## Columns: 152
## $ dismissal_dataset_id                <dbl> -1.0912647, -1.0202653, -0.9921104…
## $ fyear_gone                          <dbl> -0.67737819, 1.11776731, 0.0130623…
## $ year                                <dbl> -0.67368413, 1.12156277, 0.0167954…
## $ doy                                 <dbl> 1.06546612, 1.25201408, -1.6861162…
## $ month                               <dbl> 0.9880798, 1.2805514, -1.6441648, …
## $ coname_ADVENT.SOFTWARE.INC          <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_AGILENT.TECHNOLOGIES.INC     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_ALLEGHENY.ENERGY.INC         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_ALLEGHENY.TECHNOLOGIES.INC   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_AMBAC.FINANCIAL.GROUP.INC    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_APRIA.HEALTHCARE.GROUP.INC   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_ARMCO.INC                    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_AT.T.WIRELESS.SERVICES.INC   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_BANKBOSTON.CORP              <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_BARRETT.RESOURCES.CORP       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_BAXTER.INTERNATIONAL.INC     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_BLANCH.E.W.HOLDINGS.INC      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_BMC.STOCK.HOLDINGS.INC       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_BOWNE...CO.INC               <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CAMBREX.CORP                 <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CATALINA.MARKETING.CORP      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CATALYST.HEALTH.SOLUTIONS    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CATO.CORP..CL.A              <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CBEYOND.INC                  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CERADYNE.INC                 <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CIT.GROUP.INC.OLD            <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CORUS.BANKSHARES.INC         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CPI.CORP                     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_CURATIVE.HEALTH.SERVICES.INC <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_DSP.COMMUNICATIONS.INC       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_DUN...BRADSTREET.CORP        <dbl> 8.4860776, -0.1162476, -0.1162476,…
## $ coname_EPRESENCE.INC                <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_EQUITY.OFFICE.PROPERTIES.TR  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_FEDERAL.HOME.LOAN.MORTG.CORP <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_FERRO.CORP                   <dbl> -0.1162476, 8.4860776, -0.1162476,…
## $ coname_FLOWERS.FOODS.INC            <dbl> -0.1162476, -0.1162476, 8.4860776,…
## $ coname_FULLER..H..B...CO            <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_GENERAL.DYNAMICS.CORP        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_GLOBAL.INDUSTRIAL.TECH.INC   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_GREAT.ELM.CAPITAL.GROUP.INC  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_GREEN.PLAINS.INC             <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_HEXION.INC                   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_HUTCHINSON.TECHNOLOGY.INC    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_ICONIX.BRAND.GROUP.INC       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_IKON.OFFICE.SOLUTIONS        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_INFORMATION.RESOURCES.INC    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_INTERMET.CORP                <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_INTERSTATE.BAKERIES.CORP     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_INVACARE.CORP                <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_JARDEN.CORP                  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_KEMET.CORP                   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_LEGATO.SYSTEMS.INC           <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_LUBYS.INC                    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_MANOR.CARE.INC               <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_MARSHALL.INDUSTRIES          <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_MAYTAG.CORP                  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_MBNA.CORP                    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_NASH.FINCH.CO                <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_NATURES.SUNSHINE.PRODS.INC   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_NISOURCE.INC                 <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_OFFSHORE.PIPELINES.INC       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_ORBITAL.ATK.INC              <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_OVERSEAS.SHIPHOLDING.GROUP   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_QUANTUM.HEALTH.RESOURCES.INC <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_RH                           <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_ROBBINS...MYERS.INC          <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_ROCKWELL.AUTOMATION          <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_SHERWIN.WILLIAMS.CO          <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_STATE.STREET.CORP            <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_STRATEX.NETWORKS.INC         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_SYSCO.CORP                   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_TNP.ENTERPRISES.INC          <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_TREDEGAR.CORP                <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_TRIBUNE.MEDIA.CO             <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_WELLMAN.INC                  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_WHITNEY.HOLDING.CORP         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_WILLAMETTE.INDUSTRIES        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ coname_XL.GROUP.LTD                 <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Albert.R..Gamper..Jr. <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Amos.R..McMullian     <dbl> -0.1162476, -0.1162476, 8.4860776,…
## $ exec_fullname_Barry.A..Ellsworth    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Barry.J..C..Parker    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Brian.Michael.O.Hara  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Bruce.L..Hammonds     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Charles.A..Sullivan   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Charles.D..Kissner    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Charles.H..Cotros     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Charles.T..Brumback   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Christopher.M..Connor <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Craig.O..Morrison     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Daniel.P..Howells     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Daniel.W..Duval       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_David.A..Spina        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_David.B..Wright       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_David.K..Laniak       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_David.T..Blair        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_David.W..Thompson     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_David.William.Wallis  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Don.H..Davis.Jr.      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Donald.J..Listwin     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Douglas.H..Stickney   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Dwight.R..Spurlock    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Edgar.W..Blanch.Jr.   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Edward.W..Barnholt    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Frank.C..Wade         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Gary.G..Friedman      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_George.W..Off         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Gordon.S..Marshall    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Harold.B..Finch.Jr.   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Ira.Stepanian         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_James.A..Mack         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_James.E..Lillie       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_James.F..Geiger       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_James.F..Kirsch       <dbl> -0.1162476, 8.4860776, -0.1162476,…
## $ exec_fullname_James.G..Andress      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Jeffrey.W..Green      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Jeremy.M..Jones       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Joel.P..Moskowitz     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_John.C..Hope..III     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_John.D..Gottwald      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_John.D..Zeglis        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_John.Doddridge        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_John.E..Stuart        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_John.N..Haugh         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_John.Vakoutis         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Klaus.Bergman         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Leland.C..Brendsel    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Leonard.A..Hadley     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Michele.Volpi         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Morton.P..Hyman       <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Nathan.Hod            <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Paul.A..Ormond        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Per.Olof.Loof         <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Peter.C..Alexander    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Peter.M..Caswell      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Rawles.Fulgham        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Renato.Cataldo.Jr.    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Richard.H..Koontz     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Robert.C..Skaggs.Jr.  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Robert.J..Glickman    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Robert.L..Purdum      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Samuel.Zell.J.D.      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Thomas.A..Corcoran    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Thomas.M..Duff        <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Vernon.R..Loucks.Jr.  <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_Volney.Taylor         <dbl> 8.4860776, -0.1162476, -0.1162476,…
## $ exec_fullname_Wayland.H..Cato.Jr.   <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_William.A..Anders     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_William.J..Barrett    <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_William.P..Ferry      <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ exec_fullname_William.Swindells     <dbl> -0.1162476, -0.1162476, -0.1162476…
## $ ceo_dismissal                       <fct> dismissed, dismissed, not dismisse…

Specify Model

library(usemodels)
# Print usemodels' suggested recipe / model spec / workflow / tuning template
# for this formula and data; the call only prints code to copy and adapt.
usemodels::use_xgboost(ceo_dismissal ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = ceo_dismissal ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6993)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(workflows)
library(parsnip)

# Boosted-tree classifier on the xgboost engine. Only the number of trees and
# the tree depth are marked for tuning; tuning of loss_reduction and
# sample_size stays switched off.
xgboost_spec <- boost_tree(trees = tune(), tree_depth = tune()) %>%
  set_mode(mode = "classification") %>%
  set_engine(engine = "xgboost")

# Bundle the preprocessing recipe and the model spec into one workflow.
xgboost_workflow <- workflow() %>%
  add_recipe(recipe = xgboost_rec) %>%
  add_model(spec = xgboost_spec)

Tune hyperparameters

library(tune)
library(ggplot2)
library(dials)

# Regular 5-level grid over the number of trees and the tree depth.
# NOTE(review): tree_grid is not passed to tune_grid() below, which is given
# grid = 5 instead - confirm which grid is intended.
tree_grid <- grid_regular(trees(), tree_depth(), levels = 5)

# Register a doParallel backend before tuning.
doParallel::registerDoParallel()

# Tune the workflow over the cross-validation folds and keep the
# out-of-fold predictions for later inspection.
set.seed(17375)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(
    resamples = data_cv,
    grid = 5,
    control = control_grid(save_pred = TRUE)
  )
## Warning: package 'xgboost' was built under R version 4.3.3

Model Evaluation

Identify Optimal Values for Hyperparameters

library(yardstick)
# Resampled performance metrics for every tuned candidate.
xgboost_tune %>% collect_metrics()
## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator  mean     n std_err .config          
##    <int>      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
##  1   277          3 accuracy    binary     0.576    10  0.0773 Preprocessor1_Mo…
##  2   277          3 brier_class binary     0.309    10  0.0516 Preprocessor1_Mo…
##  3   277          3 roc_auc     binary     0.647    10  0.0770 Preprocessor1_Mo…
##  4   427          4 accuracy    binary     0.590    10  0.0725 Preprocessor1_Mo…
##  5   427          4 brier_class binary     0.322    10  0.0522 Preprocessor1_Mo…
##  6   427          4 roc_auc     binary     0.638    10  0.0772 Preprocessor1_Mo…
##  7  1816          7 accuracy    binary     0.611    10  0.0522 Preprocessor1_Mo…
##  8  1816          7 brier_class binary     0.327    10  0.0472 Preprocessor1_Mo…
##  9  1816          7 roc_auc     binary     0.59     10  0.0993 Preprocessor1_Mo…
## 10  1483         11 accuracy    binary     0.611    10  0.0522 Preprocessor1_Mo…
## 11  1483         11 brier_class binary     0.310    10  0.0443 Preprocessor1_Mo…
## 12  1483         11 roc_auc     binary     0.653    10  0.0829 Preprocessor1_Mo…
## 13   910         15 accuracy    binary     0.611    10  0.0522 Preprocessor1_Mo…
## 14   910         15 brier_class binary     0.306    10  0.0428 Preprocessor1_Mo…
## 15   910         15 roc_auc     binary     0.653    10  0.0829 Preprocessor1_Mo…
# One ROC curve per resampling fold, drawn from the saved out-of-fold
# predictions.
xgboost_tune %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(truth = ceo_dismissal, .pred_dismissed) %>%
  autoplot()

Fit the Model for the Last Time

# Plug the candidate with the best cross-validated accuracy into the
# workflow, then fit it on the training part of data_split and evaluate it on
# the test part.
xgboost_last <- last_fit(
  finalize_workflow(
    xgboost_workflow,
    select_best(xgboost_tune, metric = "accuracy")
  ),
  data_split
)

xgboost_last %>% collect_metrics()
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.654 Preprocessor1_Model1
## 2 roc_auc     binary         0.421 Preprocessor1_Model1
## 3 brier_class binary         0.303 Preprocessor1_Model1
# Confusion matrix of the final model's class predictions on the test set.
yardstick::conf_mat(
  collect_predictions(xgboost_last),
  truth = ceo_dismissal,
  estimate = .pred_class
)
##                Truth
## Prediction      dismissed not dismissed
##   dismissed             2             4
##   not dismissed         5            15

Variable Importance

library(vip)

# Variable-importance plot for the fitted engine object of the final model.
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

The previous model had an accuracy of 0.56 and an AUC of 0.395; this model reaches an accuracy of 0.654 and an AUC of 0.421 on the test set.