Goal is to predict CEO departure

Import Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.3.2
## ══ correlationfunnel Tip #3 ════════════════════════════════════════════════════
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

skimr::skim(departures)
Data summary
Name departures
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627
factors_vec <- departures %>% select( departure_code, tenure_no_ceodb, max_tenure_ceodb, ceo_dismissal) %>% names()

data_clean <- departures %>%
  select(-interim_coceo, -still_there, -eight_ks, -notes, -sources, -leftofc) %>%
  
    # remove NA's
  na.omit() %>%
  
  # address factors imported as numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # drop zero variance variable name
  select(-c(`_merge`)) %>%
  
  # Recode CEO Dismissal
  mutate(ceo_dismissal = if_else(ceo_dismissal == "Yes", "1", ceo_dismissal))

Explore Data

data_clean %>% count(ceo_dismissal)
## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <chr>         <int>
## 1 0              5822
## 2 1              1439
data_clean %>%
  ggplot(aes(ceo_dismissal)) +
  geom_bar()

fyear vs interim_coceo

data_clean %>%
  ggplot(aes(ceo_dismissal, fyear)) + 
  geom_boxplot()

correlation plot

# step 1: binarize
data_binarized <- data_clean %>%
  binarize()

data_binarized %>% glimpse()
## Rows: 7,261
## Columns: 43
## $ `dismissal_dataset_id__-Inf_2159` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dismissal_dataset_id__2159_4330   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dismissal_dataset_id__4330_6564   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dismissal_dataset_id__6564_Inf    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname__BARRICK_GOLD_CORP         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `gvkey__-Inf_6867`                <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gvkey__6867_13283                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gvkey__13283_30025                <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gvkey__30025_Inf                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear__-Inf_1999`                <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
## $ fyear__1999_2006                  <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ fyear__2006_2012                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear__2012_Inf                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `co_per_rol__-Inf_6968`           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ co_per_rol__6968_18252            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__18252_33294           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__33294_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname__John_W._Rowe       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ departure_code__1                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__2                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__3                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ departure_code__4                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__5                 <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, …
## $ departure_code__6                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__7                 <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ ceo_dismissal__0                  <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ ceo_dismissal__1                  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tenure_no_ceodb__1                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-Inf_2000`           <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
## $ fyear_gone__2000_2006             <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ fyear_gone__2006_2013             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__2013_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cik__-Inf_101063`                <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ cik__101063_832428                <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ cik__832428_1024302               <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cik__1024302_Inf                  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, …
# step 2: correlation
data_correlation <- data_binarized %>%
  correlate(ceo_dismissal__1)

data_correlation
## # A tibble: 43 × 3
##    feature        bin       correlation
##    <fct>          <chr>           <dbl>
##  1 ceo_dismissal  0             -1     
##  2 ceo_dismissal  1              1     
##  3 departure_code 3              0.929 
##  4 departure_code 5             -0.482 
##  5 departure_code 7             -0.298 
##  6 departure_code 4              0.274 
##  7 fyear          -Inf_1999     -0.0785
##  8 departure_code 6             -0.0784
##  9 co_per_rol     -Inf_6968     -0.0598
## 10 fyear_gone     -Inf_2000     -0.0589
## # ℹ 33 more rows
# step 3: plot
data_correlation %>%
  correlationfunnel::plot_correlation_funnel()

Model building

Split data

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(1234)
data_clean <- data_clean %>% sample_n(100)

data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [65/9]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [67/7]> Fold04
##  5 <split [67/7]> Fold05
##  6 <split [67/7]> Fold06
##  7 <split [67/7]> Fold07
##  8 <split [67/7]> Fold08
##  9 <split [67/7]> Fold09
## 10 <split [67/7]> Fold10

Preprocess data

library(themis)
## Warning: package 'themis' was built under R version 4.3.3
xgboost_ceo <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  update_role(cik, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(ceo_dismissal)

xgboost_ceo %>% prep() %>% juice() %>% glimpse()
## Rows: 122
## Columns: 164
## $ dismissal_dataset_id                 <dbl> 3351, 6849, 1976, 5742, 3929, 517…
## $ gvkey                                <dbl> 10247, 61399, 6347, 24997, 11858,…
## $ fyear                                <dbl> 1999, 2015, 2006, 1994, 1998, 201…
## $ co_per_rol                           <dbl> 2117, 54385, 13847, 8317, 15455, …
## $ fyear_gone                           <dbl> 1999, 2016, 2007, 1994, 1997, 201…
## $ cik                                  <dbl> 96021, 899923, 906469, 878549, 31…
## $ ceo_dismissal                        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BERGEN.BRUNSWIG.CORP..CL.A    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BOMBAY.CO.INC                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BRADY.CORP                    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ coname_CAMPBELL.SOUP.CO              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHECKFREE.CORP                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHICOS.FAS.INC                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHILDRENS.PLACE.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CINCINNATI.BELL.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMDISCO.HOLDING.CO.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMMERCE.BANCORP.INC.NJ       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMMERCIAL.FEDERAL.CORP       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CORAM.HEALTHCARE.CORP         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CST.BRANDS.INC                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CUTERA.INC                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DAMES...MOORE.GROUP           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DIALOGIC.CORP.OLD             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DUPONT.PHOTOMASKS.INC         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DURACELL.INTERNATIONAL        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FEDERAL.MOGUL.HOLDINGS.CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FEDERAL.NATIONAL.MORTGA.ASSN  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FIRST.FINL.BANCORP.INC.OH     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FLIR.SYSTEMS.INC              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENTIVA.HEALTH.SERVICES.INC   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENUINE.PARTS.CO              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HA2003.INC                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HARRIS.CORP                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HARTMARX.CORP                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_IBP.INC                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KING.PHARMACEUTICALS.INC      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LABORATORY.CP.OF.AMER.HLDGS   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LEVITZ.FURNITURE.INC..VTG     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LUBRIZOL.CORP                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MCI.INC                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MEAD.JOHNSON.NUTRITION.CO     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ coname_MERCK...CO                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_METTLER.TOLEDO.INTL.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ coname_MGIC.INVESTMENT.CORP.WI       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MICRON.TECHNOLOGY.INC         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MYRIAD.GENETICS.INC           <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NATIONAL.COMMERCE.FINANCIAL   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NEW.CENTURY.ENERGIES.INC      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NIKE.INC..CL.B                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_O.REILLY.AUTOMOTIVE.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OAKWOOD.HOMES.CORP            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OFFICE.DEPOT.INC              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OXFORD.HEALTH.PLANS.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PALM.INC                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PARKER.HANNIFIN.CORP          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PEOPLES.ENERGY.CORP           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PULSE.ELECTRONICS.CORP        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RAYTHEON.CO                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RYAN.S.RESTAURANT.GROUP.INC   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SAFETY.KLEEN.CORP             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SBS.TECHNOLOGIES.INC          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SCANA.CORP                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ coname_SENSIENT.TECHNOLOGIES.CORP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SHARED.MEDICAL.SYSTEMS.CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SMUCKER..JM..CO               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SOUTHERN.CO                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STURM.RUGER...CO.INC          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SYSCO.CORP                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TCF.FINANCIAL.CORP            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TELEFLEX.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TIFFANY...CO                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TIVO.CORP                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TRADESTATION.GROUP.INC        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TRIBUNE.MEDIA.CO              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_UST.CORP                      <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ coname_VALSPAR.CORP                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ coname_VOLT.INFO.SCIENCES.INC        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WASHINGTON.GROUP.INTL.INC     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WELLPOINT.HEALTH.NETWRKS.INC  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ coname_YOUNKERS.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Andrew.C..Teich        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bill.D..Helton         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bill.M..Lindig         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Carmie.Mehrlander      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Charles.D..Way         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Charles.R..Perrin      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Christopher.J..Amenson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Claude.E..Davis        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Daniel.D..Crowley      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.E..O.Reilly      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.P..King          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.S..Boyer         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.Willis.Johnson   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..FitzSimons   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..Gormley      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..Picard       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.E..Washkewicz   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.R..Roden        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Edward.L..Grund        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Elliot.Bernstein       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Eric.A..Benhamou       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ezra.Dabah             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_George.D..Leal         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Homi.B..Patel          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Howard.G..Bubb         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Howard.L..Lance        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_James.A..Johnson       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_James.A..Reinstein     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jeffrey.J..Zwick       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.M..Gregory        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.W..Rollins        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kenneth.P..Manning     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kevin.W..Mooney        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kimberly.S..Lubel      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Leonard.D..Schaeffer   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ exec_fullname_Lou.Weisbach           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Mark.G..Parker         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Mark.T..Smucker        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Marshall.C..Turner.Jr. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.D..Capellas    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.D..Dean        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.J..Kowalski    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Neal.F..Finnegan       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Neil.R..Austrian       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Nicholas.J..St..George <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Norman.P..Blake.Jr.    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_P..Roy.Vagelos         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Peter.D..Meldrum       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Peter.Jeffrey.Kight    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_R..James.Macaleer      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ralph.E..Faison        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.F..Spoerry.MBA  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ exec_fullname_Robert.L..Peterson     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ronald.A..Malone       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Salomon.Sredni         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.F..Wiggins     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.G..Hanks       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.W..Golsby      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ exec_fullname_Steven.R..Appleton     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.C..Gallagher    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.Carson          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.J..Felmer       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.M..Garrott      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.M..Patrick      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Vernon.W..Hill.II      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_W..Thomas.Gould        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.A..Fitzgerald  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.Allen.Cooper   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.B..Ruger.Jr.   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.B..Timmerman   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ exec_fullname_William.G..Bares       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.H..Lacy        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.L..Mansfield   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ departure_code_X2                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X3                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X4                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X5                    <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ departure_code_X6                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X7                    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, …
## $ tenure_no_ceodb_X2                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tenure_no_ceodb_X3                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X3                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X4                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Specify model

library(usemodels)
## Warning: package 'usemodels' was built under R version 4.3.2
usemodels::use_xgboost(ceo_dismissal ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = ceo_dismissal ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(95190)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_spec <- 
  boost_tree(trees = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_ceo) %>% 
  add_model(xgboost_spec) 

Tune hyperparameters

doParallel::registerDoParallel()

set.seed(45034)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv,
            grid = 5,
            control = control_grid(save_pred = TRUE))

Model evaluation

Identify optimal value for hyperparameters

collect_metrics(xgboost_tune)
## # A tibble: 10 × 7
##    trees .metric  .estimator  mean     n std_err .config             
##    <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
##  1   111 accuracy binary     0.971    10  0.0190 Preprocessor1_Model1
##  2   111 roc_auc  binary     0.95     10  0.0356 Preprocessor1_Model1
##  3   683 accuracy binary     0.957    10  0.0305 Preprocessor1_Model2
##  4   683 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model2
##  5  1015 accuracy binary     0.957    10  0.0305 Preprocessor1_Model3
##  6  1015 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model3
##  7  1205 accuracy binary     0.957    10  0.0305 Preprocessor1_Model4
##  8  1205 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model4
##  9  1840 accuracy binary     0.957    10  0.0305 Preprocessor1_Model5
## 10  1840 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model5
collect_predictions(xgboost_tune) %>%
  group_by("id") %>%
  roc_curve(ceo_dismissal, .pred_1) %>%
  autoplot()

Fit the model for the last time

xgboost_last <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
  last_fit(data_split)
## Warning: package 'xgboost' was built under R version 4.3.2
## → A | warning: There are new levels in a factor: MCCORMICK & CO INC, CROWN HOLDINGS INC, GLATFELTER, REGAL BELOIT CORP, BIOTELEMETRY INC, JACK IN THE BOX INC, E TRADE FINANCIAL CORP, PAYLESS CASHWAYS, STRATEGIC EDUCATION INC, INLAND REAL ESTATE CORP, TENNECO INC, WENDY'S CO, DMC GLOBAL INC, ALEX BROWN INC, BANK ONE CORP, ABM INDUSTRIES INC, UNS ENERGY CORP, INFORMATION RESOURCES INC, SCRIPPS NETWORKS INTERACTIVE, WATTS WATER TECHNOLOGIES INC, BJ'S WHOLESALE CLUB INC, HCA HEALTHCARE INC, DRAVO CORP, LINDE PLC, FIRST MIDWEST BANCORP INC, STRATOS INTERNATIONAL INC, There are new levels in a factor: Bailey A. Thomas, John W. Conway Jr., George H. Glatfelter II, Henry W. Knueppel, Ralph H. Thurman, Linda A. Lang, Mitchell Harris Caplan J.D., David Stanley, Robert S. Silberman, Mark E. Zalatoris, CPA, Dana G. Mead, Roland C. Smith, Yvon Pierre Cariou, Alvin Bernard Krongard, John Bonnet McCoy, Henrik C. Slipsager, James Stuart Pignatelli, Gian Mark Fulgoni, Kenneth W. Lowe, Patrick S. O'Keefe, Michael T. Wedge, Jack O. Bovender Jr., Carl A. Gilbert, H. William Lichtenberger, John M. O'Meara, James W. McGinley
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x1
collect_metrics(xgboost_last)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary             1 Preprocessor1_Model1
## 2 roc_auc  binary             1 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
  yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
  autoplot()

Variable importance

library(vip)
## Warning: package 'vip' was built under R version 4.3.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
  workflows::extract_fit_engine() %>%
  vip()

Conclusion

The previous model had accuracy of 0.889 and AUC of .753

Feature transformations: normalized numeric data. It resulted in an improvement of 1 to both accuracy and AUC. Feature transformations: YeoJohnson transformation had no improvements to the model Feature selection: PCA did not make an impact Feature selection: did not make improvement