Goal is to predict CEO departure

Import Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.3.2

## ══ correlationfunnel Tip #3 ════════════════════════════════════════════════════
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

skimr::skim(departures)

Data summary
Name	departures
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

factors_vec <- departures %>% select( departure_code, tenure_no_ceodb, max_tenure_ceodb, ceo_dismissal) %>% names()

data_clean <- departures %>%
  select(-interim_coceo, -still_there, -eight_ks, -notes, -sources, -leftofc) %>%
  
    # remove NA's
  na.omit() %>%
  
  # address factors imported as numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # drop zero variance variable name
  select(-c(`_merge`)) %>%
  
  # Recode CEO Dismissal
  mutate(ceo_dismissal = if_else(ceo_dismissal == "Yes", "1", ceo_dismissal))

Explore Data

data_clean %>% count(ceo_dismissal)

## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <chr>         <int>
## 1 0              5822
## 2 1              1439

data_clean %>%
  ggplot(aes(ceo_dismissal)) +
  geom_bar()

fyear vs interim_coceo

data_clean %>%
  ggplot(aes(ceo_dismissal, fyear)) + 
  geom_boxplot()

correlation plot

# step 1: binarize
data_binarized <- data_clean %>%
  binarize()

data_binarized %>% glimpse()

## Rows: 7,261
## Columns: 43
## $ `dismissal_dataset_id__-Inf_2159` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dismissal_dataset_id__2159_4330   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dismissal_dataset_id__4330_6564   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dismissal_dataset_id__6564_Inf    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname__BARRICK_GOLD_CORP         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `gvkey__-Inf_6867`                <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gvkey__6867_13283                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gvkey__13283_30025                <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gvkey__30025_Inf                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear__-Inf_1999`                <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
## $ fyear__1999_2006                  <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ fyear__2006_2012                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear__2012_Inf                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `co_per_rol__-Inf_6968`           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ co_per_rol__6968_18252            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__18252_33294           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__33294_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname__John_W._Rowe       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ departure_code__1                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__2                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__3                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ departure_code__4                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__5                 <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, …
## $ departure_code__6                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__7                 <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ ceo_dismissal__0                  <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ ceo_dismissal__1                  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tenure_no_ceodb__1                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-Inf_2000`           <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
## $ fyear_gone__2000_2006             <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ fyear_gone__2006_2013             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__2013_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cik__-Inf_101063`                <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ cik__101063_832428                <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ cik__832428_1024302               <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cik__1024302_Inf                  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, …

# step 2: correlation
data_correlation <- data_binarized %>%
  correlate(ceo_dismissal__1)

data_correlation

## # A tibble: 43 × 3
##    feature        bin       correlation
##    <fct>          <chr>           <dbl>
##  1 ceo_dismissal  0             -1     
##  2 ceo_dismissal  1              1     
##  3 departure_code 3              0.929 
##  4 departure_code 5             -0.482 
##  5 departure_code 7             -0.298 
##  6 departure_code 4              0.274 
##  7 fyear          -Inf_1999     -0.0785
##  8 departure_code 6             -0.0784
##  9 co_per_rol     -Inf_6968     -0.0598
## 10 fyear_gone     -Inf_2000     -0.0589
## # ℹ 33 more rows

# step 3: plot
data_correlation %>%
  correlationfunnel::plot_correlation_funnel()

Model building

Split data

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

set.seed(1234)
data_clean <- data_clean %>% sample_n(100)

data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [65/9]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [67/7]> Fold04
##  5 <split [67/7]> Fold05
##  6 <split [67/7]> Fold06
##  7 <split [67/7]> Fold07
##  8 <split [67/7]> Fold08
##  9 <split [67/7]> Fold09
## 10 <split [67/7]> Fold10

Preprocess data

library(themis)

## Warning: package 'themis' was built under R version 4.3.3

xgboost_ceo <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  update_role(cik, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(ceo_dismissal)

xgboost_ceo %>% prep() %>% juice() %>% glimpse()

## Rows: 122
## Columns: 164
## $ dismissal_dataset_id                 <dbl> 3351, 6849, 1976, 5742, 3929, 517…
## $ gvkey                                <dbl> 10247, 61399, 6347, 24997, 11858,…
## $ fyear                                <dbl> 1999, 2015, 2006, 1994, 1998, 201…
## $ co_per_rol                           <dbl> 2117, 54385, 13847, 8317, 15455, …
## $ fyear_gone                           <dbl> 1999, 2016, 2007, 1994, 1997, 201…
## $ cik                                  <dbl> 96021, 899923, 906469, 878549, 31…
## $ ceo_dismissal                        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BERGEN.BRUNSWIG.CORP..CL.A    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BOMBAY.CO.INC                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BRADY.CORP                    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ coname_CAMPBELL.SOUP.CO              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHECKFREE.CORP                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHICOS.FAS.INC                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHILDRENS.PLACE.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CINCINNATI.BELL.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMDISCO.HOLDING.CO.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMMERCE.BANCORP.INC.NJ       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMMERCIAL.FEDERAL.CORP       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CORAM.HEALTHCARE.CORP         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CST.BRANDS.INC                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CUTERA.INC                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DAMES...MOORE.GROUP           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DIALOGIC.CORP.OLD             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DUPONT.PHOTOMASKS.INC         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DURACELL.INTERNATIONAL        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FEDERAL.MOGUL.HOLDINGS.CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FEDERAL.NATIONAL.MORTGA.ASSN  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FIRST.FINL.BANCORP.INC.OH     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FLIR.SYSTEMS.INC              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENTIVA.HEALTH.SERVICES.INC   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENUINE.PARTS.CO              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HA2003.INC                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HARRIS.CORP                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HARTMARX.CORP                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_IBP.INC                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KING.PHARMACEUTICALS.INC      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LABORATORY.CP.OF.AMER.HLDGS   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LEVITZ.FURNITURE.INC..VTG     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LUBRIZOL.CORP                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MCI.INC                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MEAD.JOHNSON.NUTRITION.CO     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ coname_MERCK...CO                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_METTLER.TOLEDO.INTL.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ coname_MGIC.INVESTMENT.CORP.WI       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MICRON.TECHNOLOGY.INC         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MYRIAD.GENETICS.INC           <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NATIONAL.COMMERCE.FINANCIAL   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NEW.CENTURY.ENERGIES.INC      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NIKE.INC..CL.B                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_O.REILLY.AUTOMOTIVE.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OAKWOOD.HOMES.CORP            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OFFICE.DEPOT.INC              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OXFORD.HEALTH.PLANS.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PALM.INC                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PARKER.HANNIFIN.CORP          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PEOPLES.ENERGY.CORP           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PULSE.ELECTRONICS.CORP        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RAYTHEON.CO                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RYAN.S.RESTAURANT.GROUP.INC   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SAFETY.KLEEN.CORP             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SBS.TECHNOLOGIES.INC          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SCANA.CORP                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ coname_SENSIENT.TECHNOLOGIES.CORP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SHARED.MEDICAL.SYSTEMS.CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SMUCKER..JM..CO               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SOUTHERN.CO                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STURM.RUGER...CO.INC          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SYSCO.CORP                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TCF.FINANCIAL.CORP            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TELEFLEX.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TIFFANY...CO                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TIVO.CORP                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TRADESTATION.GROUP.INC        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TRIBUNE.MEDIA.CO              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_UST.CORP                      <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ coname_VALSPAR.CORP                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ coname_VOLT.INFO.SCIENCES.INC        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WASHINGTON.GROUP.INTL.INC     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WELLPOINT.HEALTH.NETWRKS.INC  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ coname_YOUNKERS.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Andrew.C..Teich        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bill.D..Helton         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bill.M..Lindig         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Carmie.Mehrlander      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Charles.D..Way         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Charles.R..Perrin      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Christopher.J..Amenson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Claude.E..Davis        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Daniel.D..Crowley      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.E..O.Reilly      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.P..King          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.S..Boyer         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.Willis.Johnson   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..FitzSimons   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..Gormley      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..Picard       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.E..Washkewicz   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.R..Roden        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Edward.L..Grund        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Elliot.Bernstein       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Eric.A..Benhamou       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ezra.Dabah             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_George.D..Leal         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Homi.B..Patel          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Howard.G..Bubb         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Howard.L..Lance        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_James.A..Johnson       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_James.A..Reinstein     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jeffrey.J..Zwick       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.M..Gregory        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.W..Rollins        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kenneth.P..Manning     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kevin.W..Mooney        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kimberly.S..Lubel      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Leonard.D..Schaeffer   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ exec_fullname_Lou.Weisbach           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Mark.G..Parker         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Mark.T..Smucker        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Marshall.C..Turner.Jr. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.D..Capellas    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.D..Dean        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.J..Kowalski    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Neal.F..Finnegan       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Neil.R..Austrian       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Nicholas.J..St..George <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Norman.P..Blake.Jr.    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_P..Roy.Vagelos         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Peter.D..Meldrum       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Peter.Jeffrey.Kight    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_R..James.Macaleer      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ralph.E..Faison        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.F..Spoerry.MBA  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ exec_fullname_Robert.L..Peterson     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ronald.A..Malone       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Salomon.Sredni         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.F..Wiggins     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.G..Hanks       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.W..Golsby      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ exec_fullname_Steven.R..Appleton     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.C..Gallagher    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.Carson          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.J..Felmer       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.M..Garrott      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.M..Patrick      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Vernon.W..Hill.II      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_W..Thomas.Gould        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.A..Fitzgerald  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.Allen.Cooper   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.B..Ruger.Jr.   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.B..Timmerman   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ exec_fullname_William.G..Bares       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.H..Lacy        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.L..Mansfield   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ departure_code_X2                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X3                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X4                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X5                    <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ departure_code_X6                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X7                    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, …
## $ tenure_no_ceodb_X2                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tenure_no_ceodb_X3                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X3                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X4                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Specify model

library(usemodels)

## Warning: package 'usemodels' was built under R version 4.3.2

usemodels::use_xgboost(ceo_dismissal ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = ceo_dismissal ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(95190)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

xgboost_spec <- 
  boost_tree(trees = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_ceo) %>% 
  add_model(xgboost_spec)

Tune hyperparameters

doParallel::registerDoParallel()

set.seed(45034)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv,
            grid = 5,
            control = control_grid(save_pred = TRUE))

Model evaluation

Identify optimal value for hyperparameters

collect_metrics(xgboost_tune)

## # A tibble: 10 × 7
##    trees .metric  .estimator  mean     n std_err .config             
##    <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
##  1   111 accuracy binary     0.971    10  0.0190 Preprocessor1_Model1
##  2   111 roc_auc  binary     0.95     10  0.0356 Preprocessor1_Model1
##  3   683 accuracy binary     0.957    10  0.0305 Preprocessor1_Model2
##  4   683 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model2
##  5  1015 accuracy binary     0.957    10  0.0305 Preprocessor1_Model3
##  6  1015 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model3
##  7  1205 accuracy binary     0.957    10  0.0305 Preprocessor1_Model4
##  8  1205 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model4
##  9  1840 accuracy binary     0.957    10  0.0305 Preprocessor1_Model5
## 10  1840 roc_auc  binary     0.933    10  0.0444 Preprocessor1_Model5

collect_predictions(xgboost_tune) %>%
  group_by("id") %>%
  roc_curve(ceo_dismissal, .pred_1) %>%
  autoplot()

Fit the model for the last time

xgboost_last <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
  last_fit(data_split)

## Warning: package 'xgboost' was built under R version 4.3.2

## → A | warning: There are new levels in a factor: MCCORMICK & CO INC, CROWN HOLDINGS INC, GLATFELTER, REGAL BELOIT CORP, BIOTELEMETRY INC, JACK IN THE BOX INC, E TRADE FINANCIAL CORP, PAYLESS CASHWAYS, STRATEGIC EDUCATION INC, INLAND REAL ESTATE CORP, TENNECO INC, WENDY'S CO, DMC GLOBAL INC, ALEX BROWN INC, BANK ONE CORP, ABM INDUSTRIES INC, UNS ENERGY CORP, INFORMATION RESOURCES INC, SCRIPPS NETWORKS INTERACTIVE, WATTS WATER TECHNOLOGIES INC, BJ'S WHOLESALE CLUB INC, HCA HEALTHCARE INC, DRAVO CORP, LINDE PLC, FIRST MIDWEST BANCORP INC, STRATOS INTERNATIONAL INC, There are new levels in a factor: Bailey A. Thomas, John W. Conway Jr., George H. Glatfelter II, Henry W. Knueppel, Ralph H. Thurman, Linda A. Lang, Mitchell Harris Caplan J.D., David Stanley, Robert S. Silberman, Mark E. Zalatoris, CPA, Dana G. Mead, Roland C. Smith, Yvon Pierre Cariou, Alvin Bernard Krongard, John Bonnet McCoy, Henrik C. Slipsager, James Stuart Pignatelli, Gian Mark Fulgoni, Kenneth W. Lowe, Patrick S. O'Keefe, Michael T. Wedge, Jack O. Bovender Jr., Carl A. Gilbert, H. William Lichtenberger, John M. O'Meara, James W. McGinley

## 
There were issues with some computations   A: x1

There were issues with some computations   A: x1

collect_metrics(xgboost_last)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary             1 Preprocessor1_Model1
## 2 roc_auc  binary             1 Preprocessor1_Model1

collect_predictions(xgboost_last) %>%
  yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
  autoplot()

Variable importance

library(vip)

## Warning: package 'vip' was built under R version 4.3.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgboost_last %>%
  workflows::extract_fit_engine() %>%
  vip()

Conclusion

The previous model had accuracy of 0.889 and AUC of .753

Feature transformations: normalized numeric data. It resulted in an improvement of 1 to both accuracy and AUC. Feature transformations: YeoJohnson transformation had no improvements to the model Feature selection: PCA did not make an impact Feature selection: did not make improvement

Apply Data 8: CEO Departures

Olivia Pendergast

2024-04-11