Goal is to predict CEO departure
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.3.2
## ══ correlationfunnel Tip #3 ════════════════════════════════════════════════════
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(departures)
| Name | departures |
| Number of rows | 9423 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 10 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
| exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
| interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
| still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
| notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
| sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
| eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
| _merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
| gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
| fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
| co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
| departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
| ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
| tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
| max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
| fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
| cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
factors_vec <- departures %>% select( departure_code, tenure_no_ceodb, max_tenure_ceodb, ceo_dismissal) %>% names()
data_clean <- departures %>%
select(-interim_coceo, -still_there, -eight_ks, -notes, -sources, -leftofc) %>%
# remove NA's
na.omit() %>%
# address factors imported as numeric
mutate(across(all_of(factors_vec), as.factor)) %>%
# drop zero variance variable name
select(-c(`_merge`)) %>%
# Recode CEO Dismissal
mutate(ceo_dismissal = if_else(ceo_dismissal == "Yes", "1", ceo_dismissal))
data_clean %>% count(ceo_dismissal)
## # A tibble: 2 × 2
## ceo_dismissal n
## <chr> <int>
## 1 0 5822
## 2 1 1439
data_clean %>%
ggplot(aes(ceo_dismissal)) +
geom_bar()
fyear vs interim_coceo
data_clean %>%
ggplot(aes(ceo_dismissal, fyear)) +
geom_boxplot()
correlation plot
# step 1: binarize
data_binarized <- data_clean %>%
binarize()
data_binarized %>% glimpse()
## Rows: 7,261
## Columns: 43
## $ `dismissal_dataset_id__-Inf_2159` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dismissal_dataset_id__2159_4330 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dismissal_dataset_id__4330_6564 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dismissal_dataset_id__6564_Inf <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname__BARRICK_GOLD_CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `gvkey__-Inf_6867` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gvkey__6867_13283 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gvkey__13283_30025 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gvkey__30025_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear__-Inf_1999` <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
## $ fyear__1999_2006 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ fyear__2006_2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear__2012_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `co_per_rol__-Inf_6968` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ co_per_rol__6968_18252 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__18252_33294 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__33294_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ departure_code__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__3 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ departure_code__4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__5 <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, …
## $ departure_code__6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__7 <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ ceo_dismissal__0 <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ ceo_dismissal__1 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tenure_no_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-Inf_2000` <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
## $ fyear_gone__2000_2006 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ fyear_gone__2006_2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__2013_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cik__-Inf_101063` <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ cik__101063_832428 <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ cik__832428_1024302 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cik__1024302_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, …
# step 2: correlation
data_correlation <- data_binarized %>%
correlate(ceo_dismissal__1)
data_correlation
## # A tibble: 43 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal 0 -1
## 2 ceo_dismissal 1 1
## 3 departure_code 3 0.929
## 4 departure_code 5 -0.482
## 5 departure_code 7 -0.298
## 6 departure_code 4 0.274
## 7 fyear -Inf_1999 -0.0785
## 8 departure_code 6 -0.0784
## 9 co_per_rol -Inf_6968 -0.0598
## 10 fyear_gone -Inf_2000 -0.0589
## # ℹ 33 more rows
# step 3: plot
data_correlation %>%
correlationfunnel::plot_correlation_funnel()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(1234)
data_clean <- data_clean %>% sample_n(100)
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [65/9]> Fold01
## 2 <split [66/8]> Fold02
## 3 <split [66/8]> Fold03
## 4 <split [67/7]> Fold04
## 5 <split [67/7]> Fold05
## 6 <split [67/7]> Fold06
## 7 <split [67/7]> Fold07
## 8 <split [67/7]> Fold08
## 9 <split [67/7]> Fold09
## 10 <split [67/7]> Fold10
library(themis)
## Warning: package 'themis' was built under R version 4.3.3
xgboost_ceo <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
update_role(cik, new_role = "ID") %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(ceo_dismissal)
xgboost_ceo %>% prep() %>% juice() %>% glimpse()
## Rows: 122
## Columns: 164
## $ dismissal_dataset_id <dbl> 3351, 6849, 1976, 5742, 3929, 517…
## $ gvkey <dbl> 10247, 61399, 6347, 24997, 11858,…
## $ fyear <dbl> 1999, 2015, 2006, 1994, 1998, 201…
## $ co_per_rol <dbl> 2117, 54385, 13847, 8317, 15455, …
## $ fyear_gone <dbl> 1999, 2016, 2007, 1994, 1997, 201…
## $ cik <dbl> 96021, 899923, 906469, 878549, 31…
## $ ceo_dismissal <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BERGEN.BRUNSWIG.CORP..CL.A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BOMBAY.CO.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BRADY.CORP <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ coname_CAMPBELL.SOUP.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHECKFREE.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHICOS.FAS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHILDRENS.PLACE.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CINCINNATI.BELL.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMDISCO.HOLDING.CO.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMMERCE.BANCORP.INC.NJ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COMMERCIAL.FEDERAL.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CORAM.HEALTHCARE.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CST.BRANDS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CUTERA.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DAMES...MOORE.GROUP <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DIALOGIC.CORP.OLD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DUPONT.PHOTOMASKS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DURACELL.INTERNATIONAL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FEDERAL.MOGUL.HOLDINGS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FEDERAL.NATIONAL.MORTGA.ASSN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FIRST.FINL.BANCORP.INC.OH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_FLIR.SYSTEMS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENTIVA.HEALTH.SERVICES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENUINE.PARTS.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HA2003.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HARRIS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HARTMARX.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_IBP.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KING.PHARMACEUTICALS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LABORATORY.CP.OF.AMER.HLDGS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LEVITZ.FURNITURE.INC..VTG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LUBRIZOL.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MCI.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MEAD.JOHNSON.NUTRITION.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ coname_MERCK...CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_METTLER.TOLEDO.INTL.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ coname_MGIC.INVESTMENT.CORP.WI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MICRON.TECHNOLOGY.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_MYRIAD.GENETICS.INC <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NATIONAL.COMMERCE.FINANCIAL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NEW.CENTURY.ENERGIES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_NIKE.INC..CL.B <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_O.REILLY.AUTOMOTIVE.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OAKWOOD.HOMES.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OFFICE.DEPOT.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OXFORD.HEALTH.PLANS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PALM.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PARKER.HANNIFIN.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PEOPLES.ENERGY.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PULSE.ELECTRONICS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RAYTHEON.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RYAN.S.RESTAURANT.GROUP.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SAFETY.KLEEN.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SBS.TECHNOLOGIES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SCANA.CORP <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ coname_SENSIENT.TECHNOLOGIES.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SHARED.MEDICAL.SYSTEMS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SMUCKER..JM..CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SOUTHERN.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STURM.RUGER...CO.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SYSCO.CORP <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TCF.FINANCIAL.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TELEFLEX.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TIFFANY...CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TIVO.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TRADESTATION.GROUP.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TRIBUNE.MEDIA.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_UST.CORP <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ coname_VALSPAR.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ coname_VOLT.INFO.SCIENCES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WASHINGTON.GROUP.INTL.INC <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WELLPOINT.HEALTH.NETWRKS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ coname_YOUNKERS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Andrew.C..Teich <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bill.D..Helton <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bill.M..Lindig <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Carmie.Mehrlander <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Charles.D..Way <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Charles.R..Perrin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Christopher.J..Amenson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Claude.E..Davis <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Daniel.D..Crowley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.E..O.Reilly <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.P..King <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.S..Boyer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.Willis.Johnson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..FitzSimons <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..Gormley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dennis.J..Picard <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.E..Washkewicz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.R..Roden <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Edward.L..Grund <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Elliot.Bernstein <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Eric.A..Benhamou <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ezra.Dabah <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_George.D..Leal <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Homi.B..Patel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Howard.G..Bubb <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Howard.L..Lance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_James.A..Johnson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_James.A..Reinstein <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jeffrey.J..Zwick <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.M..Gregory <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.W..Rollins <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kenneth.P..Manning <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kevin.W..Mooney <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kimberly.S..Lubel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Leonard.D..Schaeffer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ exec_fullname_Lou.Weisbach <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Mark.G..Parker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Mark.T..Smucker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Marshall.C..Turner.Jr. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.D..Capellas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.D..Dean <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.J..Kowalski <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Neal.F..Finnegan <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Neil.R..Austrian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Nicholas.J..St..George <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Norman.P..Blake.Jr. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_P..Roy.Vagelos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Peter.D..Meldrum <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Peter.Jeffrey.Kight <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_R..James.Macaleer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ralph.E..Faison <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.F..Spoerry.MBA <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ exec_fullname_Robert.L..Peterson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ronald.A..Malone <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Salomon.Sredni <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.F..Wiggins <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.G..Hanks <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Stephen.W..Golsby <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ exec_fullname_Steven.R..Appleton <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.C..Gallagher <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.Carson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.J..Felmer <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.M..Garrott <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.M..Patrick <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Vernon.W..Hill.II <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_W..Thomas.Gould <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.A..Fitzgerald <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.Allen.Cooper <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.B..Ruger.Jr. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.B..Timmerman <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ exec_fullname_William.G..Bares <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.H..Lacy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.L..Mansfield <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ departure_code_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X5 <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ departure_code_X6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code_X7 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, …
## $ tenure_no_ceodb_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tenure_no_ceodb_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb_X4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.3.2
usemodels::use_xgboost(ceo_dismissal ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = ceo_dismissal ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(95190)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_spec <-
boost_tree(trees = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_ceo) %>%
add_model(xgboost_spec)
doParallel::registerDoParallel()
set.seed(45034)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5,
control = control_grid(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 10 × 7
## trees .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 111 accuracy binary 0.971 10 0.0190 Preprocessor1_Model1
## 2 111 roc_auc binary 0.95 10 0.0356 Preprocessor1_Model1
## 3 683 accuracy binary 0.957 10 0.0305 Preprocessor1_Model2
## 4 683 roc_auc binary 0.933 10 0.0444 Preprocessor1_Model2
## 5 1015 accuracy binary 0.957 10 0.0305 Preprocessor1_Model3
## 6 1015 roc_auc binary 0.933 10 0.0444 Preprocessor1_Model3
## 7 1205 accuracy binary 0.957 10 0.0305 Preprocessor1_Model4
## 8 1205 roc_auc binary 0.933 10 0.0444 Preprocessor1_Model4
## 9 1840 accuracy binary 0.957 10 0.0305 Preprocessor1_Model5
## 10 1840 roc_auc binary 0.933 10 0.0444 Preprocessor1_Model5
collect_predictions(xgboost_tune) %>%
group_by("id") %>%
roc_curve(ceo_dismissal, .pred_1) %>%
autoplot()
xgboost_last <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
last_fit(data_split)
## Warning: package 'xgboost' was built under R version 4.3.2
## → A | warning: There are new levels in a factor: MCCORMICK & CO INC, CROWN HOLDINGS INC, GLATFELTER, REGAL BELOIT CORP, BIOTELEMETRY INC, JACK IN THE BOX INC, E TRADE FINANCIAL CORP, PAYLESS CASHWAYS, STRATEGIC EDUCATION INC, INLAND REAL ESTATE CORP, TENNECO INC, WENDY'S CO, DMC GLOBAL INC, ALEX BROWN INC, BANK ONE CORP, ABM INDUSTRIES INC, UNS ENERGY CORP, INFORMATION RESOURCES INC, SCRIPPS NETWORKS INTERACTIVE, WATTS WATER TECHNOLOGIES INC, BJ'S WHOLESALE CLUB INC, HCA HEALTHCARE INC, DRAVO CORP, LINDE PLC, FIRST MIDWEST BANCORP INC, STRATOS INTERNATIONAL INC, There are new levels in a factor: Bailey A. Thomas, John W. Conway Jr., George H. Glatfelter II, Henry W. Knueppel, Ralph H. Thurman, Linda A. Lang, Mitchell Harris Caplan J.D., David Stanley, Robert S. Silberman, Mark E. Zalatoris, CPA, Dana G. Mead, Roland C. Smith, Yvon Pierre Cariou, Alvin Bernard Krongard, John Bonnet McCoy, Henrik C. Slipsager, James Stuart Pignatelli, Gian Mark Fulgoni, Kenneth W. Lowe, Patrick S. O'Keefe, Michael T. Wedge, Jack O. Bovender Jr., Carl A. Gilbert, H. William Lichtenberger, John M. O'Meara, James W. McGinley
##
There were issues with some computations A: x1
There were issues with some computations A: x1
collect_metrics(xgboost_last)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 1 Preprocessor1_Model1
## 2 roc_auc binary 1 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
autoplot()
library(vip)
## Warning: package 'vip' was built under R version 4.3.3
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgboost_last %>%
workflows::extract_fit_engine() %>%
vip()