departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
skimr::skim(departures)
Name | departures |
Number of rows | 9423 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 10 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
# Select relevant columns
factors_vec <- departures %>%
select(departure_code, co_per_rol, fyear, tenure_no_ceodb, max_tenure_ceodb, fyear_gone) %>%
names()
library(dplyr)
library(lubridate)
data_clean <- departures %>%
select(-c(interim_coceo, still_there, eight_ks, gvkey, co_per_rol, cik, fyear, '_merge', notes, sources)) %>%
filter(fyear_gone != "2997") %>%
filter(!is.na(ceo_dismissal)) %>%
mutate(
departure_code = factor(departure_code),
tenure_no_ceodb = factor(tenure_no_ceodb),
max_tenure_ceodb = factor(max_tenure_ceodb),
ceo_dismissal = factor(ceo_dismissal),
leftofc = as.Date(leftofc), # Ensure leftofc is a Date
year = year(leftofc), # Create year directly
doy = yday(leftofc), # Create day of the year directly
month = month(leftofc) # Create month directly
) %>%
select(-leftofc) %>% # Remove leftofc as it's no longer needed
# Drop zero-variance variables
select(-c(tenure_no_ceodb, max_tenure_ceodb)) %>%
# Ensure ceo_dismissal is character, then recode
mutate(ceo_dismissal = if_else(ceo_dismissal == "1", "dismissed",
if_else(ceo_dismissal == "0", "not dismissed",
as.character(ceo_dismissal)))) # Handle NA implicitly
data_clean <- data_clean %>% sample_n(100)
skimr::skim(departures)
Name | departures |
Number of rows | 9423 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 10 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
data_clean %>% count(ceo_dismissal)
## # A tibble: 2 × 2
## ceo_dismissal n
## <chr> <int>
## 1 dismissed 20
## 2 not dismissed 80
data_clean %>%
ggplot(aes(ceo_dismissal)) +
geom_bar()
ceo_dismissal vs. max tenure
#data_clean %>%
#ggplot(aes(max_tenure_ceodb)) +
#geom_boxplot()
# Doesn't represent the data well in my case
correlation plot
# Step 1: binarize
data_binarized <- data_clean %>%
select(-exec_fullname, -coname) %>%
binarize()
data_binarized %>% glimpse
## Rows: 100
## Columns: 28
## $ `dismissal_dataset_id__-Inf_3067.5` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ dismissal_dataset_id__3067.5_4914 <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0…
## $ dismissal_dataset_id__4914_7066 <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ dismissal_dataset_id__7066_Inf <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1…
## $ departure_code__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code__3 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ departure_code__4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code__5 <dbl> 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0…
## $ departure_code__6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ departure_code__7 <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ ceo_dismissal__dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ ceo_dismissal__not_dismissed <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1…
## $ `fyear_gone__-Inf_2001` <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ fyear_gone__2001_2006 <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ fyear_gone__2006_2013.25 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ fyear_gone__2013.25_Inf <dbl> 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ `year__-Inf_2000.75` <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ year__2000.75_2006 <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ year__2006_2013.25 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ year__2013.25_Inf <dbl> 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ `doy__-Inf_61.5` <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ doy__61.5_151 <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1…
## $ doy__151_263.75 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ doy__263.75_Inf <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ `month__-Inf_3` <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1…
## $ month__3_5 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0…
## $ month__5_9 <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ month__9_Inf <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
# Step 2: correlation
data_correlation <- data_binarized %>%
correlate(ceo_dismissal__dismissed)
data_correlation
## # A tibble: 28 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal dismissed 1
## 2 ceo_dismissal not_dismissed -1
## 3 departure_code 3 0.873
## 4 departure_code 5 -0.480
## 5 departure_code 4 0.408
## 6 departure_code 7 -0.312
## 7 fyear_gone 2006_2013.25 0.246
## 8 year 2006_2013.25 0.246
## 9 fyear_gone -Inf_2001 -0.200
## 10 year -Inf_2000.75 -0.173
## # ℹ 18 more rows
# Step 3: plot
data_correlation %>%
correlationfunnel::plot_correlation_funnel()
There is a moderate correlation between departure codes and ceo dismissals so some departures codes are more indicative of ceo dismissals than others.
library(dplyr)
library(rsample)
set.seed(1234)
#data_clean <- data_clean %>% sample_n(100)
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
data_train <- data_train %>%
mutate(unique_id = paste(dismissal_dataset_id, exec_fullname, year, sep = "_")) %>%
group_by(unique_id) %>%
summarize(across(everything(), first))
library(themis)
library(recipes)
# Remove unique_id from data_train before creating the recipe
data_train_cleaned <- data_train %>%
select(-unique_id, -departure_code)
# Create the recipe using the cleaned dataset
xgboost_rec <- recipe(ceo_dismissal ~ ., data = data_train_cleaned) %>%
step_dummy(all_nominal_predictors(), -all_outcomes()) %>%
step_smote(ceo_dismissal)
# Prepare and check the recipe
xgboost_rec_prep <- xgboost_rec %>% prep()
data_prepped <- xgboost_rec_prep %>% juice() %>% glimpse()
## Rows: 120
## Columns: 152
## $ dismissal_dataset_id <dbl> 1120, 12, 1666, 1687, 194, 202…
## $ fyear_gone <dbl> 2017, 1998, 2005, 2017, 2000, …
## $ year <dbl> 2017, 1998, 2005, 2017, 2000, …
## $ doy <dbl> 243, 140, 149, 352, 122, 278, …
## $ month <dbl> 8, 5, 5, 12, 5, 10, 1, 2, 10, …
## $ coname_ADV.NEUROMODULATION.SYS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AHMANSON..H.F....CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AIRGAS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ALBERTO.CULVER.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AMERICAN.AIRLINES.GROUP.INC <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AMGEN.INC <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ coname_ANGIODYNAMICS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_APOGENT.TECHNOLOGIES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_AVAYA.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BEAR.STEARNS.COMPANIES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BERKLEY..W.R..CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BJ.S.RESTAURANTS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_BOSTON.PROPERTIES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_C.COR.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CHRISTOPHER...BANKS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CME.GROUP.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CNX.RESOURCES.CORPORATION <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_COOPER.COMPANIES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CORAM.HEALTHCARE.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CORVEL.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_CRACKER.BARREL.OLD.CTRY.STOR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DEKALB.GENETICS.CORP..CL.B <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_DOWDUPONT.INC <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_EARTHLINK.HOLDINGS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_EDISON.INTERNATIONAL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ENERGY.FUTURE.HOLDINGS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GENWORTH.FINANCIAL.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GOTTSCHALKS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GRANITE.CONSTRUCTION.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_GREEN.PLAINS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_HP.INC <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ICU.MEDICAL.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_INFOSEEK.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_INGRAM.MICRO.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_INTERSIL.CORP..CL.A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_IXIA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KBR.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KEMET.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KIMCO.REALTY.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_KULICKE...SOFFA.INDUSTRIES <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ coname_LILLIAN.VERNON.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_LOWE.S.COMPANIES.INC <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ coname_MATTEL.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ coname_MERRILL.LYNCH...CO.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ coname_NORTEL.NETWORKS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OLIN.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OREGON.STEEL.MILLS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_OVERSEAS.SHIPHOLDING.GROUP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PENNEY..J.C..CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PEOPLESOFT.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PERFORMANCE.FOOD.GROUP.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_PINNACLE.ENTERTAINMENT.INC <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ coname_QUANTUM.HEALTH.RESOURCES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_RADIAN.GROUP.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ROCKWELL.COLLINS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SENSORMATIC.ELECTRONICS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SHUTTERFLY.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SILICON.LABORATORIES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SM.ENERGY.CO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SPRINT.PCS.GROUP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STANCORP.FINANCIAL.GROUP.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STEEL.EXCEL.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_STONE.ENERGY.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SUNDSTRAND.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SUPER.FOOD.SERVICES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_SYMANTEC.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TALK.AMERICA.HOLDINGS.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TBC.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_TELE.COMM.TCI.GROUP..SER.A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_THIRD.POINT.REINSURANCE.LTD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_ULTRATECH.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WINN.DIXIE.STORES.INC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coname_WOLVERINE.WORLD.WIDE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Alan.C..Greenberg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Andrew.N..Liveris.A.O..AO <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Anthony.J..Best <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Anthony.M..Sanfilippo <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Arthur.W..Zafiropoulo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Barry.A..Ellsworth <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bruce.P..Bickner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Bruno.Guilmart <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ exec_fullname_Charles.R..Rinehart <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Christopher.G..Chavez <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Christopher.North <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Clayton.M..Jones <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Craig.A..Conway <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Dan.W..Evins <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Daniel.D..Crowley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.B..Henry <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.E..Maguire <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.H..Watts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_David.H..Welch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Don.R..O.Hare <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Donald.K..Peterson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Douglas.H..Stickney <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Eamonn.P..Hobbs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Edward.H..Linde <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Enrique.T..Salem <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ernest.Stanley.O.Neal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ exec_fullname_Frank.Lazaran <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Frank.P..Filipps <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gabriel.Battista <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gary.D..Forsee <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Geoffrey.B..Bloom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_George.A..Lopez <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gerald.W..Deitchle <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Gordon.M..Binder <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ exec_fullname_J..Brett.Harvey <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jack.Lawrence.Howard <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jack.Twyman <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jerry.S..Farrington <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Jill.E..Barad <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ exec_fullname_Joe.Levy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.C..Malone..Ph.D. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.E..Bryson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_John.R..Berger <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Joseph.D..Rupp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Joseph.E..Pennington <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Joseph.F..Eazor <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Kenneth.F..Yontz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Lillian.Vernon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Livio.D..DeSimone <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ exec_fullname_Louis.S..DiPasqua <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Martin.Philip.Klein <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.A..Woodhouse <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Michael.L..Molinini <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Morton.P..Hyman <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Necip.Sayiner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Paviter.Singh.Binning <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Phupinder.S..Gill <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Richard.E..Perry <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.A..Vanourek <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.C..Sledd <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.E.L..Johnson.III <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.L..Crandall <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.L..Tillman <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ exec_fullname_Robert.Paul.Wayman <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Robert.S..Weiss <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ronald.B..Johnson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Ronald.E..Timpe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Thomas.B..Boklund <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Victor.Alston <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Victor.Gordon.Clemons.Sr. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_Vincent.James.Marino <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.P..Utt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname_William.Robert.Berkley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ceo_dismissal <fct> not dismissed, not dismissed, …
library(usemodels)
usemodels::use_xgboost(ceo_dismissal ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = ceo_dismissal ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(67594)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(workflows)
library(parsnip)
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_rec) %>%
add_model(xgboost_spec)
library(tune)
doParallel::registerDoParallel()
set.seed(17375)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5,
control = control_grid(save_pred = TRUE))
## Warning: package 'xgboost' was built under R version 4.3.3
library(yardstick)
collect_metrics(xgboost_tune)
## # A tibble: 15 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 677 7 14 0.00195 5.18e- 4 0.477 accuracy
## 2 677 7 14 0.00195 5.18e- 4 0.477 brier_class
## 3 677 7 14 0.00195 5.18e- 4 0.477 roc_auc
## 4 1016 11 4 0.00394 3.13e-10 0.723 accuracy
## 5 1016 11 4 0.00394 3.13e-10 0.723 brier_class
## 6 1016 11 4 0.00394 3.13e-10 0.723 roc_auc
## 7 1626 19 8 0.0202 2.06e- 7 0.257 accuracy
## 8 1626 19 8 0.0202 2.06e- 7 0.257 brier_class
## 9 1626 19 8 0.0202 2.06e- 7 0.257 roc_auc
## 10 1483 30 5 0.0873 5.18e- 3 0.401 accuracy
## 11 1483 30 5 0.0873 5.18e- 3 0.401 brier_class
## 12 1483 30 5 0.0873 5.18e- 3 0.401 roc_auc
## 13 111 39 12 0.238 4.91e- 1 0.850 accuracy
## 14 111 39 12 0.238 4.91e- 1 0.850 brier_class
## 15 111 39 12 0.238 4.91e- 1 0.850 roc_auc
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(ceo_dismissal, .pred_dismissed) %>%
autoplot()
xgboost_last <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
last_fit(data_split)
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.48 Preprocessor1_Model1
## 2 roc_auc binary 0.585 Preprocessor1_Model1
## 3 brier_class binary 0.250 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
yardstick::conf_mat(ceo_dismissal, .pred_class)
## Truth
## Prediction dismissed not dismissed
## dismissed 3 11
## not dismissed 2 9
library(vip)
xgboost_last %>%
workflows::extract_fit_engine() %>%
vip()