Explore Data
# Step 1: Binarize
# Turn every column of data_clean into 0/1 indicator columns
# (one column per category or per numeric bin), then inspect the result.
data_binarized <- binarize(data_clean)
glimpse(data_binarized)
## Rows: 7,476
## Columns: 29
## $ dismissal_dataset_id__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `dismissal_dataset_id__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `fyear__-Inf_1999` <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ fyear__1999_2006 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ fyear__2006_2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ fyear__2012_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `co_per_rol__-Inf_6975.5` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ co_per_rol__6975.5_18267.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__18267.5_33414.25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__33414.25_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__3 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ departure_code__4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__5 <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, …
## $ departure_code__6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ departure_code__7 <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ ceo_dismissal__0 <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, …
## $ ceo_dismissal__1 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ tenure_no_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-Inf_2000` <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ fyear_gone__2000_2006 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ fyear_gone__2006_2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ fyear_gone__2013_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlation
# Correlate each binarized column with the ceo_dismissal__0 indicator.
data_correlation <- correlate(data_binarized, ceo_dismissal__0)

# Print the feature / bin / correlation table.
data_correlation
## # A tibble: 29 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal 0 1
## 2 ceo_dismissal 1 -1
## 3 departure_code 3 -0.929
## 4 departure_code 5 0.476
## 5 departure_code 7 0.304
## 6 departure_code 4 -0.273
## 7 departure_code 6 0.0786
## 8 fyear -Inf_1999 0.0775
## 9 co_per_rol -Inf_6975.5 0.0595
## 10 fyear_gone -Inf_2000 0.0585
## # ℹ 19 more rows
# Step 3: Plot
# Draw the correlation funnel from the correlation table built in Step 2.
correlationfunnel::plot_correlation_funnel(data_correlation)

Model Building
Split Data
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## Warning: package 'dials' was built under R version 4.3.3
## Warning: package 'infer' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## Warning: package 'rsample' was built under R version 4.3.3
## Warning: package 'tune' was built under R version 4.3.3
## Warning: package 'workflows' was built under R version 4.3.3
## Warning: package 'workflowsets' was built under R version 4.3.3
## Warning: package 'yardstick' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
# Fix the RNG so the train/test split and the folds are reproducible.
set.seed(1234)

# Train/test split, stratified on the outcome.
data_split <- rsample::initial_split(data_clean, strata = ceo_dismissal)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

# Stratified cross-validation folds built from the training set only.
data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5044/562]> Fold01
## 2 <split [5044/562]> Fold02
## 3 <split [5045/561]> Fold03
## 4 <split [5045/561]> Fold04
## 5 <split [5046/560]> Fold05
## 6 <split [5046/560]> Fold06
## 7 <split [5046/560]> Fold07
## 8 <split [5046/560]> Fold08
## 9 <split [5046/560]> Fold09
## 10 <split [5046/560]> Fold10
Preprocess Data
library(themis)
## Warning: package 'themis' was built under R version 4.3.3
# data_train <- data_train %>% mutate(ceo_dismissal = as.factor(ceo_dismissal))
# Preprocessing recipe: model ceo_dismissal from all other columns.
#   * dismissal_dataset_id is given the "ID" role, so it is kept in the data
#     but not used as a predictor.
#   * nominal predictors are expanded into dummy columns.
#   * SMOTE is applied to the outcome classes.
# NOTE(review): departure_code bin 3 shows a -0.929 correlation with
# ceo_dismissal__0 in the correlation table -- confirm that keeping
# departure_code as a predictor does not leak the outcome.
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  recipes::update_role(dismissal_dataset_id, new_role = "ID") %>%
  recipes::step_dummy(recipes::all_nominal_predictors()) %>%
  themis::step_smote(ceo_dismissal)
# Preview the preprocessed training data.
# bake(new_data = NULL) returns the processed training set from the prepped
# recipe; it replaces the superseded juice().
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 8,988
## Columns: 15
## $ dismissal_dataset_id <fct> 12, 31, 43, 51, 63, 75, 76, 80, 99, 109, 110, 112…
## $ fyear <dbl> 1997, 1998, 2001, 1997, 1997, 1993, 2007, 1993, 1…
## $ co_per_rol <dbl> 1, 6, 11, 16, 22, 33, 34, 43, 60, 66, 68, 71, 73,…
## $ tenure_no_ceodb <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ max_tenure_ceodb <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ fyear_gone <dbl> 1998, 1998, 2002, 1997, 1998, 1995, 2007, 1993, 2…
## $ ceo_dismissal <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X5 <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0…
## $ departure_code_X6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1…
## $ departure_code_X8 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ departure_code_X9 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
Specify Model
# Boosted-tree classifier fitted with the xgboost engine.
# The number of trees is left as a placeholder to be tuned.
xgboost_spec <- boost_tree(mode = "classification", trees = tune()) %>%
  set_engine("xgboost")
# Bundle the preprocessing recipe and the model specification into one
# workflow so they are fitted and tuned together.
xgboost_workflow <- workflow(
  preprocessor = xgboost_rec,
  spec = xgboost_spec
)
Tune Hyperparameters
# Register a parallel backend before tuning.
doParallel::registerDoParallel()

# Seed the RNG so the tuning run is reproducible.
set.seed(65743)

# Evaluate 5 candidate values of the tuned parameter across the CV folds.
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)