CEO Departures: The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. Build a classification model to predict CEO departure (ceo_dismissal). Use the departures dataset.
library(tidyverse)
library(tidyquant)
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
departures %>% skimr::skim()
Name | Piped data |
Number of rows | 9423 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 10 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
Notes about the data
data <- departures %>%
# Clean ceo_dismissal
filter(!is.na(ceo_dismissal)) %>%
mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not dismissed")) %>%
# Drop variables with too many missing values
select(-interim_coceo, - still_there, - eight_ks) %>%
# Treat dismissal_dataset_id
mutate(dismissal_dataset_id = as.character(dismissal_dataset_id)) %>%
distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
# Delete year of 2997 in fyear_geon
filter(fyear_gone < 2023) %>%
# Drop redundant variables
select(-departure_code, -fyear, -gvkey, - co_per_rol, - leftofc, - cik, - sources, - `_merge`) %>%
# Drop high cardinality predictors
select(-coname, -exec_fullname) %>%
# Convert to factor the variables with a few unique values
mutate(across(tenure_no_ceodb:fyear_gone, factor)) %>%
# Convert to factor all character variables, except the string variable - notes
mutate(across(where(is.character), factor)) %>%
# Keep notes as character
mutate(notes = as.character(notes)) %>%
filter(!is.na(notes))
*** Identify variables with correlation with the target variable.***
top_companies <- data %>%
count(coname) %>%
slice_max(n, n = 10) %>% pull(coname)
data %>%
filter(coname %in% top_companies) %>%
ggplot(aes(coname, fill = ceo_dismissal)) +
geom_bar(position = "fill") +
coord_flip() +
scale_fill_tq() +
labs(title = "Proportion of Dismissed CEOs by Companies",
x = NULL, y = "Proportion", fill = NULL)
top_executives <- data %>%
count(exec_fullname) %>%
slice_max(n, n = 10) %>% pull(exec_fullname)
data %>%
filter(exec_fullname %in% top_executives) %>%
ggplot(aes(exec_fullname, fill = ceo_dismissal)) +
geom_bar(position = "fill") +
coord_flip() +
scale_fill_tq() +
labs(title = "Proportion of Dismissed CEOs by Executives",
x = NULL, y = "Proportion", fill = NULL)
For CEOs who return, this value should capture whether this is the first or second time in office.
data %>%
ggplot(aes(tenure_no_ceodb, fill = ceo_dismissal)) +
geom_bar(position = "fill") +
coord_flip() +
scale_fill_tq() +
labs(title = "Proportion of Dismissed CEOs by Tenure Number",
x = NULL, y = "Proportion", fill = NULL)
For this CEO, how many times did s/he serve as CEO
data %>%
ggplot(aes(max_tenure_ceodb, fill = ceo_dismissal)) +
geom_bar(position = "fill") +
coord_flip() +
scale_fill_tq() +
labs(title = "Proportion of Dismissed CEOs by Max Tenure Number",
x = NULL, y = "Proportion", fill = NULL)
fiscal year of the CEO’s effective departure date
data %>%
ggplot(aes(fyear_gone, fill = ceo_dismissal)) +
geom_bar(position = "fill") +
coord_flip() +
scale_fill_tq() +
labs(title = "Proportion of Dismissed CEOs by Companies",
x = NULL, y = "Proportion", fill = NULL)
Long-form description and justification for the coding scheme assignment.
library(tidytext) # for tokenization
library(tidylo) # for bind_log_odds
log_odds_weighted_tb <- data %>%
# Extract most frequent words from notes
unnest_tokens(word, notes) %>%
anti_join(stop_words) %>%
count(ceo_dismissal, word) %>%
# slice_max(order_by = n, n = 100) %>%
# Calculate how likely each word show up from dismissed CEO's notes
bind_log_odds(ceo_dismissal, word, n) %>%
arrange(-log_odds_weighted)
# Select the category
log_odds_weighted_tb %>%
filter(ceo_dismissal == "dismissed") %>%
slice_max(order_by = n, n = 20) %>%
ggplot(aes(log_odds_weighted, fct_reorder(word, log_odds_weighted))) +
geom_col(fill = "midnightblue") +
labs(title = "The most likely word in the dismissed CEO's departure notes",
y = "Words from Notes", x = "Log Odds Weighted")
log_odds_weighted_tb %>%
filter(ceo_dismissal == "not dismissed") %>%
slice_max(order_by = n, n = 20) %>%
ggplot(aes(log_odds_weighted, fct_reorder(word, log_odds_weighted))) +
geom_col(fill = "midnightblue") +
labs(title = "The most likely word in the dismissed CEO's departure notes",
y = "Words from Notes", x = "Log Odds Weighted")
# data <- sample_n(data, 100)
library(tidymodels)
set.seed(123)
departure_split <- initial_split(data, strata = ceo_dismissal)
departure_train <- training(departure_split)
departure_test <- testing(departure_split)
set.seed(234)
departure_folds <- vfold_cv(departure_train, strata = ceo_dismissal)
departure_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5032/561]> Fold01
## 2 <split [5033/560]> Fold02
## 3 <split [5034/559]> Fold03
## 4 <split [5034/559]> Fold04
## 5 <split [5034/559]> Fold05
## 6 <split [5034/559]> Fold06
## 7 <split [5034/559]> Fold07
## 8 <split [5034/559]> Fold08
## 9 <split [5034/559]> Fold09
## 10 <split [5034/559]> Fold10
library(embed)
library(textrecipes)
departures_rec <-
recipe(ceo_dismissal ~ ., data = departure_train) %>%
update_role(dismissal_dataset_id, new_role = "id") %>%
step_tokenize(notes) %>%
step_stopwords(notes) %>%
step_tokenfilter(notes, max_tokens = 100) %>%
step_tfidf(notes) %>%
# step_lencode_glm(coname, outcome = vars(ceo_dismissal)) %>%
step_dummy(all_nominal_predictors())
departures_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 5,593
## Columns: 140
## $ dismissal_dataset_id <fct> 13, 78, 85, 88, 119, 143, 162, 198, 243, 244, …
## $ ceo_dismissal <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_1995 <dbl> 0.0000000, 0.0000000, 0.1790195, 0.0000000, 0.…
## $ tfidf_notes_1996 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_1997 <dbl> 0.0000000, 0.0000000, 0.3477482, 0.0000000, 0.…
## $ tfidf_notes_1998 <dbl> 0.0000000, 0.0000000, 0.3414034, 0.0000000, 0.…
## $ tfidf_notes_1999 <dbl> 0.0000000, 0.2824006, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_2000 <dbl> 0.0000000, 0.1474853, 0.1735121, 0.0000000, 0.…
## $ tfidf_notes_2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2006 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_3 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_30 <dbl> 0.0000000, 0.1530249, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_31 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_acquired <dbl> 0.0000000, 0.1395012, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_agreement <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_also <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_announced <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_appointed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_april <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_august <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_bank <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_based <dbl> 0.00000000, 0.00000000, 0.00000000, 0.28791778…
## $ tfidf_notes_became <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_become <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_billion <dbl> 0.0000000, 0.1262248, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_board <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_ceo <dbl> 0.00000000, 0.12122123, 0.07130661, 0.00000000…
## $ tfidf_notes_chairman <dbl> 0.00000000, 0.00000000, 0.06929626, 0.13089294…
## $ tfidf_notes_chief <dbl> 0.00000000, 0.00000000, 0.00000000, 0.13138957…
## $ tfidf_notes_co <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_companies <dbl> 0.0000000, 0.0000000, 0.1844912, 0.0000000, 0.…
## $ tfidf_notes_company <dbl> 0.21038038, 0.10519019, 0.06187658, 0.23375598…
## $ `tfidf_notes_company's` <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ `tfidf_notes_company’s` <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_continue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corporation <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_december <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_departure <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_director <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_directors <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_effective <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_elected <dbl> 0.0000000, 0.0000000, 0.1847131, 0.0000000, 0.…
## $ tfidf_notes_end <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_energy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_executive <dbl> 0.00000000, 0.00000000, 0.00000000, 0.12437997…
## $ tfidf_notes_february <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_financial <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_following <dbl> 0.0000000, 0.1365265, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_former <dbl> 0.0000000, 0.1525080, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_group <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_inc <dbl> 0.00000000, 0.07713483, 0.09074686, 0.00000000…
## $ tfidf_notes_interim <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_j <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_january <dbl> 0.0000000, 0.1319778, 0.1552680, 0.0000000, 0.…
## $ tfidf_notes_john <dbl> 0.0000000, 0.0000000, 0.1812667, 0.0000000, 0.…
## $ tfidf_notes_july <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_june <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_left <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_management <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_march <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_may <dbl> 0.5045218, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_member <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_merger <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_million <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_mr <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_named <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_new <dbl> 0.4443834, 0.0000000, 0.1307010, 0.2468797, 0.…
## $ tfidf_notes_november <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_october <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_officer <dbl> 0.00000000, 0.06493599, 0.00000000, 0.00000000…
## $ tfidf_notes_one <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_operating <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_position <dbl> 0.0000000, 0.1137372, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_president <dbl> 0.00000000, 0.00000000, 0.07761948, 0.00000000…
## $ tfidf_notes_remain <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_resignation <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_resigned <dbl> 0.4499866, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retire <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retired <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retirement <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_role <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_said <dbl> 0.000000, 0.000000, 0.000000, 0.236835, 0.0000…
## $ tfidf_notes_september <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_serve <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_served <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_share <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_shares <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_since <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_step <dbl> 0.0000000, 0.0000000, 0.0000000, 0.3416076, 0.…
## $ tfidf_notes_stepped <dbl> 0.0000000, 0.1318699, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_stock <dbl> 0.0000000, 0.1271799, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_time <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_today <dbl> 0.4586292, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_two <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_vice <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_year <dbl> 0.00000000, 0.10163478, 0.00000000, 0.00000000…
## $ tfidf_notes_years <dbl> 0.00000000, 0.10036738, 0.11807927, 0.00000000…
## $ tenure_no_ceodb_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0…
## $ fyear_gone_X1994 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1995 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1996 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1997 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X1998 <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ fyear_gone_X2001 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear_gone_X2002 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X2004 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2006 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
prep(departures_rec) %>%
tidy(number = 1)
prep(departures_rec) %>%
tidy(number = 1) %>%
filter(level == "..new")
xgb_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
mtry = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow(departures_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- tune_grid(
xgb_wf,
resamples = departure_folds,
grid = 5,
control = control_grid(verbose = TRUE, save_pred = TRUE)
)
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [5032/561]> Fold01 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 2 <split [5033/560]> Fold02 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 3 <split [5034/559]> Fold03 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 4 <split [5034/559]> Fold04 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 5 <split [5034/559]> Fold05 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 6 <split [5034/559]> Fold06 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 7 <split [5034/559]> Fold07 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 8 <split [5034/559]> Fold08 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 9 <split [5034/559]> Fold09 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 10 <split [5034/559]> Fold10 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
collect_metrics(xgb_rs)
## # A tibble: 10 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 28 606 28 accuracy binary 0.837 10 0.00407 Preprocessor1_Mode…
## 2 28 606 28 roc_auc binary 0.854 10 0.00665 Preprocessor1_Mode…
## 3 32 1196 24 accuracy binary 0.844 10 0.00351 Preprocessor1_Mode…
## 4 32 1196 24 roc_auc binary 0.861 10 0.00645 Preprocessor1_Mode…
## 5 56 1331 37 accuracy binary 0.842 10 0.00319 Preprocessor1_Mode…
## 6 56 1331 37 roc_auc binary 0.852 10 0.00647 Preprocessor1_Mode…
## 7 91 1641 7 accuracy binary 0.853 10 0.00443 Preprocessor1_Mode…
## 8 91 1641 7 roc_auc binary 0.868 10 0.00639 Preprocessor1_Mode…
## 9 128 194 13 accuracy binary 0.826 10 0.00422 Preprocessor1_Mode…
## 10 128 194 13 roc_auc binary 0.824 10 0.00945 Preprocessor1_Mode…
# collect_predictions(xgb_rs) %>%
# group_by(id) %>%
# roc_curve(ceo_dismissal, .pred_class) %>%
# autoplot()
# conf_mat_resampled(xgb_rs, tidy = FALSE) %>%
# autoplot()
xgb_last <- xgb_wf %>%
finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
last_fit(departure_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [5593/1865]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(xgb_last)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.852 Preprocessor1_Model1
## 2 roc_auc binary 0.868 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
conf_mat(ceo_dismissal, .pred_class) %>%
autoplot()
library(vip)
xgb_last %>%
extract_fit_engine() %>%
vip()