library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyquant)

## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(correlationfunnel)

## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

departures %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

Goal: to find reasons for CEO departure in S&P 1500 firms from 2000 through 2018 with the use of classification model.

Exploring Data

data <- departures %>%
    
    # Clean ceo_dismissal
    filter(!is.na(ceo_dismissal)) %>%
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not dismissed")) %>%

    # Drop variables with too many missing values
    select(-interim_coceo, - still_there, - eight_ks) %>%
    
    # Treat dismissal_dataset_id
    mutate(dismissal_dataset_id = as.character(dismissal_dataset_id)) %>%
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
    
    # Delete year of 2997 in fyear_geon
    filter(fyear_gone < 2023) %>%
    
    # Drop redundant variables
    select(-departure_code, -fyear, -gvkey, - co_per_rol, - leftofc, - cik, - sources, - `_merge`) %>%
    
    # Drop high cardinality predictors
    select(-exec_fullname) %>%
    
    #Convert to factor the variables with a few unique values
    #mutate(across(tenure_no_ceodb:fyear_gone, factor)) %>%
    
    # Convert to factor all character variables, except the string variable - notes
    mutate(across(where(is.character), factor)) %>%
    
    # Keep notes as character
    mutate(notes = as.character(notes)) %>%
    filter(!is.na(notes))

skimr::skim(data)

Data summary
Name	data
Number of rows	7458
Number of columns	7
_______________________
Column type frequency:
character	1
factor	3
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
notes	0	1	5	3117	0	7448	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
dismissal_dataset_id	1	FALSE	7458	1: 1, 10: 1, 100: 1, 100: 1
coname	1	FALSE	3427	BAR: 8, CLA: 8, FED: 8, NTN: 8
ceo_dismissal	1	FALSE	2	not: 5976, dis: 1482

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
tenure_no_ceodb	1	1.03	0.16	1	1	1	1	3	▇▁▁▁▁
max_tenure_ceodb	1	1.05	0.23	1	1	1	1	4	▇▁▁▁▁
fyear_gone	1	2006.40	7.50	1980	2000	2006	2013	2021	▁▂▇▇▆

factors_vec <- data %>% select(dismissal_dataset_id, ceo_dismissal, tenure_no_ceodb, max_tenure_ceodb, fyear_gone) %>% names()

data_clean <- data %>%

mutate(across(all_of(factors_vec), as.factor))

data_clean %>% count(dismissal_dataset_id, ceo_dismissal, tenure_no_ceodb, max_tenure_ceodb, fyear_gone)

## # A tibble: 7,458 × 6
##    dismissal_dataset_id ceo_dismissal tenure_no_ceodb max_tenure_ceodb
##    <fct>                <fct>         <fct>           <fct>           
##  1 1                    not dismissed 1               1               
##  2 10                   not dismissed 1               1               
##  3 100                  dismissed     1               1               
##  4 1000                 not dismissed 1               1               
##  5 1001                 not dismissed 1               1               
##  6 1002                 not dismissed 1               1               
##  7 1004                 not dismissed 1               1               
##  8 1005                 not dismissed 1               1               
##  9 1007                 not dismissed 1               1               
## 10 1008                 dismissed     1               1               
## # ℹ 7,448 more rows
## # ℹ 2 more variables: fyear_gone <fct>, n <int>

data_clean %>%
    ggplot(aes(ceo_dismissal)) +
    geom_bar()

data %>%
    ggplot(aes(tenure_no_ceodb, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Tenure Number",
         x = NULL, y = "Proportion", fill = NULL)

data %>%
    ggplot(aes(max_tenure_ceodb, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Max Tenure Number",
         x = NULL, y = "Proportion", fill = NULL)

data %>%
    ggplot(aes(fyear_gone, fill = ceo_dismissal)) +
    geom_bar(position = "fill") +
    coord_flip() +
    scale_fill_tq() +
    labs(title = "Proportion of Dismissed CEOs by Companies",
         x = NULL, y = "Proportion", fill = NULL)

Correlationfunel

data_binarized_tbl <- data_clean %>%
    select(- dismissal_dataset_id, - notes) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 7,458
## Columns: 38
## $ coname__BARRICK_GOLD_CORP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `coname__-OTHER`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ ceo_dismissal__dismissed     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ ceo_dismissal__not_dismissed <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,…
## $ tenure_no_ceodb__1           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ tenure_no_ceodb__2           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `tenure_no_ceodb__-OTHER`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ max_tenure_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ max_tenure_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `max_tenure_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__1993             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,…
## $ fyear_gone__1994             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__1995             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ fyear_gone__1996             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__1997             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__1998             <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__1999             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2000             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2001             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,…
## $ fyear_gone__2002             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2003             <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2004             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2005             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2006             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2007             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ fyear_gone__2008             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2009             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2010             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2011             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2012             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2013             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2014             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2015             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2016             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2017             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2018             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ fyear_gone__2019             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `fyear_gone__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(ceo_dismissal__dismissed) 
    
data_corr_tbl

## # A tibble: 38 × 3
##    feature          bin           correlation
##    <fct>            <chr>               <dbl>
##  1 ceo_dismissal    dismissed          1     
##  2 ceo_dismissal    not_dismissed     -1     
##  3 max_tenure_ceodb 1                  0.0577
##  4 max_tenure_ceodb 2                 -0.0533
##  5 fyear_gone       1999              -0.0390
##  6 fyear_gone       2002               0.0378
##  7 fyear_gone       2003               0.0303
##  8 fyear_gone       2009               0.0292
##  9 fyear_gone       2008               0.0261
## 10 fyear_gone       1997              -0.0255
## # ℹ 28 more rows

# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split Data

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6      ✔ rsample      1.2.1 
## ✔ dials        1.3.0      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

set.seed(1234)
data_clean <- data_clean %>% sample_n(100)

departure_split <- initial_split(data, strata = ceo_dismissal)

departure_train <- training(departure_split)
departure_test <- testing(departure_split)

set.seed(2345)
departure_folds <- vfold_cv(departure_train, strata = ceo_dismissal)
departure_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5032/561]> Fold01
##  2 <split [5033/560]> Fold02
##  3 <split [5034/559]> Fold03
##  4 <split [5034/559]> Fold04
##  5 <split [5034/559]> Fold05
##  6 <split [5034/559]> Fold06
##  7 <split [5034/559]> Fold07
##  8 <split [5034/559]> Fold08
##  9 <split [5034/559]> Fold09
## 10 <split [5034/559]> Fold10

Prepossess Data

library(embed)
library(textrecipes)

departures_rec <- 
    recipe(ceo_dismissal ~ ., data = departure_train) %>%
    update_role(dismissal_dataset_id, new_role = "id") %>%
    step_tokenize(notes) %>%
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes) %>%
    step_other(coname) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_normalize(all_numeric_predictors())


departures_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 5,593
## Columns: 106
## $ dismissal_dataset_id    <fct> 85, 119, 198, 243, 244, 263, 280, 346, 348, 36…
## $ tenure_no_ceodb         <dbl> -0.1548142, -0.1548142, -0.1548142, -0.1548142…
## $ max_tenure_ceodb        <dbl> -0.2094581, -0.2094581, -0.2094581, -0.2094581…
## $ fyear_gone              <dbl> -1.12184755, -1.38925691, -1.12184755, -1.7903…
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1           <dbl> -0.2950765, -0.2950765, -0.2950765, -0.2950765…
## $ tfidf_notes_1997        <dbl> 6.516187, -0.201052, -0.201052, -0.201052, -0.…
## $ tfidf_notes_1998        <dbl> 6.7479775, -0.2043864, -0.2043864, -0.2043864,…
## $ tfidf_notes_1999        <dbl> -0.2136374, -0.2136374, -0.2136374, -0.2136374…
## $ tfidf_notes_a           <dbl> 0.72883491, -0.75969150, 0.01115253, -0.759691…
## $ tfidf_notes_acquisition <dbl> -0.2065473, -0.2065473, -0.2065473, -0.2065473…
## $ tfidf_notes_after       <dbl> -0.331960658, -0.331960658, -0.331960658, -0.3…
## $ tfidf_notes_agreement   <dbl> -0.2331309, -0.2331309, -0.2331309, -0.2331309…
## $ tfidf_notes_also        <dbl> -0.2788685, -0.2788685, 3.4235436, -0.2788685,…
## $ tfidf_notes_an          <dbl> -0.352248, -0.352248, -0.352248, -0.352248, -0…
## $ tfidf_notes_and         <dbl> 2.5622769, 1.1387855, -0.4189976, -0.1074410, …
## $ tfidf_notes_announced   <dbl> -0.4369369, -0.4369369, -0.4369369, -0.4369369…
## $ tfidf_notes_april       <dbl> -0.2129577, -0.2129577, -0.2129577, -0.2129577…
## $ tfidf_notes_as          <dbl> 0.86786744, -0.90923362, 0.01105086, 0.3791646…
## $ tfidf_notes_at          <dbl> -0.3534639, -0.3534639, -0.3534639, -0.3534639…
## $ tfidf_notes_based       <dbl> -0.2398946, -0.2398946, -0.2398946, -0.2398946…
## $ tfidf_notes_be          <dbl> -0.3088754, -0.3088754, -0.3088754, -0.3088754…
## $ tfidf_notes_been        <dbl> -0.3336100, 2.3193863, -0.3336100, -0.3336100,…
## $ tfidf_notes_billion     <dbl> -0.2349982, -0.2349982, -0.2349982, -0.2349982…
## $ tfidf_notes_board       <dbl> -0.6448506, -0.6448506, 0.7543015, 1.3139623, …
## $ tfidf_notes_business    <dbl> -0.1951818, -0.1951818, -0.1951818, -0.1951818…
## $ tfidf_notes_but         <dbl> 2.4489029, -0.2346642, -0.2346642, -0.2346642,…
## $ tfidf_notes_by          <dbl> 0.9123051, -0.3493111, 0.9573628, 1.4800323, 1…
## $ tfidf_notes_ceo         <dbl> 0.28400019, 0.31643114, 0.31643114, -0.6240664…
## $ tfidf_notes_chairman    <dbl> 0.48549130, -0.67386705, 1.72766097, -0.673867…
## $ tfidf_notes_chief       <dbl> -0.71538569, 0.66198325, 0.66198325, 1.2129308…
## $ tfidf_notes_co          <dbl> -0.2443545, -0.2443545, -0.2443545, -0.2443545…
## $ tfidf_notes_company     <dbl> 0.16692423, -0.73064103, 0.19898013, -0.730641…
## $ `tfidf_notes_company's` <dbl> -0.2872096, -0.2872096, -0.2872096, -0.2872096…
## $ tfidf_notes_corp        <dbl> -0.2696092, -0.2696092, -0.2696092, -0.2696092…
## $ tfidf_notes_corporation <dbl> -0.2761885, -0.2761885, -0.2761885, -0.2761885…
## $ tfidf_notes_december    <dbl> -0.2231103, -0.2231103, -0.2231103, -0.2231103…
## $ tfidf_notes_director    <dbl> -0.2867851, -0.2867851, -0.2867851, -0.2867851…
## $ tfidf_notes_directors   <dbl> -0.4256755, -0.4256755, -0.4256755, -0.4256755…
## $ tfidf_notes_down        <dbl> -0.3217453, -0.3217453, -0.3217453, -0.3217453…
## $ tfidf_notes_effective   <dbl> -0.3512119, -0.3512119, -0.3512119, -0.3512119…
## $ tfidf_notes_executive   <dbl> -0.7594792, 0.5492902, 0.5492902, 1.0727980, -…
## $ tfidf_notes_financial   <dbl> -0.2100234, -0.2100234, -0.2100234, -0.2100234…
## $ tfidf_notes_for         <dbl> -0.4940311, 0.6318143, -0.4940311, 1.0821525, …
## $ tfidf_notes_from        <dbl> -0.5094769, -0.5094769, 0.6420242, -0.5094769,…
## $ tfidf_notes_group       <dbl> -0.1939352, -0.1939352, -0.1939352, -0.1939352…
## $ tfidf_notes_had         <dbl> -0.3068995, 2.3130662, -0.3068995, -0.3068995,…
## $ tfidf_notes_has         <dbl> -0.4899052, -0.4899052, -0.4899052, -0.4899052…
## $ tfidf_notes_have        <dbl> -0.215825, -0.215825, -0.215825, 5.682400, -0.…
## $ tfidf_notes_he          <dbl> -0.55561285, 0.44534401, -0.55561285, 0.845726…
## $ tfidf_notes_his         <dbl> -0.5094355, -0.5094355, -0.5094355, 1.2057966,…
## $ tfidf_notes_in          <dbl> -0.034599664, -0.008215556, -0.773354696, 1.36…
## $ tfidf_notes_inc         <dbl> 0.97875511, -0.48128939, -0.48128939, -0.48128…
## $ tfidf_notes_into        <dbl> -0.2073987, -0.2073987, -0.2073987, -0.2073987…
## $ tfidf_notes_is          <dbl> -0.4026739, -0.4026739, 1.2314764, -0.4026739,…
## $ tfidf_notes_it          <dbl> -0.3720018, -0.3720018, -0.3720018, -0.3720018…
## $ tfidf_notes_its         <dbl> 1.4310791, -0.4171896, 1.4970887, 2.2628000, -…
## $ tfidf_notes_january     <dbl> 3.0373057, -0.2464586, -0.2464586, -0.2464586,…
## $ tfidf_notes_july        <dbl> -0.2307749, -0.2307749, -0.2307749, -0.2307749…
## $ tfidf_notes_june        <dbl> -0.1958329, -0.1958329, -0.1958329, -0.1958329…
## $ tfidf_notes_march       <dbl> -0.2169344, 3.0273821, -0.2169344, -0.2169344,…
## $ tfidf_notes_may         <dbl> -0.2493137, -0.2493137, -0.2493137, -0.2493137…
## $ tfidf_notes_member      <dbl> -0.2313919, -0.2313919, -0.2313919, -0.2313919…
## $ tfidf_notes_merger      <dbl> -0.2149803, -0.2149803, 1.8601565, -0.2149803,…
## $ tfidf_notes_million     <dbl> -0.2686936, 4.7614203, -0.2686936, -0.2686936,…
## $ tfidf_notes_more        <dbl> -0.2123602, -0.2123602, -0.2123602, -0.2123602…
## $ tfidf_notes_mr          <dbl> -0.5045803, -0.5045803, -0.5045803, -0.5045803…
## $ tfidf_notes_named       <dbl> -0.2162119, -0.2162119, 2.9053015, -0.2162119,…
## $ tfidf_notes_new         <dbl> 2.1897731, -0.2973036, -0.2973036, -0.2973036,…
## $ tfidf_notes_not         <dbl> -0.1851140, -0.1851140, -0.1851140, -0.1851140…
## $ tfidf_notes_of          <dbl> -1.23970661, 0.64360373, -1.23970661, -1.23970…
## $ tfidf_notes_officer     <dbl> -0.6319247, 0.8700193, -0.6319247, -0.6319247,…
## $ tfidf_notes_on          <dbl> 0.6306030, 1.8497044, -0.5072250, -0.5072250, …
## $ tfidf_notes_operating   <dbl> -0.2381417, -0.2381417, -0.2381417, -0.2381417…
## $ tfidf_notes_over        <dbl> -0.1667148, -0.1667148, -0.1667148, -0.1667148…
## $ tfidf_notes_position    <dbl> -0.2937833, -0.2937833, -0.2937833, -0.2937833…
## $ tfidf_notes_president   <dbl> 0.8420980, 2.3999264, -0.6118752, -0.6118752, …
## $ tfidf_notes_resigned    <dbl> -0.2608318, 1.5432959, -0.2608318, -0.2608318,…
## $ tfidf_notes_retire      <dbl> -0.2764948, -0.2764948, -0.2764948, -0.2764948…
## $ tfidf_notes_retired     <dbl> -0.2671846, -0.2671846, -0.2671846, -0.2671846…
## $ tfidf_notes_retirement  <dbl> -0.2066755, -0.2066755, -0.2066755, -0.2066755…
## $ tfidf_notes_role        <dbl> -0.2586828, -0.2586828, -0.2586828, -0.2586828…
## $ tfidf_notes_said        <dbl> -0.329281, -0.329281, 2.204141, -0.329281, -0.…
## $ tfidf_notes_served      <dbl> -0.3114722, -0.3114722, -0.3114722, -0.3114722…
## $ tfidf_notes_share       <dbl> -0.2007568, -0.2007568, -0.2007568, -0.2007568…
## $ tfidf_notes_since       <dbl> -0.3457923, 4.8290118, -0.3457923, -0.3457923,…
## $ tfidf_notes_stepped     <dbl> -0.2053318, -0.2053318, -0.2053318, -0.2053318…
## $ tfidf_notes_stock       <dbl> -0.2519306, -0.2519306, -0.2519306, -0.2519306…
## $ tfidf_notes_that        <dbl> -0.5640468, -0.5640468, -0.5640468, 3.4892569,…
## $ tfidf_notes_the         <dbl> -0.990959727, -0.971061238, 0.760107288, 0.875…
## $ tfidf_notes_this        <dbl> -0.2591261, -0.2591261, -0.2591261, -0.2591261…
## $ tfidf_notes_time        <dbl> -0.2593110, -0.2593110, -0.2593110, -0.2593110…
## $ tfidf_notes_to          <dbl> 0.49119169, -0.80877209, -0.80877209, 0.133701…
## $ tfidf_notes_today       <dbl> -0.3032347, -0.3032347, -0.3032347, -0.3032347…
## $ tfidf_notes_until       <dbl> -0.3043194, -0.3043194, -0.3043194, -0.3043194…
## $ tfidf_notes_vice        <dbl> -0.2383916, -0.2383916, 3.6132587, -0.2383916,…
## $ tfidf_notes_was         <dbl> -0.5861012, -0.5861012, 1.0919530, 0.5885367, …
## $ tfidf_notes_when        <dbl> -0.2647481, -0.2647481, -0.2647481, -0.2647481…
## $ tfidf_notes_which       <dbl> -0.2871059, 2.8594839, -0.2871059, -0.2871059,…
## $ tfidf_notes_who         <dbl> -0.2989841, -0.2989841, 2.0182935, -0.2989841,…
## $ tfidf_notes_will        <dbl> -0.4906758, -0.4906758, 0.8880736, -0.4906758,…
## $ tfidf_notes_with        <dbl> -0.42325915, 0.79543980, 0.79543980, -0.423259…
## $ tfidf_notes_would       <dbl> -0.2073076, -0.2073076, -0.2073076, -0.2073076…
## $ tfidf_notes_year        <dbl> -0.3219871, -0.3219871, -0.3219871, -0.3219871…
## $ tfidf_notes_years       <dbl> 1.1705187, -0.2904182, -0.2904182, -0.2904182,…
## $ coname_other            <dbl> 0.03784379, 0.03784379, 0.03784379, 0.03784379…

Specify model

library(usemodels)
usemodels::use_xgboost(ceo_dismissal ~ ., data = departure_train)

## xgboost_recipe <- 
##   recipe(formula = ceo_dismissal ~ ., data = departure_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(44593)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

xgboost_recipe <- 
  recipe(formula = ceo_dismissal ~ ., data = departure_train) %>% 
  step_tokenfilter(ceo_dismissal, max_tokens = 100) %>%
  step_zv(all_predictors()) 

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec)

prep(departures_rec) %>%
  tidy(number = 1)

## # A tibble: 1 × 3
##   terms value id            
##   <chr> <chr> <chr>         
## 1 notes words tokenize_YToSK

Tune Hyperparameters

xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(departures_rec, xgb_spec)

library(finetune)
tree_grid <- grid_regular(trees(),
                          tree_depth(),
                          levels = 5)

doParallel::registerDoParallel()

set.seed(94354)
xgb_rs <- tune_grid(
  xgb_wf,
  resamples = departure_folds,
  grid = 5,
  control = control_grid(verbose = TRUE, save_pred = TRUE)
)

## i Creating pre-processing data to finalize unknown parameter: mtry

xgb_rs

## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .metrics          .notes           .predictions
##    <list>             <chr>  <list>            <list>           <list>      
##  1 <split [5032/561]> Fold01 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  2 <split [5033/560]> Fold02 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  3 <split [5034/559]> Fold03 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  4 <split [5034/559]> Fold04 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  5 <split [5034/559]> Fold05 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  6 <split [5034/559]> Fold06 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  7 <split [5034/559]> Fold07 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  8 <split [5034/559]> Fold08 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
##  9 <split [5034/559]> Fold09 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>    
## 10 <split [5034/559]> Fold10 <tibble [15 × 7]> <tibble [0 × 3]> <tibble>

Evaluation of Model

collect_metrics(xgb_rs)

## # A tibble: 15 × 9
##     mtry trees min_n .metric     .estimator  mean     n  std_err .config        
##    <int> <int> <int> <chr>       <chr>      <dbl> <int>    <dbl> <chr>          
##  1    18  1266    16 accuracy    binary     0.846    10 0.00159  Preprocessor1_…
##  2    18  1266    16 brier_class binary     0.110    10 0.000893 Preprocessor1_…
##  3    18  1266    16 roc_auc     binary     0.861    10 0.00376  Preprocessor1_…
##  4    32   637    32 accuracy    binary     0.832    10 0.00153  Preprocessor1_…
##  5    32   637    32 brier_class binary     0.115    10 0.000949 Preprocessor1_…
##  6    32   637    32 roc_auc     binary     0.851    10 0.00468  Preprocessor1_…
##  7    59  1023    23 accuracy    binary     0.841    10 0.00196  Preprocessor1_…
##  8    59  1023    23 brier_class binary     0.112    10 0.000924 Preprocessor1_…
##  9    59  1023    23 roc_auc     binary     0.859    10 0.00418  Preprocessor1_…
## 10    71  1885     8 accuracy    binary     0.845    10 0.00235  Preprocessor1_…
## 11    71  1885     8 brier_class binary     0.109    10 0.00101  Preprocessor1_…
## 12    71  1885     8 roc_auc     binary     0.862    10 0.00358  Preprocessor1_…
## 13    91   242    36 accuracy    binary     0.815    10 0.00192  Preprocessor1_…
## 14    91   242    36 brier_class binary     0.128    10 0.000886 Preprocessor1_…
## 15    91   242    36 roc_auc     binary     0.825    10 0.00560  Preprocessor1_…

xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "accuracy")) %>%
  last_fit(departure_split)

xgb_last

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [5593/1865]> train/test split <tibble> <tibble> <tibble>     <workflow>

collect_metrics(xgb_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.839 Preprocessor1_Model1
## 2 roc_auc     binary         0.851 Preprocessor1_Model1
## 3 brier_class binary         0.114 Preprocessor1_Model1

collect_predictions(xgb_last) %>%
    conf_mat(ceo_dismissal, .pred_class) %>%
    autoplot()

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgb_last %>%
  extract_fit_engine() %>%
  vip()

Conclusion

The previous model had an accuracy of 0.839 and a AUC of 0.851

Feature Transformation: normalize numeric data. It resulted in : NO IMPROVEMENT
Feature Transformation: NO IMPROVEMENT
Feature Transformation: NO IMPROVEMENT

Apply 7

Julius Mondschein

2024-10-10