Import data

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv')

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore Data

skimr::skim(departures)

Data summary
Name	departures
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

Issues with the data

Missing Values *interim_coceo, still_there, notes (string variable), sources, eight_ks, departure_code, ceo_dismissal, fyear_gone, cik

Converting Numeric Variables to Factors * tenure_no_ceodb, max_tenure_ceodb, fyear_gone

Zero variance variables * _merge, gvkey, dismissal_dataset_id

Character Variables * coname, exec_fullname, sources, eight_ks, notes (string variable and not factor)

Unbalanced Target Variable ceo_dismissal #adress in recipe section step_smoke function

Handling ID Variables * dismissal_dataset_id #dataset primary key

Explore data

# Clean data
departures_clean <- departures %>%
    
    # Clean the target variable
    filter(!is.na(ceo_dismissal)) %>%
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not_dis")) %>%
    mutate(ceo_dismissal = as.factor(ceo_dismissal)) %>%
    
    # Remove variables with too many missing values
    select(-c(interim_coceo, still_there, eight_ks))%>%
    
    # Remove irrelevant variables
    select(-`_merge`, -sources) %>%
               
    # Remove variables with info that only becomes 
    select(-departure_code) %>%
    
    # Remove redundant variables 
    select(-c(gvkey, cik, co_per_rol)) %>% #need leftofc as date variable later
    
    #Remove duplicated in dismissal_dataset_id our id variable
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
    
    #Remove 2997 in fyear_gone
    filter(fyear_gone < 2025) %>%
    
    # Convert factors that are incorrectly imported as numeric variables
    mutate(across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor)) %>%
    mutate(across(where(is.character), as.factor)) %>%
    
    mutate(notes = as.character(notes))
    
skimr::skim(departures_clean)

Data summary
Name	departures_clean
Number of rows	7475
Number of columns	10
_______________________
Column type frequency:
character	1
factor	6
numeric	2
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
notes	17	1	5	3117	0	7448	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
coname	1	FALSE	3427	BAR: 8, CLA: 8, FED: 8, GRE: 8
exec_fullname	1	FALSE	6975	Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal	1	FALSE	2	not: 5992, dis: 1483
tenure_no_ceodb	1	FALSE	3	1: 7289, 2: 179, 3: 7
max_tenure_ceodb	1	FALSE	4	1: 7138, 2: 319, 3: 15, 4: 3
fyear_gone	1	FALSE	34	200: 379, 199: 351, 200: 334, 200: 321

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1	5570.32	25757.33	1	2175.5	4326	6579.5	559044	▇▁▁▁▁
fyear	0	1	2005.61	7.45	1987	1999.0	2006	2012.0	2020	▁▇▆▇▆

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	0	1	1981-01-01	2021-12-01	2006-11-15	3576

Explore data

departures_clean %>% count(ceo_dismissal)

## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <fct>         <int>
## 1 dismissed      1483
## 2 not_dis        5992

departures_clean %>%
    ggplot(aes(fyear)) + 
    geom_bar() +
    labs(title = "CEO Dismissal Count", x = "CEO Dismissal", y = "Count")

CEO Dismissal or Tenure

departures_clean %>%
  count(ceo_dismissal, tenure_no_ceodb) %>%
  ggplot(mapping = aes(x = ceo_dismissal, y = tenure_no_ceodb)) +
  geom_tile(mapping = aes(fill = n)) +
  labs(title = "CEO Dismissal or Tenure", x = "CEO Dismissal", y = "CEO Tenure")

Correlation Plot

 departures_clean <- departures_clean 

# Step 1: Binarize the data
data_binarized <- departures_clean %>%
    select(-notes, -dismissal_dataset_id, -leftofc) %>%
    binarize()

data_binarized %>% glimpse()

## Rows: 7,475
## Columns: 44
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `fyear__-Inf_1999`          <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ fyear__1999_2006            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear__2006_2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear__2012_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Step 2: Correlation
data_correlation_dismissed <- data_binarized %>%
    correlate(ceo_dismissal__dismissed)

data_correlation_notdis <- data_binarized %>%
    correlate(ceo_dismissal__not_dis)

data_correlation_dismissed

## # A tibble: 44 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 ceo_dismissal    dismissed      1     
##  2 ceo_dismissal    not_dis       -1     
##  3 fyear            -Inf_1999     -0.0774
##  4 max_tenure_ceodb 1              0.0580
##  5 max_tenure_ceodb 2             -0.0536
##  6 fyear_gone       1999          -0.0391
##  7 fyear_gone       2002           0.0374
##  8 fyear            1999_2006      0.0345
##  9 fyear            2006_2012      0.0301
## 10 fyear_gone       2003           0.0296
## # ℹ 34 more rows

# Step 3: Plot
data_correlation_dismissed %>% 
    correlationfunnel::plot_correlation_funnel() +
    labs(title = "Correlation Funnel for CEO Dismissal")

## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model building

Split data

library(tidymodels)

# Set seed for reproducibility
set.seed(1234)
data_clean <- departures_clean #%>% group_by(ceo_dismissal)%>% sample_n(100) %>% ungroup

# Split the data into training and testing sets
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

# Create cross-validation sets for the training data
data_cv <- vfold_cv(data_train, strata = ceo_dismissal)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5044/562]> Fold01
##  2 <split [5044/562]> Fold02
##  3 <split [5045/561]> Fold03
##  4 <split [5045/561]> Fold04
##  5 <split [5046/560]> Fold05
##  6 <split [5046/560]> Fold06
##  7 <split [5046/560]> Fold07
##  8 <split [5046/560]> Fold08
##  9 <split [5046/560]> Fold09
## 10 <split [5046/560]> Fold10

Preprocess data

#departures_clean <- departures_clean %>%
#mutate(leftofc = as.Date(leftofc, format = "%Y-%m-%d"))
    
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
    update_role(dismissal_dataset_id, new_role = "ID")  %>%
    step_other(coname, exec_fullname, threshold = 0.05) %>%
    step_tokenize(notes) %>%
    step_tokenfilter(notes, max_tokens = 100) %>%
    step_tfidf(notes)  %>%
    step_date(leftofc, features = c("year", "month", "doy"), keep_original_cols = FALSE) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_smote(ceo_dismissal)

xgboost_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 8,988
## Columns: 156
## $ dismissal_dataset_id    <dbl> 84, 85, 119, 162, 243, 244, 263, 280, 300, 346…
## $ fyear                   <dbl> 1993, 1998, 1995, 2004, 1993, 1995, 1993, 2002…
## $ ceo_dismissal           <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1           <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_1998        <dbl> 0.0000000, 0.2084846, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_2000        <dbl> 0.00000000, 0.10688168, 0.00000000, 0.07125445…
## $ tfidf_notes_a           <dbl> 0.08640627, 0.06994794, 0.00000000, 0.06994794…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_after       <dbl> 0.05080277, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_agreement   <dbl> 0.07913465, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_also        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_an          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_and         <dbl> 0.12289881, 0.14923426, 0.08954056, 0.07959161…
## $ tfidf_notes_announced   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_april       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_as          <dbl> 0.05605974, 0.06807255, 0.00000000, 0.02269085…
## $ tfidf_notes_at          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_based       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_be          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_been        <dbl> 0.00000000, 0.00000000, 0.07477204, 0.04984803…
## $ tfidf_notes_billion     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_board       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_business    <dbl> 0.07880085, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_but         <dbl> 0.00000000, 0.09133601, 0.00000000, 0.00000000…
## $ tfidf_notes_by          <dbl> 0.05068340, 0.06154413, 0.00000000, 0.04102942…
## $ tfidf_notes_ceo         <dbl> 0.03545019, 0.04304666, 0.04304666, 0.00000000…
## $ tfidf_notes_chairman    <dbl> 0.06958794, 0.04224982, 0.00000000, 0.00000000…
## $ tfidf_notes_chief       <dbl> 0.00000000, 0.00000000, 0.04244088, 0.00000000…
## $ tfidf_notes_co          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_company     <dbl> 0.00000000, 0.03753689, 0.00000000, 0.02502459…
## $ `tfidf_notes_company's` <dbl> 0.06893837, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corp        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_corporation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_december    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_director    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_directors   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_down        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_effective   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_executive   <dbl> 0.00000000, 0.00000000, 0.04024952, 0.00000000…
## $ tfidf_notes_financial   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06587907…
## $ tfidf_notes_following   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_for         <dbl> 0.00000000, 0.00000000, 0.04797170, 0.03198113…
## $ tfidf_notes_from        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03216000…
## $ tfidf_notes_group       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_had         <dbl> 0.00000000, 0.00000000, 0.07994803, 0.05329868…
## $ tfidf_notes_has         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03766112…
## $ tfidf_notes_have        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06773914…
## $ tfidf_notes_he          <dbl> 0.00000000, 0.00000000, 0.04622153, 0.03081435…
## $ tfidf_notes_his         <dbl> 0.08380476, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_in          <dbl> 0.11624220, 0.03528781, 0.03528781, 0.04705041…
## $ tfidf_notes_inc         <dbl> 0.00000000, 0.05536558, 0.00000000, 0.00000000…
## $ tfidf_notes_into        <dbl> 0.07539130, 0.00000000, 0.00000000, 0.06103106…
## $ tfidf_notes_is          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_it          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04521652…
## $ tfidf_notes_its         <dbl> 0.00000000, 0.06183488, 0.00000000, 0.00000000…
## $ tfidf_notes_january     <dbl> 0.00000000, 0.09481317, 0.00000000, 0.00000000…
## $ tfidf_notes_july        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_june        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_march       <dbl> 0.0000000, 0.0000000, 0.0998133, 0.0000000, 0.…
## $ tfidf_notes_may         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06038407…
## $ tfidf_notes_merger      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_million     <dbl> 0.00000000, 0.00000000, 0.17517900, 0.00000000…
## $ tfidf_notes_mr          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.07298742…
## $ tfidf_notes_named       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_new         <dbl> 0.00000000, 0.07841573, 0.00000000, 0.00000000…
## $ tfidf_notes_not         <dbl> 0.07338546, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_of          <dbl> 0.02374139, 0.00000000, 0.08648649, 0.03843844…
## $ tfidf_notes_officer     <dbl> 0.00000000, 0.00000000, 0.04670480, 0.00000000…
## $ tfidf_notes_on          <dbl> 0.00000000, 0.04741923, 0.09483847, 0.00000000…
## $ tfidf_notes_operating   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_or          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_over        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_position    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_president   <dbl> 0.00000000, 0.04747122, 0.09494245, 0.00000000…
## $ tfidf_notes_resigned    <dbl> 0.00000000, 0.00000000, 0.08023561, 0.00000000…
## $ tfidf_notes_retire      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retired     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retirement  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_role        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_said        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_served      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_share       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_since       <dbl> 0.0000000, 0.0000000, 0.1489905, 0.0000000, 0.…
## $ tfidf_notes_stepped     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_stock       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_that        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03241431…
## $ tfidf_notes_the         <dbl> 0.04501321, 0.02732945, 0.02732945, 0.10931780…
## $ tfidf_notes_this        <dbl> 0.06810383, 0.00000000, 0.00000000, 0.05513167…
## $ tfidf_notes_time        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_to          <dbl> 0.02731539, 0.06633738, 0.00000000, 0.04422492…
## $ tfidf_notes_today       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_until       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_vice        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_was         <dbl> 0.03493467, 0.00000000, 0.00000000, 0.02828045…
## $ tfidf_notes_when        <dbl> 0.07311918, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_which       <dbl> 0.00000000, 0.00000000, 0.08259285, 0.11012380…
## $ tfidf_notes_who         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_will        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_with        <dbl> 0.04367090, 0.00000000, 0.05302895, 0.00000000…
## $ tfidf_notes_would       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_year        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04842102…
## $ tfidf_notes_years       <dbl> 0.00000000, 0.07172717, 0.00000000, 0.00000000…
## $ leftofc_year            <dbl> 1993, 1998, 1996, 2005, 1993, 1996, 1993, 2002…
## $ leftofc_doy             <dbl> 212, 189, 74, 73, 152, 33, 181, 273, 182, 304,…
## $ coname_other            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ exec_fullname_other     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb_X2      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993        <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X1994        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ fyear_gone_X1995        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear_gone_X1996        <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ fyear_gone_X1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ fyear_gone_X1998        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2001        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2002        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X2004        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2019        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Feb       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Mar       <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Apr       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_May       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Jun       <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Jul       <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ leftofc_month_Aug       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Sep       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ leftofc_month_Oct       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1…
## $ leftofc_month_Nov       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Dec       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…

Specify model

xgboost_spec <- 
  boost_tree(trees = tune(), tree_depth = tune(), min_n = tune(),   learn_rate = tune()) %>%  
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec)

Tune Hyperparameters

doParallel::registerDoParallel()

set.seed(65743)
xgboost_tune <- 
    tune_grid(xgboost_workflow,
              resamples = data_cv,
              grid = 5,
              control = control_grid(save_pred = TRUE))

Model Evaluation

Identify Optimal Values for Hyperparameters

collect_metrics(xgboost_tune)

## # A tibble: 15 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1   885     4          1    0.0670  accuracy    binary     0.811    10 0.00591
##  2   885     4          1    0.0670  brier_class binary     0.133    10 0.00307
##  3   885     4          1    0.0670  roc_auc     binary     0.829    10 0.00883
##  4   541    16          9    0.0266  accuracy    binary     0.836    10 0.00375
##  5   541    16          9    0.0266  brier_class binary     0.117    10 0.00284
##  6   541    16          9    0.0266  roc_auc     binary     0.855    10 0.00799
##  7   325    18         10    0.00276 accuracy    binary     0.780    10 0.00633
##  8   325    18         10    0.00276 brier_class binary     0.170    10 0.00191
##  9   325    18         10    0.00276 roc_auc     binary     0.786    10 0.0112 
## 10  1754    32         13    0.00495 accuracy    binary     0.829    10 0.00379
## 11  1754    32         13    0.00495 brier_class binary     0.121    10 0.00279
## 12  1754    32         13    0.00495 roc_auc     binary     0.848    10 0.00766
## 13  1312    38          7    0.141   accuracy    binary     0.826    10 0.00425
## 14  1312    38          7    0.141   brier_class binary     0.129    10 0.00343
## 15  1312    38          7    0.141   roc_auc     binary     0.832    10 0.00847
## # ℹ 1 more variable: .config <chr>

collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(ceo_dismissal, .pred_dismissed) %>%
    autoplot()

Fit the Model for the Last Time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.837 Preprocessor1_Model1
## 2 roc_auc     binary         0.853 Preprocessor1_Model1
## 3 brier_class binary         0.120 Preprocessor1_Model1

collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
    autoplot()

Variable Importance

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Conclusion

The previous model had accuracy of 0.838 and AUC of 0.856

Feature transformation: Date function transformation. Resulted in almost the same result, but slightly worsen with accuracy of 0.837 and AUC 0.0.853.
Feature transformation: removal #%>% group_by(ceo_dismissal)%>% sample_n(100) %>% ungroup: No improvment accuracy, worsen result on full data set.

*Feature slection: PCA didn’t make an improvement.

Apply 8

Ronja Dahlin

2025-04-01

Import data

Explore Data

Issues with the data

Explore data

Explore data

CEO Dismissal or Tenure

Correlation Plot

Model building

Split data

Preprocess data

Tune Hyperparameters

Model Evaluation

Identify Optimal Values for Hyperparameters

Fit the Model for the Last Time

Variable Importance

Conclusion