The dataset documents the reasons for CEO departures in S&P 1500 firms from 2000 through 2018. The goal is to predict whether a CEO was dismissed (ceo_dismissal) using the departures dataset.

Import Data

# Load the TidyTuesday (2021-04-27) departures CSV directly from GitHub
data <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv"
)

Explore Data

# Summarise every raw column (type, missingness, distribution) before cleaning
skimr::skim(data)

Data summary
Name	data
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	9
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
leftofc	1802	0.81	20	20	3627
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
X_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Issues With The Data

Missing Values

interim_coceo (97% missing)
still_there (78% missing)
eight_ks (48% missing)

Factors or Numeric Variables

departure_code (categorical but currently numeric)
interim_coceo (needs to be a factor)
leftofc (needs to be a factor)
still_there (needs to be a factor)

0 Variance Variables

X_merge

Character Names

coname
exec_fullname
sources

Unbalanced Target Variable

ceo_dismissal

ID Variable

dismissal_dataset_id
gvkey
cik

Data cleaning

# Build the modelling table: keep labelled rows, drop unusable columns,
# de-duplicate on the id variable, and fix column types.
data_clean <- data %>%
  # Keep only rows where the outcome is known
  filter(!is.na(ceo_dismissal)) %>%
  # Recode the 0/1 outcome as a two-level factor
  mutate(
    ceo_dismissal = as.factor(
      if_else(ceo_dismissal == 1, "dismissed", "not_dis")
    )
  ) %>%
  select(
    -c(
      # Mostly missing
      interim_coceo, still_there, eight_ks,
      # Judged to have no predictive power
      X_merge, sources,
      # Only known after the departure has happened
      departure_code,
      # Judged redundant with the columns that are kept
      gvkey, cik, co_per_rol, leftofc, fyear
    )
  ) %>%
  # One row per dismissal_dataset_id (the id variable); the first row wins
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  # Drop the implausible departure year 2997; filter() also drops rows where
  # fyear_gone is NA
  filter(fyear_gone < 2025) %>%
  # Treat these numeric codes as categories
  mutate(across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor)) %>%
  # Every remaining character column becomes a factor, except the free-text
  # notes, which stay character
  mutate(across(where(is.character) & !notes, as.factor)) %>%
  # Drop any row that still has a missing value
  na.omit()

# Re-run the summary to confirm the cleaned table has no missing values left
skimr::skim(data_clean)

Data summary
Name	data_clean
Number of rows	7458
Number of columns	8
_______________________
Column type frequency:
character	1
factor	6
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
notes	0	1	5	3117	0	7448	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
coname	1	FALSE	3427	BAR: 8, CLA: 8, FED: 8, NTN: 8
exec_fullname	1	FALSE	6961	Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal	1	FALSE	2	not: 5976, dis: 1482
tenure_no_ceodb	1	FALSE	3	1: 7274, 2: 177, 3: 7
max_tenure_ceodb	1	FALSE	4	1: 7123, 2: 317, 3: 15, 4: 3
fyear_gone	1	FALSE	34	200: 378, 199: 350, 200: 332, 200: 320

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1	5570.24	25786.43	1	2170.25	4321.5	6575.75	559044	▇▁▁▁▁

Data Exploration

# Class balance of the outcome: number of rows in each dismissal category
ggplot(data_clean, aes(x = ceo_dismissal)) +
  geom_bar() +
  labs(
    title = "CEO Dismissal Count",
    x = "CEO Dismissal",
    y = "Count"
  )

# tenure_no_ceodb is a factor by this point (it is converted during cleaning),
# so a boxplot with it on the y-axis has no numeric distribution to summarise.
# Both variables are categorical: show the share of each dismissal outcome
# within every tenure level instead.
data_clean %>%
  ggplot(aes(x = tenure_no_ceodb, fill = ceo_dismissal)) +
  geom_bar(position = "fill") +
  labs(
    title = "CEO Dismissal vs. Tenure",
    x = "CEO Tenure",
    y = "Proportion",
    fill = "CEO Dismissal"
  )

# One-hot encode the categorical columns for the correlation analysis; the id
# and the free-text notes are left out
data_binarized <- data_clean %>%
  select(-c(dismissal_dataset_id, notes)) %>%
  binarize()

# Inspect the binarized columns
glimpse(data_binarized)

## Rows: 7,458
## Columns: 40
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Correlate every binarized column with the "not dismissed" outcome column
correlation_results <- correlate(
  data_binarized,
  target = ceo_dismissal__not_dis
)

correlation_results

## # A tibble: 40 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 ceo_dismissal    dismissed     -1     
##  2 ceo_dismissal    not_dis        1     
##  3 max_tenure_ceodb 1             -0.0577
##  4 max_tenure_ceodb 2              0.0533
##  5 fyear_gone       1999           0.0390
##  6 fyear_gone       2002          -0.0378
##  7 fyear_gone       2003          -0.0303
##  8 fyear_gone       2009          -0.0292
##  9 fyear_gone       2008          -0.0261
## 10 fyear_gone       1997           0.0255
## # ℹ 30 more rows

# Plot the correlations as a funnel
correlationfunnel::plot_correlation_funnel(correlation_results) +
  labs(title = "Correlation Funnel for CEO Dismissal")

## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Sample to smaller size

# set.seed(2025)
# data_clean <- data_clean %>%
#   group_by(ceo_dismissal) %>%
#   sample_n(50) %>%
#   ungroup()

Split Data

# Fix the RNG so the split and the folds are reproducible
set.seed(1234)

# Hold out a quarter of the rows for testing, stratified on the outcome
data_split <- initial_split(data_clean, prop = 3 / 4, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

# 10-fold cross-validation on the training rows, stratified the same way
data_cv <- vfold_cv(data_train, v = 10, strata = ceo_dismissal)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5032/561]> Fold01
##  2 <split [5033/560]> Fold02
##  3 <split [5034/559]> Fold03
##  4 <split [5034/559]> Fold04
##  5 <split [5034/559]> Fold05
##  6 <split [5034/559]> Fold06
##  7 <split [5034/559]> Fold07
##  8 <split [5034/559]> Fold08
##  9 <split [5034/559]> Fold09
## 10 <split [5034/559]> Fold10

Preprocess Data

# Preprocessing recipe for the boosted-tree model: text features from `notes`,
# lumped and dummy-encoded categoricals, normalization, PCA, then SMOTE.
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  # dismissal_dataset_id is a row identifier, not a feature; remove it so it is
  # not normalized and folded into the principal components
  step_rm(dismissal_dataset_id) %>%
  # Turn the free-text notes into tf-idf scores for at most 100 tokens
  step_tokenize(notes) %>%
  step_tokenfilter(notes, max_tokens = 100) %>%
  step_tfidf(notes) %>%
  # Pool infrequent companies / executives into a single "other" level
  step_other(coname, exec_fullname) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  # Keep enough components to cover 99% of the variance
  step_pca(all_numeric_predictors(), threshold = .99) %>%
  # Balance the outcome classes with synthetic minority-class rows
  step_smote(ceo_dismissal)

# Preview the preprocessed training data; bake(new_data = NULL) is the current
# replacement for the superseded juice()
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 8,964
## Columns: 135
## $ PC001         <dbl> -1.6284406, 0.4270375, 1.5833069, -2.1709041, -0.5342902…
## $ PC002         <dbl> 0.78352516, 0.09523363, -0.91550439, -0.90312207, -0.073…
## $ PC003         <dbl> 1.797294004, 2.706701023, 2.490838270, 1.162134665, -0.0…
## $ PC004         <dbl> 0.41680455, 0.42088337, -0.57972958, 0.05150080, -0.7222…
## $ PC005         <dbl> -0.40284287, -1.12993574, 1.83100116, 2.19968386, 1.4129…
## $ PC006         <dbl> 1.251781146, 0.224236085, -1.926919739, -0.312256269, 1.…
## $ PC007         <dbl> -0.09349713, -0.02737186, -2.12117958, -1.50151521, -0.8…
## $ PC008         <dbl> 0.8694577, 2.0245836, -0.8054977, -0.8371349, -0.2957123…
## $ PC009         <dbl> -0.7388101955, 1.0852747952, 1.3569491798, -0.4062328436…
## $ PC010         <dbl> 0.808950250, 1.715108180, -0.292668007, 0.230742289, -0.…
## $ PC011         <dbl> 0.352543485, -0.187871630, 0.773848033, -0.127928018, -0…
## $ PC012         <dbl> 0.49908054, -1.83884235, -0.69968503, -0.57498004, 0.164…
## $ PC013         <dbl> 1.07099601, 1.99248283, 0.55953164, 0.24501966, -0.19673…
## $ PC014         <dbl> 0.82372498, 0.53901128, 2.05327868, 0.85582580, -1.13468…
## $ PC015         <dbl> 1.80814673, 0.72726215, 0.52540846, -0.09101924, -0.8689…
## $ PC016         <dbl> 0.91538555, -3.78006949, 0.34673968, 0.67202391, 0.81740…
## $ PC017         <dbl> -1.27000497, 0.08805806, 0.46087330, -0.61347138, -1.111…
## $ PC018         <dbl> -1.09609131, 0.05447544, -1.42430391, 0.08385628, 0.5465…
## $ PC019         <dbl> 1.14584366, 1.12705849, 1.17266185, 0.64415099, -0.23605…
## $ PC020         <dbl> -0.28594557, -1.58359581, 0.83335682, 0.66336208, -0.630…
## $ PC021         <dbl> -0.818993709, -0.445854919, -0.754140899, -0.441686250, …
## $ PC022         <dbl> 0.26415258, -1.66175308, 0.03054463, 1.52216215, -0.9075…
## $ PC023         <dbl> -1.09662989, 1.25317532, -1.32865310, -0.24488633, -0.15…
## $ PC024         <dbl> -1.1445900, -0.4470237, -0.3016942, 0.3841351, -0.246605…
## $ PC025         <dbl> -0.53647192, 0.04807141, -0.19088881, 0.30820563, -0.242…
## $ PC026         <dbl> 1.50981114, -0.51615694, 0.16513118, 0.63753748, 0.44205…
## $ PC027         <dbl> -0.07245940, -0.51266761, 1.12488730, -0.32599001, -0.77…
## $ PC028         <dbl> -0.05693672, -0.17102664, -0.54324711, -0.40030113, 0.66…
## $ PC029         <dbl> 1.50539619, 1.35874874, 0.56824482, 1.65194337, 2.626676…
## $ PC030         <dbl> -0.63636824, -0.83672734, -0.43225037, -0.37869069, 0.76…
## $ PC031         <dbl> -0.007521625, -0.473040807, 0.416282552, -0.050367172, 0…
## $ PC032         <dbl> -1.92105757, -1.63608597, -0.61480401, 1.05801649, -0.55…
## $ PC033         <dbl> -0.63032181, -0.46139987, -0.16691122, 0.75439889, 1.274…
## $ PC034         <dbl> -1.25979044, 0.37900923, 0.07952578, 1.26260822, -2.3758…
## $ PC035         <dbl> -0.02550365, 0.19123241, 2.47315501, 0.21888484, 1.62866…
## $ PC036         <dbl> -1.36731243, -0.77573286, -0.37960334, -1.14969337, 0.51…
## $ PC037         <dbl> 2.69303072, -1.66166346, -1.18823281, -0.30185560, 0.683…
## $ PC038         <dbl> -0.63857000, -0.51805365, -0.94859245, -1.30926096, -0.4…
## $ PC039         <dbl> 0.53652088, 2.09626066, -0.47487946, 0.04529004, 0.28460…
## $ PC040         <dbl> -0.51317653, -0.56451553, 0.06609194, -0.03121990, 0.353…
## $ PC041         <dbl> 0.213126613, 0.616577616, -0.166693696, -1.869643410, -0…
## $ PC042         <dbl> 0.64788131, -1.44337495, 0.71640970, -0.03156215, -0.513…
## $ PC043         <dbl> 0.79357866, -0.43332233, 0.79292957, -0.04254581, -0.336…
## $ PC044         <dbl> 1.28327607, 0.71933768, -0.98717127, 0.42008156, 1.93243…
## $ PC045         <dbl> 0.6410789, -0.6178581, -0.1606612, -0.7296902, 0.4220054…
## $ PC046         <dbl> -0.12884084, 0.44778503, -1.06656904, -0.18696149, -0.94…
## $ PC047         <dbl> -1.0641485, -0.5182698, 0.3577788, 1.8094486, -0.8046669…
## $ PC048         <dbl> -0.92753945, 0.23539358, 0.46255590, -1.69031749, -1.358…
## $ PC049         <dbl> 1.30664838, -0.48803518, -1.39065514, -1.01875688, 2.019…
## $ PC050         <dbl> 0.04904405, -0.50178890, 0.45082999, -1.08198804, -0.102…
## $ PC051         <dbl> 2.131406882, -0.017023641, 0.009231029, 0.148124322, 2.0…
## $ PC052         <dbl> -0.15449817, 0.43810351, -1.23973897, -0.14819136, -1.21…
## $ PC053         <dbl> 0.19260529, 1.96382503, -0.20665421, 0.07923503, 1.97357…
## $ PC054         <dbl> -1.93142704, 0.30964273, 1.75039616, -0.25465173, -2.789…
## $ PC055         <dbl> -1.60895445, 1.24471425, -1.25830438, 0.26571799, -1.940…
## $ PC056         <dbl> -0.72215820, -0.04415546, -0.22536909, -0.89896639, 0.18…
## $ PC057         <dbl> 0.51142118, -0.11580627, -0.63675038, -0.31832659, 0.655…
## $ PC058         <dbl> -0.97017725, -1.21464240, 1.34696790, 0.90212052, 0.1281…
## $ PC059         <dbl> 0.75315945, 0.02767181, -0.71617502, 0.49902472, 0.42617…
## $ PC060         <dbl> -0.03902719, 0.18530274, 1.09682130, 0.37835629, -0.6488…
## $ PC061         <dbl> -1.493596305, 1.075699407, 0.765456118, 0.599683637, -1.…
## $ PC062         <dbl> -2.05131366, -0.55458157, 2.02321535, -1.29585018, -1.25…
## $ PC063         <dbl> 2.9527917, -0.7935539, -0.8699252, 0.2931757, 1.6639981,…
## $ PC064         <dbl> -1.37966043, 0.10900777, -0.15947715, 1.29266875, -0.679…
## $ PC065         <dbl> 1.30290587, -0.34814748, -0.75528409, 1.46921587, 1.1311…
## $ PC066         <dbl> 0.12699374, -0.04871897, 2.47103250, -0.71825876, 0.0342…
## $ PC067         <dbl> -1.8870410, -0.2273448, -0.4989705, -0.2392547, -3.15121…
## $ PC068         <dbl> -0.24920745, 2.37790659, -1.10306664, -1.10063188, -0.90…
## $ PC069         <dbl> 0.953126206, 0.921769365, -1.466911316, 0.046239434, -0.…
## $ PC070         <dbl> 1.27264363, -0.54946407, -0.31972235, 0.90952400, 1.0801…
## $ PC071         <dbl> -1.5329678, -0.2417034, -0.4878853, 0.4211702, -1.235711…
## $ PC072         <dbl> -0.191877945, -0.445668908, -1.765512212, -2.107329173, …
## $ PC073         <dbl> 0.08879847, -0.26413927, 0.78038461, 0.05179306, 0.87129…
## $ PC074         <dbl> -0.40045538, 0.32751497, -0.71452553, 0.06169818, -1.081…
## $ PC075         <dbl> 1.13964098, -0.01928949, -0.14901320, -0.09346351, 2.129…
## $ PC076         <dbl> 0.922498280, 0.142542812, 0.342413605, -1.223167492, -1.…
## $ PC077         <dbl> -0.65251173, -0.01658185, -1.77767902, 0.34551817, 1.017…
## $ PC078         <dbl> 0.7212250, -1.7250055, -1.2924428, 0.7585850, 0.3492498,…
## $ PC079         <dbl> 1.43663739, 0.25631997, -1.10223856, 0.48262546, 2.53034…
## $ PC080         <dbl> 0.00906635, 0.17999460, 0.40963494, 1.06494936, 0.074141…
## $ PC081         <dbl> 0.86873354, -0.28240495, -1.45181943, -1.05137161, 0.556…
## $ PC082         <dbl> 0.52109244, 0.11650614, -0.06642021, 0.76320348, 0.02032…
## $ PC083         <dbl> -3.18487580, 0.54628750, -0.36104433, 0.19097054, -1.345…
## $ PC084         <dbl> 1.45045710, -0.55675973, -0.66697593, -0.26464009, 0.889…
## $ PC085         <dbl> 1.535257503, 0.574684993, -0.585225780, 0.007181624, -0.…
## $ PC086         <dbl> -0.551989708, 0.052739272, -0.118771616, 1.080455737, -0…
## $ PC087         <dbl> -0.54356324, 1.06442196, -2.44189066, 0.34401324, 1.1888…
## $ PC088         <dbl> -0.09579508, 0.60470887, -0.22915670, -1.78303941, 0.565…
## $ PC089         <dbl> -0.33016885, -1.64524741, 0.47007581, -1.27647505, 0.081…
## $ PC090         <dbl> 0.295571328, 0.501597775, 1.371195841, -0.012027245, 0.0…
## $ PC091         <dbl> -0.23920501, -1.45628123, -0.44207233, 0.49044559, -0.40…
## $ PC092         <dbl> -0.02159656, -0.46645243, 0.23209852, 0.67951592, -0.179…
## $ PC093         <dbl> 1.19622978, 0.17610307, -1.43716159, -0.17093242, 0.0943…
## $ PC094         <dbl> 0.17504842, -0.30239553, 1.28596978, 0.14646092, 0.33183…
## $ PC095         <dbl> 1.20737557, 0.19955276, 1.02896667, -0.25395975, -0.5019…
## $ PC096         <dbl> 0.70325270, -0.42940598, -0.88382380, -1.06096622, -1.00…
## $ PC097         <dbl> 0.48793783, -0.32786873, -0.31443492, -0.74769337, -1.47…
## $ PC098         <dbl> -0.821374271, 0.201742844, -1.330244881, 0.885833720, -0…
## $ PC099         <dbl> -0.9455541942, 0.4178525990, 0.3468402612, 1.1363935779,…
## $ PC100         <dbl> -0.22088216, -0.33090152, -1.03672502, -0.07762677, 0.74…
## $ PC101         <dbl> -4.040468e-01, -4.595533e-01, -1.119953e+00, -3.766495e-…
## $ PC102         <dbl> -0.63277402, -1.00641652, -0.87290863, 0.27840469, -1.23…
## $ PC103         <dbl> 1.14972895, 0.04600043, 0.32252962, 0.37052229, 0.288131…
## $ PC104         <dbl> -0.538281132, 0.593001958, 0.576996533, 0.313007103, -0.…
## $ PC105         <dbl> 1.27589282, 1.16204694, -0.72424700, -0.38200727, 0.8564…
## $ PC106         <dbl> 0.66945125, -0.04078497, 0.79413743, -0.30236233, 0.2667…
## $ PC107         <dbl> 1.74919540, 0.87742567, -1.24035011, -0.09197240, -2.296…
## $ PC108         <dbl> -0.05094060, 0.07726532, -0.15595084, 0.93615172, 0.3325…
## $ PC109         <dbl> -2.43593309, -0.69609700, -1.28720179, 0.77425434, -1.59…
## $ PC110         <dbl> -0.88206162, 0.48866967, 0.38565580, 0.18169790, -0.7218…
## $ PC111         <dbl> -0.15372249, 1.11225624, 0.53570294, -0.13551887, 1.8560…
## $ PC112         <dbl> -0.4692114, -0.8183311, 0.2138384, 0.3627767, -0.1275456…
## $ PC113         <dbl> -1.27959570, 1.55485971, -0.07274464, -0.59437228, 2.176…
## $ PC114         <dbl> 0.56227707, 1.18945574, 0.59030831, 0.19136660, -0.78010…
## $ PC115         <dbl> -0.90241352, 0.31208834, 0.61634255, -0.47254676, 0.6825…
## $ PC116         <dbl> -0.551916042, -0.543162422, 2.071059902, -0.196954778, 0…
## $ PC117         <dbl> -0.574760750, -0.346781722, -0.768897535, -0.224909081, …
## $ PC118         <dbl> -1.12799445, 1.50131059, -1.30047289, 0.40925606, -0.184…
## $ PC119         <dbl> -0.11286362, -0.17380818, 0.94883651, 0.11296578, -0.594…
## $ PC120         <dbl> -0.043648918, -0.691727458, -0.581224416, -0.373860196, …
## $ PC121         <dbl> -0.06955683, 1.84391264, -0.38489307, 0.14888605, 0.2595…
## $ PC122         <dbl> -0.43319820, 1.00740912, 0.86550622, -0.07968793, -0.333…
## $ PC123         <dbl> 0.67739741, -0.74384343, -0.39486819, 0.08830781, 1.2858…
## $ PC124         <dbl> 0.29405693, -0.08603429, 1.96144662, -1.02242457, 0.1152…
## $ PC125         <dbl> -0.50489899, 2.32060427, -0.46141710, 0.08393367, -0.162…
## $ PC126         <dbl> 0.58186873, 0.62872774, -0.99395680, 0.23513235, -1.1367…
## $ PC127         <dbl> 0.05332216, 1.11184104, 0.82267039, 0.39604248, 0.117065…
## $ PC128         <dbl> -0.44777579, -1.97178543, 1.07818217, -0.18580340, -0.17…
## $ PC129         <dbl> -0.50689285, -1.41521549, 0.86671746, 0.36039933, 0.6627…
## $ PC130         <dbl> 0.21249640, -0.77755442, -0.06605365, -0.45650700, -0.54…
## $ PC131         <dbl> 0.002318063, 0.353422079, -0.682594405, 0.316645043, 0.7…
## $ PC132         <dbl> -0.807333738, -0.438413053, -0.227659805, -0.409797866, …
## $ PC133         <dbl> -0.74184284, -0.04890875, -0.55329175, -0.73380161, -0.6…
## $ PC134         <dbl> -0.44436782, -0.55781138, -0.63206478, 0.17875853, 0.432…
## $ ceo_dismissal <fct> dismissed, dismissed, dismissed, dismissed, dismissed, d…

Specify Model

# Boosted-tree classifier with four hyperparameters left open for tuning
xgboost_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the recipe and the model so they are tuned and fitted together
xgboost_workflow <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_rec)

Tune hyperparameters

# Register a parallel backend for the resampling work
doParallel::registerDoParallel()

# Fix the RNG before the candidate grid is generated
set.seed(65743)

# Evaluate 5 hyperparameter candidates on the CV folds and keep the
# out-of-fold predictions for later plotting
xgboost_tune <- tune_grid(
  xgboost_workflow,
  resamples = data_cv,
  grid = 5,
  control = control_grid(save_pred = TRUE)
)

Model Evaluation

Identify Optimal Values for Hyperparameters

# Cross-validated metrics for every hyperparameter candidate
xgboost_tune %>%
  collect_metrics()

## # A tibble: 15 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl>
##  1   885     4          1    0.0670  accuracy    binary     0.771    10 0.00716
##  2   885     4          1    0.0670  brier_class binary     0.156    10 0.00391
##  3   885     4          1    0.0670  roc_auc     binary     0.816    10 0.00960
##  4   541    16          9    0.0266  accuracy    binary     0.818    10 0.00554
##  5   541    16          9    0.0266  brier_class binary     0.128    10 0.00353
##  6   541    16          9    0.0266  roc_auc     binary     0.836    10 0.00786
##  7   325    18         10    0.00276 accuracy    binary     0.762    10 0.0105 
##  8   325    18         10    0.00276 brier_class binary     0.179    10 0.00261
##  9   325    18         10    0.00276 roc_auc     binary     0.791    10 0.0104 
## 10  1754    32         13    0.00495 accuracy    binary     0.814    10 0.00566
## 11  1754    32         13    0.00495 brier_class binary     0.131    10 0.00347
## 12  1754    32         13    0.00495 roc_auc     binary     0.834    10 0.00823
## 13  1312    38          7    0.141   accuracy    binary     0.819    10 0.00646
## 14  1312    38          7    0.141   brier_class binary     0.137    10 0.00440
## 15  1312    38          7    0.141   roc_auc     binary     0.832    10 0.00709
## # ℹ 1 more variable: .config <chr>

# ROC curve per CV fold for the selected candidate only. Without the
# `parameters` filter, collect_predictions() returns the out-of-fold
# predictions of every tuned candidate, and grouping by fold alone would pool
# several different models into each curve.
collect_predictions(
  xgboost_tune,
  parameters = select_best(xgboost_tune, metric = "accuracy")
) %>%
  group_by(id) %>%
  roc_curve(ceo_dismissal, .pred_dismissed) %>%
  autoplot()

Fit the Model for the Last Time

# Plug the candidate with the best cross-validated accuracy into the workflow,
# then fit on the training set and evaluate once on the test set.
# NOTE(review): the outcome is imbalanced, so selecting on "roc_auc" may be
# preferable to "accuracy" — confirm the intended selection metric.
xgboost_last <- last_fit(
  finalize_workflow(
    xgboost_workflow,
    select_best(xgboost_tune, metric = "accuracy")
  ),
  data_split
)

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.827 Preprocessor1_Model1
## 2 roc_auc     binary         0.834 Preprocessor1_Model1
## 3 brier_class binary         0.132 Preprocessor1_Model1

# Confusion matrix of the test-set predictions
xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = ceo_dismissal, estimate = .pred_class) %>%
  autoplot()

Variable Importance

# Variable importance from the underlying fitted engine object
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

Normalization and PCA: I added normalization and principal component analysis (PCA) to reduce dimensionality and eliminate redundant variance in numeric features. This helped streamline the model and reduce noise.

Improved text preprocessing: I continued using TF-IDF on the notes field but also fine-tuned the token filter to limit it to the top 100 tokens, reducing overfitting risk.

Stronger feature engineering: I used step_other() on high-cardinality categorical variables like coname and exec_fullname, which helped simplify the model without losing important patterns.

SMOTE balancing: As before, I addressed the class imbalance using SMOTE, but this step became even more effective when paired with cleaner data and reduced dimensionality.

Apply it to your data 8

Nils Skogestig

2025-03-06