The dataset documents the reasons for CEO departures in S&P 1500 firms from 2000 through 2018. The goal is to predict whether a CEO was dismissed (ceo_dismissal) using the departures dataset.
# Load the TidyTuesday CEO departures data straight from GitHub
data <- read.csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv"
)
# Summarise every column (types, missingness, distributions)
skimr::skim(data)
Name | data |
Number of rows | 9423 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 9 |
numeric | 10 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
leftofc | 1802 | 0.81 | 20 | 20 | 0 | 3627 | 0 |
still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
X_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
# Build the modelling table: keep labelled rows, recode the outcome, drop
# unused columns, de-duplicate on the id, and fix column types.
data_clean <- data %>%
  # Keep only rows where the outcome is known
  filter(!is.na(ceo_dismissal)) %>%
  # Recode the 0/1 outcome as a labelled factor
  mutate(
    ceo_dismissal = as.factor(
      if_else(ceo_dismissal == 1, "dismissed", "not_dis")
    )
  ) %>%
  select(
    # Columns with many missing values
    -c(interim_coceo, still_there, eight_ks),
    # Columns that don't seem to have predictive power
    -c(X_merge, sources),
    # Information that only becomes available after the fact
    -departure_code,
    # Redundant variables
    -c(gvkey, cik, co_per_rol, leftofc, fyear)
  ) %>%
  # dismissal_dataset_id is the id variable: keep one row per id
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  # Drop the erroneous departure year 2997
  filter(fyear_gone < 2025) %>%
  mutate(
    # Numeric codes that are really categories
    across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor),
    # Every character column becomes a factor ...
    across(where(is.character), as.factor),
    # ... except notes, which stays free text for tokenization
    notes = as.character(notes)
  ) %>%
  # Drop rows that still contain missing values
  na.omit()
skimr::skim(data_clean)
Name | data_clean |
Number of rows | 7458 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 6 |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
notes | 0 | 1 | 5 | 3117 | 0 | 7448 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
coname | 0 | 1 | FALSE | 3427 | BAR: 8, CLA: 8, FED: 8, NTN: 8 |
exec_fullname | 0 | 1 | FALSE | 6961 | Joh: 4, Mel: 4, Alb: 3, Ami: 3 |
ceo_dismissal | 0 | 1 | FALSE | 2 | not: 5976, dis: 1482 |
tenure_no_ceodb | 0 | 1 | FALSE | 3 | 1: 7274, 2: 177, 3: 7 |
max_tenure_ceodb | 0 | 1 | FALSE | 4 | 1: 7123, 2: 317, 3: 15, 4: 3 |
fyear_gone | 0 | 1 | FALSE | 34 | 200: 378, 199: 350, 200: 332, 200: 320 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1 | 5570.24 | 25786.43 | 1 | 2170.25 | 4321.5 | 6575.75 | 559044 | ▇▁▁▁▁ |
# Class balance of the outcome: number of rows per ceo_dismissal level
ggplot(data_clean, aes(x = ceo_dismissal)) +
  geom_bar() +
  labs(
    title = "CEO Dismissal Count",
    x = "CEO Dismissal",
    y = "Count"
  )
# Dismissal share within each tenure group. tenure_no_ceodb is converted to a
# factor during cleaning, so a boxplot (which needs a continuous y) cannot
# summarise it; stacked proportions compare the two categorical variables.
data_clean %>%
  ggplot(aes(x = tenure_no_ceodb, fill = ceo_dismissal)) +
  geom_bar(position = "fill") +
  labs(
    title = "CEO Dismissal vs. Tenure",
    x = "CEO Tenure",
    y = "Proportion",
    fill = "CEO Dismissal"
  )
# One-hot encode every remaining column for the correlation funnel; the id
# and the free-text notes are excluded first.
data_binarized <- data_clean %>%
  select(-c(dismissal_dataset_id, notes)) %>%
  binarize()
glimpse(data_binarized)
## Rows: 7,458
## Columns: 40
## $ coname__BARRICK_GOLD_CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998 <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Correlate every binarized column with the "not dismissed" indicator
correlation_results <- correlate(data_binarized, ceo_dismissal__not_dis)
correlation_results
## # A tibble: 40 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal dismissed -1
## 2 ceo_dismissal not_dis 1
## 3 max_tenure_ceodb 1 -0.0577
## 4 max_tenure_ceodb 2 0.0533
## 5 fyear_gone 1999 0.0390
## 6 fyear_gone 2002 -0.0378
## 7 fyear_gone 2003 -0.0303
## 8 fyear_gone 2009 -0.0292
## 9 fyear_gone 2008 -0.0261
## 10 fyear_gone 1997 0.0255
## # ℹ 30 more rows
# Plot the correlation funnel
correlationfunnel::plot_correlation_funnel(correlation_results) +
  labs(title = "Correlation Funnel for CEO Dismissal")
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# set.seed(2025)
# data_clean <- data_clean %>%
# group_by(ceo_dismissal) %>%
# sample_n(50) %>%
# ungroup()
# Fix the RNG so the partitions below are reproducible
set.seed(1234)
# Train/test partition (3/4 training), stratified on the outcome
data_split <- data_clean %>%
  initial_split(prop = 3 / 4, strata = ceo_dismissal)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()
# 10-fold cross-validation on the training set, stratified on the outcome
data_cv <- data_train %>%
  vfold_cv(v = 10, strata = ceo_dismissal)
data_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5032/561]> Fold01
## 2 <split [5033/560]> Fold02
## 3 <split [5034/559]> Fold03
## 4 <split [5034/559]> Fold04
## 5 <split [5034/559]> Fold05
## 6 <split [5034/559]> Fold06
## 7 <split [5034/559]> Fold07
## 8 <split [5034/559]> Fold08
## 9 <split [5034/559]> Fold09
## 10 <split [5034/559]> Fold10
# Preprocessing recipe for the xgboost model.
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  # dismissal_dataset_id is a row identifier, not a feature: keep it in the
  # data but give it a non-predictor role so the model cannot learn from it.
  update_role(dismissal_dataset_id, new_role = "id") %>%
  # Text features: TF-IDF weights for at most 100 tokens from notes
  step_tokenize(notes) %>%
  step_tokenfilter(notes, max_tokens = 100) %>%
  step_tfidf(notes) %>%
  # Pool infrequent company / executive names into an "other" level
  step_other(coname, exec_fullname) %>%
  step_dummy(all_nominal_predictors()) %>%
  # Drop constant columns (e.g. dummies for factor levels that do not occur
  # in a given resample) so step_normalize() never divides by a zero sd.
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  # Keep the components that explain 99% of the variance
  step_pca(all_numeric_predictors(), threshold = .99) %>%
  # Oversample the minority class with synthetic rows
  step_smote(ceo_dismissal)
# Inspect the processed training set; bake(new_data = NULL) replaces the
# superseded juice().
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 8,964
## Columns: 135
## $ PC001 <dbl> -1.6284406, 0.4270375, 1.5833069, -2.1709041, -0.5342902…
## $ PC002 <dbl> 0.78352516, 0.09523363, -0.91550439, -0.90312207, -0.073…
## $ PC003 <dbl> 1.797294004, 2.706701023, 2.490838270, 1.162134665, -0.0…
## $ PC004 <dbl> 0.41680455, 0.42088337, -0.57972958, 0.05150080, -0.7222…
## $ PC005 <dbl> -0.40284287, -1.12993574, 1.83100116, 2.19968386, 1.4129…
## $ PC006 <dbl> 1.251781146, 0.224236085, -1.926919739, -0.312256269, 1.…
## $ PC007 <dbl> -0.09349713, -0.02737186, -2.12117958, -1.50151521, -0.8…
## $ PC008 <dbl> 0.8694577, 2.0245836, -0.8054977, -0.8371349, -0.2957123…
## $ PC009 <dbl> -0.7388101955, 1.0852747952, 1.3569491798, -0.4062328436…
## $ PC010 <dbl> 0.808950250, 1.715108180, -0.292668007, 0.230742289, -0.…
## $ PC011 <dbl> 0.352543485, -0.187871630, 0.773848033, -0.127928018, -0…
## $ PC012 <dbl> 0.49908054, -1.83884235, -0.69968503, -0.57498004, 0.164…
## $ PC013 <dbl> 1.07099601, 1.99248283, 0.55953164, 0.24501966, -0.19673…
## $ PC014 <dbl> 0.82372498, 0.53901128, 2.05327868, 0.85582580, -1.13468…
## $ PC015 <dbl> 1.80814673, 0.72726215, 0.52540846, -0.09101924, -0.8689…
## $ PC016 <dbl> 0.91538555, -3.78006949, 0.34673968, 0.67202391, 0.81740…
## $ PC017 <dbl> -1.27000497, 0.08805806, 0.46087330, -0.61347138, -1.111…
## $ PC018 <dbl> -1.09609131, 0.05447544, -1.42430391, 0.08385628, 0.5465…
## $ PC019 <dbl> 1.14584366, 1.12705849, 1.17266185, 0.64415099, -0.23605…
## $ PC020 <dbl> -0.28594557, -1.58359581, 0.83335682, 0.66336208, -0.630…
## $ PC021 <dbl> -0.818993709, -0.445854919, -0.754140899, -0.441686250, …
## $ PC022 <dbl> 0.26415258, -1.66175308, 0.03054463, 1.52216215, -0.9075…
## $ PC023 <dbl> -1.09662989, 1.25317532, -1.32865310, -0.24488633, -0.15…
## $ PC024 <dbl> -1.1445900, -0.4470237, -0.3016942, 0.3841351, -0.246605…
## $ PC025 <dbl> -0.53647192, 0.04807141, -0.19088881, 0.30820563, -0.242…
## $ PC026 <dbl> 1.50981114, -0.51615694, 0.16513118, 0.63753748, 0.44205…
## $ PC027 <dbl> -0.07245940, -0.51266761, 1.12488730, -0.32599001, -0.77…
## $ PC028 <dbl> -0.05693672, -0.17102664, -0.54324711, -0.40030113, 0.66…
## $ PC029 <dbl> 1.50539619, 1.35874874, 0.56824482, 1.65194337, 2.626676…
## $ PC030 <dbl> -0.63636824, -0.83672734, -0.43225037, -0.37869069, 0.76…
## $ PC031 <dbl> -0.007521625, -0.473040807, 0.416282552, -0.050367172, 0…
## $ PC032 <dbl> -1.92105757, -1.63608597, -0.61480401, 1.05801649, -0.55…
## $ PC033 <dbl> -0.63032181, -0.46139987, -0.16691122, 0.75439889, 1.274…
## $ PC034 <dbl> -1.25979044, 0.37900923, 0.07952578, 1.26260822, -2.3758…
## $ PC035 <dbl> -0.02550365, 0.19123241, 2.47315501, 0.21888484, 1.62866…
## $ PC036 <dbl> -1.36731243, -0.77573286, -0.37960334, -1.14969337, 0.51…
## $ PC037 <dbl> 2.69303072, -1.66166346, -1.18823281, -0.30185560, 0.683…
## $ PC038 <dbl> -0.63857000, -0.51805365, -0.94859245, -1.30926096, -0.4…
## $ PC039 <dbl> 0.53652088, 2.09626066, -0.47487946, 0.04529004, 0.28460…
## $ PC040 <dbl> -0.51317653, -0.56451553, 0.06609194, -0.03121990, 0.353…
## $ PC041 <dbl> 0.213126613, 0.616577616, -0.166693696, -1.869643410, -0…
## $ PC042 <dbl> 0.64788131, -1.44337495, 0.71640970, -0.03156215, -0.513…
## $ PC043 <dbl> 0.79357866, -0.43332233, 0.79292957, -0.04254581, -0.336…
## $ PC044 <dbl> 1.28327607, 0.71933768, -0.98717127, 0.42008156, 1.93243…
## $ PC045 <dbl> 0.6410789, -0.6178581, -0.1606612, -0.7296902, 0.4220054…
## $ PC046 <dbl> -0.12884084, 0.44778503, -1.06656904, -0.18696149, -0.94…
## $ PC047 <dbl> -1.0641485, -0.5182698, 0.3577788, 1.8094486, -0.8046669…
## $ PC048 <dbl> -0.92753945, 0.23539358, 0.46255590, -1.69031749, -1.358…
## $ PC049 <dbl> 1.30664838, -0.48803518, -1.39065514, -1.01875688, 2.019…
## $ PC050 <dbl> 0.04904405, -0.50178890, 0.45082999, -1.08198804, -0.102…
## $ PC051 <dbl> 2.131406882, -0.017023641, 0.009231029, 0.148124322, 2.0…
## $ PC052 <dbl> -0.15449817, 0.43810351, -1.23973897, -0.14819136, -1.21…
## $ PC053 <dbl> 0.19260529, 1.96382503, -0.20665421, 0.07923503, 1.97357…
## $ PC054 <dbl> -1.93142704, 0.30964273, 1.75039616, -0.25465173, -2.789…
## $ PC055 <dbl> -1.60895445, 1.24471425, -1.25830438, 0.26571799, -1.940…
## $ PC056 <dbl> -0.72215820, -0.04415546, -0.22536909, -0.89896639, 0.18…
## $ PC057 <dbl> 0.51142118, -0.11580627, -0.63675038, -0.31832659, 0.655…
## $ PC058 <dbl> -0.97017725, -1.21464240, 1.34696790, 0.90212052, 0.1281…
## $ PC059 <dbl> 0.75315945, 0.02767181, -0.71617502, 0.49902472, 0.42617…
## $ PC060 <dbl> -0.03902719, 0.18530274, 1.09682130, 0.37835629, -0.6488…
## $ PC061 <dbl> -1.493596305, 1.075699407, 0.765456118, 0.599683637, -1.…
## $ PC062 <dbl> -2.05131366, -0.55458157, 2.02321535, -1.29585018, -1.25…
## $ PC063 <dbl> 2.9527917, -0.7935539, -0.8699252, 0.2931757, 1.6639981,…
## $ PC064 <dbl> -1.37966043, 0.10900777, -0.15947715, 1.29266875, -0.679…
## $ PC065 <dbl> 1.30290587, -0.34814748, -0.75528409, 1.46921587, 1.1311…
## $ PC066 <dbl> 0.12699374, -0.04871897, 2.47103250, -0.71825876, 0.0342…
## $ PC067 <dbl> -1.8870410, -0.2273448, -0.4989705, -0.2392547, -3.15121…
## $ PC068 <dbl> -0.24920745, 2.37790659, -1.10306664, -1.10063188, -0.90…
## $ PC069 <dbl> 0.953126206, 0.921769365, -1.466911316, 0.046239434, -0.…
## $ PC070 <dbl> 1.27264363, -0.54946407, -0.31972235, 0.90952400, 1.0801…
## $ PC071 <dbl> -1.5329678, -0.2417034, -0.4878853, 0.4211702, -1.235711…
## $ PC072 <dbl> -0.191877945, -0.445668908, -1.765512212, -2.107329173, …
## $ PC073 <dbl> 0.08879847, -0.26413927, 0.78038461, 0.05179306, 0.87129…
## $ PC074 <dbl> -0.40045538, 0.32751497, -0.71452553, 0.06169818, -1.081…
## $ PC075 <dbl> 1.13964098, -0.01928949, -0.14901320, -0.09346351, 2.129…
## $ PC076 <dbl> 0.922498280, 0.142542812, 0.342413605, -1.223167492, -1.…
## $ PC077 <dbl> -0.65251173, -0.01658185, -1.77767902, 0.34551817, 1.017…
## $ PC078 <dbl> 0.7212250, -1.7250055, -1.2924428, 0.7585850, 0.3492498,…
## $ PC079 <dbl> 1.43663739, 0.25631997, -1.10223856, 0.48262546, 2.53034…
## $ PC080 <dbl> 0.00906635, 0.17999460, 0.40963494, 1.06494936, 0.074141…
## $ PC081 <dbl> 0.86873354, -0.28240495, -1.45181943, -1.05137161, 0.556…
## $ PC082 <dbl> 0.52109244, 0.11650614, -0.06642021, 0.76320348, 0.02032…
## $ PC083 <dbl> -3.18487580, 0.54628750, -0.36104433, 0.19097054, -1.345…
## $ PC084 <dbl> 1.45045710, -0.55675973, -0.66697593, -0.26464009, 0.889…
## $ PC085 <dbl> 1.535257503, 0.574684993, -0.585225780, 0.007181624, -0.…
## $ PC086 <dbl> -0.551989708, 0.052739272, -0.118771616, 1.080455737, -0…
## $ PC087 <dbl> -0.54356324, 1.06442196, -2.44189066, 0.34401324, 1.1888…
## $ PC088 <dbl> -0.09579508, 0.60470887, -0.22915670, -1.78303941, 0.565…
## $ PC089 <dbl> -0.33016885, -1.64524741, 0.47007581, -1.27647505, 0.081…
## $ PC090 <dbl> 0.295571328, 0.501597775, 1.371195841, -0.012027245, 0.0…
## $ PC091 <dbl> -0.23920501, -1.45628123, -0.44207233, 0.49044559, -0.40…
## $ PC092 <dbl> -0.02159656, -0.46645243, 0.23209852, 0.67951592, -0.179…
## $ PC093 <dbl> 1.19622978, 0.17610307, -1.43716159, -0.17093242, 0.0943…
## $ PC094 <dbl> 0.17504842, -0.30239553, 1.28596978, 0.14646092, 0.33183…
## $ PC095 <dbl> 1.20737557, 0.19955276, 1.02896667, -0.25395975, -0.5019…
## $ PC096 <dbl> 0.70325270, -0.42940598, -0.88382380, -1.06096622, -1.00…
## $ PC097 <dbl> 0.48793783, -0.32786873, -0.31443492, -0.74769337, -1.47…
## $ PC098 <dbl> -0.821374271, 0.201742844, -1.330244881, 0.885833720, -0…
## $ PC099 <dbl> -0.9455541942, 0.4178525990, 0.3468402612, 1.1363935779,…
## $ PC100 <dbl> -0.22088216, -0.33090152, -1.03672502, -0.07762677, 0.74…
## $ PC101 <dbl> -4.040468e-01, -4.595533e-01, -1.119953e+00, -3.766495e-…
## $ PC102 <dbl> -0.63277402, -1.00641652, -0.87290863, 0.27840469, -1.23…
## $ PC103 <dbl> 1.14972895, 0.04600043, 0.32252962, 0.37052229, 0.288131…
## $ PC104 <dbl> -0.538281132, 0.593001958, 0.576996533, 0.313007103, -0.…
## $ PC105 <dbl> 1.27589282, 1.16204694, -0.72424700, -0.38200727, 0.8564…
## $ PC106 <dbl> 0.66945125, -0.04078497, 0.79413743, -0.30236233, 0.2667…
## $ PC107 <dbl> 1.74919540, 0.87742567, -1.24035011, -0.09197240, -2.296…
## $ PC108 <dbl> -0.05094060, 0.07726532, -0.15595084, 0.93615172, 0.3325…
## $ PC109 <dbl> -2.43593309, -0.69609700, -1.28720179, 0.77425434, -1.59…
## $ PC110 <dbl> -0.88206162, 0.48866967, 0.38565580, 0.18169790, -0.7218…
## $ PC111 <dbl> -0.15372249, 1.11225624, 0.53570294, -0.13551887, 1.8560…
## $ PC112 <dbl> -0.4692114, -0.8183311, 0.2138384, 0.3627767, -0.1275456…
## $ PC113 <dbl> -1.27959570, 1.55485971, -0.07274464, -0.59437228, 2.176…
## $ PC114 <dbl> 0.56227707, 1.18945574, 0.59030831, 0.19136660, -0.78010…
## $ PC115 <dbl> -0.90241352, 0.31208834, 0.61634255, -0.47254676, 0.6825…
## $ PC116 <dbl> -0.551916042, -0.543162422, 2.071059902, -0.196954778, 0…
## $ PC117 <dbl> -0.574760750, -0.346781722, -0.768897535, -0.224909081, …
## $ PC118 <dbl> -1.12799445, 1.50131059, -1.30047289, 0.40925606, -0.184…
## $ PC119 <dbl> -0.11286362, -0.17380818, 0.94883651, 0.11296578, -0.594…
## $ PC120 <dbl> -0.043648918, -0.691727458, -0.581224416, -0.373860196, …
## $ PC121 <dbl> -0.06955683, 1.84391264, -0.38489307, 0.14888605, 0.2595…
## $ PC122 <dbl> -0.43319820, 1.00740912, 0.86550622, -0.07968793, -0.333…
## $ PC123 <dbl> 0.67739741, -0.74384343, -0.39486819, 0.08830781, 1.2858…
## $ PC124 <dbl> 0.29405693, -0.08603429, 1.96144662, -1.02242457, 0.1152…
## $ PC125 <dbl> -0.50489899, 2.32060427, -0.46141710, 0.08393367, -0.162…
## $ PC126 <dbl> 0.58186873, 0.62872774, -0.99395680, 0.23513235, -1.1367…
## $ PC127 <dbl> 0.05332216, 1.11184104, 0.82267039, 0.39604248, 0.117065…
## $ PC128 <dbl> -0.44777579, -1.97178543, 1.07818217, -0.18580340, -0.17…
## $ PC129 <dbl> -0.50689285, -1.41521549, 0.86671746, 0.36039933, 0.6627…
## $ PC130 <dbl> 0.21249640, -0.77755442, -0.06605365, -0.45650700, -0.54…
## $ PC131 <dbl> 0.002318063, 0.353422079, -0.682594405, 0.316645043, 0.7…
## $ PC132 <dbl> -0.807333738, -0.438413053, -0.227659805, -0.409797866, …
## $ PC133 <dbl> -0.74184284, -0.04890875, -0.55329175, -0.73380161, -0.6…
## $ PC134 <dbl> -0.44436782, -0.55781138, -0.63206478, 0.17875853, 0.432…
## $ ceo_dismissal <fct> dismissed, dismissed, dismissed, dismissed, dismissed, d…
# Boosted-tree classifier; the four hyperparameters are left for tuning
xgboost_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the model with its preprocessing recipe
xgboost_workflow <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_rec)
# Register a parallel backend for the resampling loop
doParallel::registerDoParallel()
set.seed(65743)
# Evaluate 5 candidate hyperparameter sets on the CV folds, keeping the
# out-of-fold predictions for later diagnostics
xgboost_tune <- xgboost_workflow %>%
  tune_grid(
    resamples = data_cv,
    grid = 5,
    control = control_grid(save_pred = TRUE)
  )
xgboost_tune %>%
  collect_metrics()
## # A tibble: 15 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 885 4 1 0.0670 accuracy binary 0.771 10 0.00716
## 2 885 4 1 0.0670 brier_class binary 0.156 10 0.00391
## 3 885 4 1 0.0670 roc_auc binary 0.816 10 0.00960
## 4 541 16 9 0.0266 accuracy binary 0.818 10 0.00554
## 5 541 16 9 0.0266 brier_class binary 0.128 10 0.00353
## 6 541 16 9 0.0266 roc_auc binary 0.836 10 0.00786
## 7 325 18 10 0.00276 accuracy binary 0.762 10 0.0105
## 8 325 18 10 0.00276 brier_class binary 0.179 10 0.00261
## 9 325 18 10 0.00276 roc_auc binary 0.791 10 0.0104
## 10 1754 32 13 0.00495 accuracy binary 0.814 10 0.00566
## 11 1754 32 13 0.00495 brier_class binary 0.131 10 0.00347
## 12 1754 32 13 0.00495 roc_auc binary 0.834 10 0.00823
## 13 1312 38 7 0.141 accuracy binary 0.819 10 0.00646
## 14 1312 38 7 0.141 brier_class binary 0.137 10 0.00440
## 15 1312 38 7 0.141 roc_auc binary 0.832 10 0.00709
## # ℹ 1 more variable: .config <chr>
# Per-fold ROC curves for the best candidate only. Without the `parameters`
# filter, collect_predictions() returns the out-of-fold predictions of every
# tuning candidate, and grouping by fold alone would pool different models
# into each curve.
collect_predictions(
  xgboost_tune,
  parameters = select_best(xgboost_tune, metric = "roc_auc")
) %>%
  group_by(id) %>%
  roc_curve(ceo_dismissal, .pred_dismissed) %>%
  autoplot()
# Finalize the workflow with the candidate that has the best cross-validated
# ROC AUC, then fit on the training set and evaluate once on the test set.
# ROC AUC is used instead of accuracy because the outcome is imbalanced
# (roughly 80% "not_dis"), so accuracy barely separates the candidates from
# always predicting the majority class.
xgboost_last <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "roc_auc")) %>%
  last_fit(data_split)
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.827 Preprocessor1_Model1
## 2 roc_auc binary 0.834 Preprocessor1_Model1
## 3 brier_class binary 0.132 Preprocessor1_Model1
# Confusion matrix of the final model on the held-out test set
xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = ceo_dismissal, estimate = .pred_class) %>%
  autoplot()
# Variable importance from the underlying fitted xgboost engine
vip(workflows::extract_fit_engine(xgboost_last))
# Conclusion
Normalization and PCA: I added normalization and principal
component analysis (PCA) to reduce dimensionality and eliminate
redundant variance in numeric features. This helped streamline the model
and reduce noise.
Improved text preprocessing: I continued using TF-IDF on the notes field but also fine-tuned the token filter to limit it to the top 100 tokens, reducing overfitting risk.
Stronger feature engineering: I used step_other() on high-cardinality categorical variables like coname and exec_fullname, which helped simplify the model without losing important patterns.
SMOTE balancing: As before, I addressed the class imbalance using SMOTE, but this step became even more effective when paired with cleaner data and reduced dimensionality.