The goal is to predict CEO dismissal (ceo_dismissal) using the TidyTuesday CEO departures dataset.

Import Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2
## ✔ recipes      1.1.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(textrecipes)
library(tidytext)
library(usemodels)
library(xgboost)
## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice
library(themis)
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: iterators
## Loading required package: parallel
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>% skimr::skim()
Data summary
Name Piped data
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627

Clean Dataset

data_clean <- data %>%
    
    # Clean the target
    filter(!is.na(ceo_dismissal)) %>%
    mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not_dis")) %>%
    mutate(ceo_dismissal = as.factor(ceo_dismissal)) %>%
    
    # Address too many missing values
    select(-still_there, -interim_coceo, -eight_ks, -notes) %>%
    
    # Remove irrelevant variables
    select(-`_merge`, -sources) %>%
    
    # Remove variables that can't be used
    select(-departure_code) %>%
    
    # Remove redundant variables
    select(-cik, -gvkey, -fyear, -leftofc) %>%
    
    # Remove duplicates in the id variable
    distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
    
    # Remove the erroneous fyear_gone value of 2997
    filter(fyear_gone < 2025) %>%
    
    # Convert discrete numeric columns to factors
    mutate(across(c(tenure_no_ceodb, fyear_gone), as.factor)) %>%

    # Convert character columns to factors
    mutate(across(where(is.character), as.factor)) %>%

    # Omit missing values
    na.omit()
    
data_clean %>% skimr::skim()
Data summary
Name Piped data
Number of rows 7475
Number of columns 8
_______________________
Column type frequency:
factor 5
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
coname 0 1 FALSE 3427 BAR: 8, CLA: 8, FED: 8, GRE: 8
exec_fullname 0 1 FALSE 6975 Joh: 4, Mel: 4, Alb: 3, Ami: 3
ceo_dismissal 0 1 FALSE 2 not: 5992, dis: 1483
tenure_no_ceodb 0 1 FALSE 3 1: 7289, 2: 179, 3: 7
fyear_gone 0 1 FALSE 34 200: 379, 199: 351, 200: 334, 200: 321

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1 5570.32 25757.33 1 2175.5 4326 6579.5 559044 ▇▁▁▁▁
co_per_rol 0 1 21446.53 16350.34 -1 6981.0 18269 33418.5 64601 ▇▅▅▂▁
max_tenure_ceodb 0 1 1.05 0.23 1 1.0 1 1.0 4 ▇▁▁▁▁

Explore Data

data_clean %>% count(ceo_dismissal)
## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <fct>         <int>
## 1 dismissed      1483
## 2 not_dis        5992
data_clean %>%
    ggplot(aes(ceo_dismissal)) +
    geom_bar()
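
Only about 20% of the departures end in dismissal, so the classes are imbalanced. A quick sketch (using only packages already loaded) to quantify this:

data_clean %>%
    count(ceo_dismissal) %>%
    mutate(prop = n / sum(n))

This imbalance is why the recipe later applies step_smote() to upsample the minority class.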

ceo_dismissal vs. fyear_gone

data_clean %>%
    ggplot(aes(x = fyear_gone, group = ceo_dismissal)) +
    geom_boxplot()

correlation plot

# Step 1: Binarize
data_binarized <- data_clean %>%
    select(-dismissal_dataset_id) %>%
    na.omit() %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 7,475
## Columns: 44
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `co_per_rol__-Inf_6981`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ co_per_rol__6981_18269      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__18269_33418.5   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ co_per_rol__33418.5_Inf     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__dismissed    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ ceo_dismissal__not_dis      <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1993            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ fyear_gone__1994            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1995            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ fyear_gone__1996            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1997            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1998            <dbl> 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__1999            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2002            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2003            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2004            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2006            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2008            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2009            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2010            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2013            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2017            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2018            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fyear_gone__2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-OTHER`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlation
data_correlation <- data_binarized %>%
    correlate(ceo_dismissal__dismissed)

data_correlation
## # A tibble: 44 × 3
##    feature          bin         correlation
##    <fct>            <chr>             <dbl>
##  1 ceo_dismissal    dismissed        1     
##  2 ceo_dismissal    not_dis         -1     
##  3 co_per_rol       -Inf_6981       -0.0595
##  4 max_tenure_ceodb 1                0.0580
##  5 co_per_rol       33418.5_Inf      0.0559
##  6 max_tenure_ceodb 2               -0.0536
##  7 fyear_gone       1999            -0.0391
##  8 fyear_gone       2002             0.0374
##  9 fyear_gone       2003             0.0296
## 10 fyear_gone       2009             0.0289
## # ℹ 34 more rows
# Step 3: Plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
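
The warning comes from ggrepel dropping labels that overlap. If you want every point labeled, one option (assuming you can tolerate denser labels) is to raise ggrepel's documented global limit before plotting:

options(ggrepel.max.overlaps = Inf)  # allow ggrepel to draw all labels
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()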

Model Building

Split Data

set.seed(1234)

data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5044/562]> Fold01
##  2 <split [5044/562]> Fold02
##  3 <split [5045/561]> Fold03
##  4 <split [5045/561]> Fold04
##  5 <split [5046/560]> Fold05
##  6 <split [5046/560]> Fold06
##  7 <split [5046/560]> Fold07
##  8 <split [5046/560]> Fold08
##  9 <split [5046/560]> Fold09
## 10 <split [5046/560]> Fold10

Preprocess Data

xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
    update_role(dismissal_dataset_id, new_role = "ID") %>%
    step_other(exec_fullname, coname, threshold = 0.01) %>%
    step_dummy(all_nominal_predictors()) %>% 
    step_YeoJohnson(max_tenure_ceodb) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_smote(ceo_dismissal)

xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 8,988
## Columns: 41
## $ dismissal_dataset_id <dbl> 84, 85, 119, 162, 243, 244, 263, 280, 300, 346, 3…
## $ co_per_rol           <dbl> -1.314842, -1.314781, -1.312884, -1.309703, -1.30…
## $ max_tenure_ceodb     <dbl> -0.2110499, -0.2110499, -0.2110499, -0.2110499, -…
## $ ceo_dismissal        <fct> dismissed, dismissed, dismissed, dismissed, dismi…
## $ coname_other         <dbl> 0.03535534, 0.03535534, 0.03535534, 0.03535534, 0…
## $ exec_fullname_other  <dbl> 0.02671897, 0.02671897, 0.02671897, 0.02671897, 0…
## $ tenure_no_ceodb_X2   <dbl> -0.1606112, -0.1606112, -0.1606112, -0.1606112, -…
## $ tenure_no_ceodb_X3   <dbl> -0.02671897, -0.02671897, -0.02671897, -0.0267189…
## $ fyear_gone_X1988     <dbl> -0.01335591, -0.01335591, -0.01335591, -0.0133559…
## $ fyear_gone_X1990     <dbl> -0.01888979, -0.01888979, -0.01888979, -0.0188897…
## $ fyear_gone_X1991     <dbl> -0.01888979, -0.01888979, -0.01888979, -0.0188897…
## $ fyear_gone_X1992     <dbl> -0.02671897, -0.02671897, -0.02671897, -0.0267189…
## $ fyear_gone_X1993     <dbl> 7.9179139, -0.1262734, -0.1262734, -0.1262734, 7.…
## $ fyear_gone_X1994     <dbl> -0.1522343, -0.1522343, -0.1522343, -0.1522343, -…
## $ fyear_gone_X1995     <dbl> -0.1800187, -0.1800187, -0.1800187, -0.1800187, -…
## $ fyear_gone_X1996     <dbl> -0.1762889, -0.1762889, 5.6714953, -0.1762889, -0…
## $ fyear_gone_X1997     <dbl> -0.205882, -0.205882, -0.205882, -0.205882, -0.20…
## $ fyear_gone_X1998     <dbl> -0.2068214, 4.8342263, -0.2068214, -0.2068214, -0…
## $ fyear_gone_X1999     <dbl> -0.2249237, -0.2249237, -0.2249237, -0.2249237, -…
## $ fyear_gone_X2000     <dbl> -0.2257975, -0.2257975, -0.2257975, -0.2257975, -…
## $ fyear_gone_X2001     <dbl> -0.208224, -0.208224, -0.208224, -0.208224, -0.20…
## $ fyear_gone_X2002     <dbl> -0.181072, -0.181072, -0.181072, -0.181072, -0.18…
## $ fyear_gone_X2003     <dbl> -0.182120, -0.182120, -0.182120, -0.182120, -0.18…
## $ fyear_gone_X2004     <dbl> -0.1872832, -0.1872832, -0.1872832, -0.1872832, -…
## $ fyear_gone_X2005     <dbl> -0.2114665, -0.2114665, -0.2114665, 4.7280382, -0…
## $ fyear_gone_X2006     <dbl> -0.2035174, -0.2035174, -0.2035174, -0.2035174, -…
## $ fyear_gone_X2007     <dbl> -0.2119263, -0.2119263, -0.2119263, -0.2119263, -…
## $ fyear_gone_X2008     <dbl> -0.2169314, -0.2169314, -0.2169314, -0.2169314, -…
## $ fyear_gone_X2009     <dbl> -0.1842003, -0.1842003, -0.1842003, -0.1842003, -…
## $ fyear_gone_X2010     <dbl> -0.1831627, -0.1831627, -0.1831627, -0.1831627, -…
## $ fyear_gone_X2011     <dbl> -0.2011295, -0.2011295, -0.2011295, -0.2011295, -…
## $ fyear_gone_X2012     <dbl> -0.204466, -0.204466, -0.204466, -0.204466, -0.20…
## $ fyear_gone_X2013     <dbl> -0.1977456, -0.1977456, -0.1977456, -0.1977456, -…
## $ fyear_gone_X2014     <dbl> -0.1883011, -0.1883011, -0.1883011, -0.1883011, -…
## $ fyear_gone_X2015     <dbl> -0.2119263, -0.2119263, -0.2119263, -0.2119263, -…
## $ fyear_gone_X2016     <dbl> -0.2025651, -0.2025651, -0.2025651, -0.2025651, -…
## $ fyear_gone_X2017     <dbl> -0.1996852, -0.1996852, -0.1996852, -0.1996852, -…
## $ fyear_gone_X2018     <dbl> -0.2030417, -0.2030417, -0.2030417, -0.2030417, -…
## $ fyear_gone_X2019     <dbl> -0.1657943, -0.1657943, -0.1657943, -0.1657943, -…
## $ fyear_gone_X2020     <dbl> -0.07576223, -0.07576223, -0.07576223, -0.0757622…
## $ fyear_gone_X2021     <dbl> -0.03272976, -0.03272976, -0.03272976, -0.0327297…

Specify Model

xgboost_spec <- 
  boost_tree(trees = tune(), tree_depth = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec) 
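
Because mtry depends on the number of predictors, it cannot be fully specified until the recipe has been prepped; tune_grid() finalizes it automatically (see the message below). A sketch for inspecting which tuning parameters still need values:

xgboost_workflow %>%
    hardhat::extract_parameter_set_dials()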

Tune Hyperparameters

# tune_grid() below uses a 5-candidate space-filling grid (grid = 5);
# mtry is finalized automatically from the training data.

doParallel::registerDoParallel()

set.seed(17375)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 5,
            control = control_grid(save_pred = TRUE))
## i Creating pre-processing data to finalize unknown parameter: mtry

Model Evaluation

Identify optimal values for hyperparameters

collect_metrics(xgboost_tune)
## # A tibble: 15 × 11
##     mtry trees min_n tree_depth learn_rate .metric     .estimator  mean     n
##    <int> <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>
##  1     1  1674    21          8    0.0166  accuracy    binary     0.491    10
##  2     1  1674    21          8    0.0166  brier_class binary     0.245    10
##  3     1  1674    21          8    0.0166  roc_auc     binary     0.561    10
##  4    11   346    39         12    0.128   accuracy    binary     0.573    10
##  5    11   346    39         12    0.128   brier_class binary     0.239    10
##  6    11   346    39         12    0.128   roc_auc     binary     0.569    10
##  7    20   852    17          2    0.00405 accuracy    binary     0.470    10
##  8    20   852    17          2    0.00405 brier_class binary     0.244    10
##  9    20   852    17          2    0.00405 roc_auc     binary     0.559    10
## 10    29  1497    27          6    0.0475  accuracy    binary     0.576    10
## 11    29  1497    27          6    0.0475  brier_class binary     0.240    10
## 12    29  1497    27          6    0.0475  roc_auc     binary     0.572    10
## 13    37   659     8         14    0.00288 accuracy    binary     0.548    10
## 14    37   659     8         14    0.00288 brier_class binary     0.238    10
## 15    37   659     8         14    0.00288 roc_auc     binary     0.573    10
## # ℹ 2 more variables: std_err <dbl>, .config <chr>
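
To pull out just the top candidates for a single metric, show_best() is convenient; a sketch (n sets how many rows to return):

xgboost_tune %>%
    show_best(metric = "roc_auc", n = 3)
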
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(ceo_dismissal, .pred_dismissed) %>%
    autoplot()

Fit the final model

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.579 Preprocessor1_Model1
## 2 roc_auc     binary         0.581 Preprocessor1_Model1
## 3 brier_class binary         0.239 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
    autoplot()
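
Given the class imbalance, accuracy alone can be misleading. The same confusion matrix can be summarized into sensitivity, specificity, and related metrics via yardstick's summary() method; a sketch:

collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
    summary()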

Variable importance

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()
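
If you need the importance scores themselves rather than a plot, vip::vi() returns them as a tibble; a sketch:

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip::vi()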

Conclusion

The final XGBoost model reached an accuracy of 0.579 and an ROC AUC of 0.581 on the held-out test set, improving on the previous model's accuracy of 0.463 and AUC of 0.571.
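
To score new data with the finalized model, the fitted workflow can be extracted from the last_fit() result; a sketch (dplyr::slice is written out because xgboost masks slice):

fitted_wf <- tune::extract_workflow(xgboost_last)
fitted_wf %>%
    predict(new_data = dplyr::slice(data_test, 1:5), type = "prob")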