Goal: To Predict CEO Departures

#Import Data

library(tidyverse)

## Warning: package 'purrr' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.3

## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.7     ✔ rsample      1.2.1
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ infer        1.0.7     ✔ workflows    1.2.0
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.0     ✔ yardstick    1.3.2
## ✔ recipes      1.1.1

## Warning: package 'dials' was built under R version 4.4.3

## Warning: package 'infer' was built under R version 4.4.3

## Warning: package 'modeldata' was built under R version 4.4.3

## Warning: package 'parsnip' was built under R version 4.4.3

## Warning: package 'recipes' was built under R version 4.4.3

## Warning: package 'rsample' was built under R version 4.4.3

## Warning: package 'tune' was built under R version 4.4.3

## Warning: package 'workflows' was built under R version 4.4.3

## Warning: package 'workflowsets' was built under R version 4.4.3

## Warning: package 'yardstick' was built under R version 4.4.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()

library(rsample)
library(purrr)
library(recipes)
library(themis)

## Warning: package 'themis' was built under R version 4.4.3

library(usemodels)

## Warning: package 'usemodels' was built under R version 4.4.3

library(doParallel)

## Warning: package 'doParallel' was built under R version 4.4.3

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 4.4.3

## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: iterators

## Warning: package 'iterators' was built under R version 4.4.3

## Loading required package: parallel

library(dplyr)
library(textrecipes)

## Warning: package 'textrecipes' was built under R version 4.4.3

library(workflows)


data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv')

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Clean Data

skimr::skim(data)

Data summary
Name	data
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

factors_vec <- data %>% select(dismissal_dataset_id,departure_code,ceo_dismissal,fyear_gone,max_tenure_ceodb,tenure_no_ceodb,coname,exec_fullname,) %>%
  names()

data_clean <- data %>%
    # Drop zero-variance variables, missing values, and non predictive values
  select(-c(`_merge`,interim_coceo,still_there,eight_ks,departure_code,gvkey,cik,co_per_rol,leftofc,fyear,sources)) %>%
  filter(!fyear_gone == 2997,) %>%
  na.omit() %>%
  #Remove Duplicates in ID
distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  
   # Address factors imported as numeric
  mutate(across(where(is.character), as.factor))%>%
  
  mutate(across(where(is.logical), as.factor)) %>%
  
  # Convert ceo_dismissal to factor
  mutate(ceo_dismissal = as.factor(ceo_dismissal))

#Explore Data

data_clean %>% count(ceo_dismissal)

## # A tibble: 2 × 2
##   ceo_dismissal     n
##   <fct>         <int>
## 1 0              5976
## 2 1              1482

data_clean %>%
  ggplot(aes(ceo_dismissal))+
  geom_bar()

#CEO Dismissal vs year

data_clean %>%
  ggplot(aes(ceo_dismissal, fyear_gone)) +
  geom_boxplot()

# Step 1: Binarize the data
data_binarized <- data_clean %>%
  select(-notes,-dismissal_dataset_id) %>%
  binarize()

data_binarized %>% glimpse()

## Rows: 7,458
## Columns: 16
## $ coname__BARRICK_GOLD_CORP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `coname__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ exec_fullname__John_W._Rowe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `exec_fullname__-OTHER`     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ceo_dismissal__0            <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ ceo_dismissal__1            <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tenure_no_ceodb__1          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tenure_no_ceodb__2          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tenure_no_ceodb__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ max_tenure_ceodb__1         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ max_tenure_ceodb__2         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `max_tenure_ceodb__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `fyear_gone__-Inf_2000`     <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ fyear_gone__2000_2006       <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, …
## $ fyear_gone__2006_2013       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ fyear_gone__2013_Inf        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Step 2: Correlation
data_correlation <- data_binarized %>%
  correlate(ceo_dismissal__1)

#Step #3: Plot
data_correlation %>%
  correlationfunnel::plot_correlation_funnel()

#Model Building

Split Data

set.seed(1234)
 #data_clean <- data_clean %>% sample_n(100)
#data_clean <- data_clean %>% group_by(ceo_dismissal) %>% sample_n(50)


data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train<- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = ceo_dismissal)

##Preprocess Data

xgboost_recipe <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
   update_role(dismissal_dataset_id, new_role = "ID") %>%
   step_tokenize(notes) %>%
   step_tokenfilter(notes, max_tokens = 100) %>%
   step_tfidf(notes) %>%
  step_other(coname, exec_fullname, threshold = .001) %>%
 step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
   step_smote(ceo_dismissal)
  
 

  #update_role(dismissal_dataset_id, new_role = "ID") %>%
   #step_tokenize(notes) %>%
   #step_tokenfilter(notes, max_tokens = 100) %>%
   #step_tfidf(notes) %>%
  # step_other(coname, exec_fullname) %>%
# step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
   #step_smote(ceo_dismissal)
  

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 8,964
## Columns: 136
## $ dismissal_dataset_id               <dbl> 12, 31, 43, 51, 63, 75, 76, 80, 99,…
## $ tenure_no_ceodb                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ max_tenure_ceodb                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ fyear_gone                         <dbl> 1998, 1998, 2002, 1997, 1998, 1995,…
## $ ceo_dismissal                      <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_1                      <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_1999                   <dbl> 0.0000000, 0.1200750, 0.0000000, 0.…
## $ tfidf_notes_2000                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_a                      <dbl> 0.00000000, 0.04090517, 0.00000000,…
## $ tfidf_notes_acquisition            <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_after                  <dbl> 0.07383147, 0.00000000, 0.00000000,…
## $ tfidf_notes_agreement              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_also                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_an                     <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_and                    <dbl> 0.03616386, 0.10397110, 0.10397110,…
## $ tfidf_notes_announced              <dbl> 0.07277316, 0.00000000, 0.00000000,…
## $ tfidf_notes_april                  <dbl> 0.00000000, 0.00000000, 0.16976660,…
## $ tfidf_notes_as                     <dbl> 0.00000000, 0.00000000, 0.05956175,…
## $ tfidf_notes_at                     <dbl> 0.00000000, 0.07172203, 0.00000000,…
## $ tfidf_notes_based                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_be                     <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_been                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_billion                <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_board                  <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_business               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_but                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_by                     <dbl> 0.00000000, 0.07113804, 0.00000000,…
## $ tfidf_notes_ceo                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_chairman               <dbl> 0.05123107, 0.04909645, 0.14728934,…
## $ tfidf_notes_chief                  <dbl> 0.05200631, 0.04983938, 0.07475908,…
## $ tfidf_notes_co                     <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_company                <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ `tfidf_notes_company's`            <dbl> 0.10316347, 0.09886499, 0.00000000,…
## $ tfidf_notes_corp                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_corporation            <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_december               <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_director               <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_directors              <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_down                   <dbl> 0.00000000, 0.00000000, 0.12386560,…
## $ tfidf_notes_effective              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_executive              <dbl> 0.04923937, 0.04718773, 0.07078159,…
## $ tfidf_notes_financial              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_following              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_for                    <dbl> 0.00000000, 0.05594219, 0.00000000,…
## $ tfidf_notes_from                   <dbl> 0.00000000, 0.05609840, 0.00000000,…
## $ tfidf_notes_group                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_had                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_has                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_have                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_he                     <dbl> 0.05636087, 0.00000000, 0.00000000,…
## $ tfidf_notes_his                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_in                     <dbl> 0.04299439, 0.04120296, 0.12360889,…
## $ tfidf_notes_inc                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_into                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_is                     <dbl> 0.00000000, 0.00000000, 0.10662454,…
## $ tfidf_notes_it                     <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_its                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_january                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_july                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_june                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_march                  <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_may                    <dbl> 0.1102568, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_merger                 <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_million                <dbl> 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_notes_more                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_mr                     <dbl> 0.06678312, 0.00000000, 0.00000000,…
## $ tfidf_notes_named                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_new                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_not                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_nyse                   <dbl> 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_notes_of                     <dbl> 0.07020830, 0.06728296, 0.05046222,…
## $ tfidf_notes_officer                <dbl> 0.00000000, 0.00000000, 0.08211329,…
## $ tfidf_notes_on                     <dbl> 0.05784383, 0.00000000, 0.00000000,…
## $ tfidf_notes_operating              <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_over                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_position               <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_president              <dbl> 0.00000000, 0.00000000, 0.08296798,…
## $ tfidf_notes_resigned               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_retire                 <dbl> 0.09752930, 0.09346558, 0.00000000,…
## $ tfidf_notes_retired                <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_retirement             <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_role                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_said                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_served                 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_share                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_since                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_stepped                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_stock                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_notes_that                   <dbl> 0.05930096, 0.00000000, 0.00000000,…
## $ tfidf_notes_the                    <dbl> 0.16625028, 0.15932318, 0.00000000,…
## $ tfidf_notes_this                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_time                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_to                     <dbl> 0.00000000, 0.00000000, 0.05796779,…
## $ tfidf_notes_today                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_until                  <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_vice                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_was                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_when                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_which                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_who                    <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_will                   <dbl> 0.00000000, 0.06440856, 0.00000000,…
## $ tfidf_notes_with                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_would                  <dbl> 0.12278287, 0.00000000, 0.00000000,…
## $ tfidf_notes_year                   <dbl> 0.00000000, 0.00000000, 0.00000000,…
## $ tfidf_notes_years                  <dbl> 0.08693090, 0.08330878, 0.00000000,…
## $ coname_ALERIS.CORP                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_ALLEGHENY.TECHNOLOGIES.INC  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_ALTABA.INC                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_AMBAC.FINANCIAL.GROUP.INC   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_BIOLASE.INC                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_BORDERS.GROUP.INC           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_CALATLANTIC.GROUP.INC       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_CHRISTOPHER...BANKS.CORP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_CLAIRES.STORES.INC          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_CLECO.CORP                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_CNO.FINANCIAL.GROUP.INC     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_DUN...BRADSTREET.CORP       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_E.TRADE.FINANCIAL.CORP      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_EL.PASO.ELECTRIC.CO         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_EQT.CORP                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_FEDERAL.MOGUL.HOLDINGS.CORP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_ION.GEOPHYSICAL.CORP        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_MCDERMOTT.INTL.INC          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_NORTEL.NETWORKS.CORP        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_NTN.BUZZTIME.INC            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_ORBITAL.ATK.INC             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_PENNEY..J.C..CO             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_PG.E.CORP                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_READERS.DIGEST.ASSN.INC     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_SEARS.HOLDINGS.CORP         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_SUPERVALU.INC               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_US.AIRWAYS.GROUP.INC.OLD    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_WAUSAU.PAPER.CORP           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ coname_other                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ exec_fullname_John.W..Rowe         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ exec_fullname_other                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…

Specify Model

usemodels::use_xgboost(ceo_dismissal ~., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = ceo_dismissal ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(50226)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

xgboost_spec <- 
  boost_tree(trees = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec)

Tune Hyperparameters

doParallel::registerDoParallel()


set.seed(66771)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv, grid = 5,
              control = control_grid(save_pred = TRUE ))

## Warning: ! tune detected a parallel backend registered with foreach but no backend
##   registered with future.
## ℹ Support for parallel processing with foreach was soft-deprecated in tune
##   1.2.1.
## ℹ See ?parallelism (`?tune::parallelism()`) to learn more.

Model Evaluation

##Identify Optimal values for hyperparameters

collect_metrics(xgboost_tune)

## # A tibble: 15 × 7
##    trees .metric     .estimator  mean     n std_err .config             
##    <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>               
##  1     3 accuracy    binary     0.775    10 0.00570 Preprocessor1_Model1
##  2     3 brier_class binary     0.171    10 0.00193 Preprocessor1_Model1
##  3     3 roc_auc     binary     0.744    10 0.00943 Preprocessor1_Model1
##  4   476 accuracy    binary     0.827    10 0.00417 Preprocessor1_Model2
##  5   476 brier_class binary     0.135    10 0.00304 Preprocessor1_Model2
##  6   476 roc_auc     binary     0.844    10 0.00665 Preprocessor1_Model2
##  7   979 accuracy    binary     0.831    10 0.00386 Preprocessor1_Model3
##  8   979 brier_class binary     0.139    10 0.00286 Preprocessor1_Model3
##  9   979 roc_auc     binary     0.843    10 0.00664 Preprocessor1_Model3
## 10  1478 accuracy    binary     0.831    10 0.00431 Preprocessor1_Model4
## 11  1478 brier_class binary     0.141    10 0.00310 Preprocessor1_Model4
## 12  1478 roc_auc     binary     0.842    10 0.00661 Preprocessor1_Model4
## 13  2000 accuracy    binary     0.831    10 0.00394 Preprocessor1_Model5
## 14  2000 brier_class binary     0.141    10 0.00326 Preprocessor1_Model5
## 15  2000 roc_auc     binary     0.841    10 0.00669 Preprocessor1_Model5

collect_predictions(xgboost_tune) %>%
  group_by(id) %>%
  roc_curve(ceo_dismissal, .pred_1) %>%
  autoplot()

##Fit the model for the last time

xgboost_last <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
  last_fit(data_split)

## Warning: package 'xgboost' was built under R version 4.4.3

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.841 Preprocessor1_Model1
## 2 roc_auc     binary         0.853 Preprocessor1_Model1
## 3 brier_class binary         0.128 Preprocessor1_Model1

collect_predictions(xgboost_last) %>% yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
  autoplot()

##Variable Importance

library(vip)

## Warning: package 'vip' was built under R version 4.4.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgboost_last %>%
  workflows::extract_fit_engine() %>%
  vip()

In the previous model, the accuracy was .846 and the AUC was .855

I added a threshold of 10% to step other to try and reintroduce more data back into the dataset but it had no impact on the model as both the accuracy and the AUC remain unchanged. When I reduce the threshold down to 5% there is still no change. I tried once more to lower the threshold to 1% to see if I could get any other data reintroduced but there was still no change to the accuracy of the model or to the AUC curve.

Apply 7

Michael Toronto

2025-04-02

Split Data

Specify Model

Tune Hyperparameters

Model Evaluation