The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. Goal is to predict CEO departure (ceo_dismissal) by using the departures dataset.
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.1
library(correlationfunnel)
## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)
data <- read.csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv")
skimr::skim(data)
| Name | data |
| Number of rows | 9423 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
| exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
| interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
| leftofc | 1802 | 0.81 | 20 | 20 | 0 | 3627 | 0 |
| still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
| notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
| sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
| eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
| X_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
| gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
| fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
| co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
| departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
| ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
| tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
| max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
| fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
| cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
* interim_coceo (91% missing)
* eight_ks (52% missing)
* departure_code (categorical but currently numeric)
* interim_coceo (needs to be a factor)
* leftofc (needs to be a factor)
* still_there (needs to be a factor)
* X-merge
* coname
* exec_fullname
* sources
* ceo_dismissal
* dismissal_dataset_id
* gvkey
* cik
# Identify factor variables
factors_vec <- c("departure_code", "leftofc", "still_there")
# Clean the data and ensure ceo_dismissal is a factor
data_clean <- data %>%
# Convert ceo_dismissal and factors to proper types
mutate(ceo_dismissal = as.factor(ceo_dismissal),
across(all_of(factors_vec), as.factor)) %>%
# Remove rows with missing values in key columns
drop_na(ceo_dismissal, tenure_no_ceodb, fyear_gone, departure_code, leftofc, still_there) %>%
# Drop zero-variance and irrelevant columns
select(-c(X_merge, interim_coceo, eight_ks, dismissal_dataset_id, gvkey, cik, coname, exec_fullname, sources))
skimr::skim(data_clean)
| Name | data_clean |
| Number of rows | 501 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 4 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| notes | 0 | 1 | 109 | 2013 | 0 | 500 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| departure_code | 0 | 1 | FALSE | 3 | 7: 496, 3: 4, 5: 1, 1: 0 |
| ceo_dismissal | 0 | 1 | FALSE | 2 | 0: 497, 1: 4 |
| leftofc | 0 | 1 | FALSE | 465 | 199: 3, 200: 3, 199: 2, 199: 2 |
| still_there | 0 | 1 | FALSE | 44 | 01a: 33, 29d: 28, 01j: 26, 01m: 26 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| fyear | 0 | 1 | 2001.63 | 6.93 | 1993 | 1996 | 1999 | 2006 | 2018 | ▇▅▃▁▂ |
| co_per_rol | 0 | 1 | 13989.14 | 13410.46 | 900 | 4323 | 7354 | 20346 | 63294 | ▇▂▁▁▁ |
| tenure_no_ceodb | 0 | 1 | 1.01 | 0.13 | 1 | 1 | 1 | 1 | 3 | ▇▁▁▁▁ |
| max_tenure_ceodb | 0 | 1 | 1.02 | 0.21 | 1 | 1 | 1 | 1 | 4 | ▇▁▁▁▁ |
| fyear_gone | 0 | 1 | 2004.94 | 44.96 | 1990 | 1998 | 2000 | 2007 | 2997 | ▇▁▁▁▁ |
# Bar plot for CEO Dismissal
data_clean %>%
ggplot(aes(ceo_dismissal)) +
geom_bar() +
labs(title = "CEO Dismissal Count", x = "CEO Dismissal", y = "Count")
data_clean %>%
ggplot(aes(x = ceo_dismissal, y = tenure_no_ceodb)) +
geom_boxplot() +
labs(title = "CEO Dismissal vs. Tenure", x = "CEO Dismissal", y = "CEO Tenure")
# Step 1: Binarize the data
data_binarized <- data_clean %>%
binarize()
data_binarized %>% glimpse()
## Rows: 501
## Columns: 55
## $ `fyear__-Inf_1996` <dbl> …
## $ fyear__1996_1999 <dbl> …
## $ fyear__1999_2006 <dbl> …
## $ fyear__2006_Inf <dbl> …
## $ `co_per_rol__-Inf_4323` <dbl> …
## $ co_per_rol__4323_7354 <dbl> …
## $ co_per_rol__7354_20346 <dbl> …
## $ co_per_rol__20346_Inf <dbl> …
## $ departure_code__7 <dbl> …
## $ `departure_code__-OTHER` <dbl> …
## $ ceo_dismissal__0 <dbl> …
## $ `ceo_dismissal__-OTHER` <dbl> …
## $ tenure_no_ceodb__1 <dbl> …
## $ `tenure_no_ceodb__-OTHER` <dbl> …
## $ max_tenure_ceodb__1 <dbl> …
## $ `max_tenure_ceodb__-OTHER` <dbl> …
## $ `fyear_gone__-Inf_1998` <dbl> …
## $ fyear_gone__1998_2000 <dbl> …
## $ fyear_gone__2000_2007 <dbl> …
## $ fyear_gone__2007_Inf <dbl> …
## $ `leftofc__1996-08-01T00:00:00Z` <dbl> …
## $ `leftofc__-OTHER` <dbl> …
## $ still_there__01apr2021 <dbl> …
## $ still_there__01feb2021 <dbl> …
## $ still_there__01jan2021 <dbl> …
## $ still_there__01jul2021 <dbl> …
## $ still_there__01jun2021 <dbl> …
## $ still_there__01mar2021 <dbl> …
## $ still_there__01may2021 <dbl> …
## $ still_there__12apr2020 <dbl> …
## $ still_there__12aug2020 <dbl> …
## $ still_there__12dec2020 <dbl> …
## $ still_there__12jul2020 <dbl> …
## $ still_there__12jun2020 <dbl> …
## $ still_there__12mar2020 <dbl> …
## $ still_there__12nov2020 <dbl> …
## $ still_there__12oct2020 <dbl> …
## $ still_there__12sep2020 <dbl> …
## $ still_there__13dec2020 <dbl> …
## $ still_there__14dec2020 <dbl> …
## $ still_there__14jan2021 <dbl> …
## $ still_there__17dec2020 <dbl> …
## $ still_there__20dec2020 <dbl> …
## $ still_there__21dec2020 <dbl> …
## $ still_there__24dec2020 <dbl> …
## $ still_there__25dec2020 <dbl> …
## $ still_there__26dec2020 <dbl> …
## $ still_there__27dec2020 <dbl> …
## $ still_there__28dec2020 <dbl> …
## $ still_there__29dec2020 <dbl> …
## $ still_there__30dec2020 <dbl> …
## $ still_there__31dec2020 <dbl> …
## $ `still_there__-OTHER` <dbl> …
## $ `notes__Constantine_S._Macricostas_is_Chairman_of_the_Board_and_founder_of_the_Company._Mr._Macricostas_was_Executive_Chairman_of_the_Company_until_January_20,_2018._Mr._Macricostas_previously_served_as_Chief_Executive_Officer_of_the_Company_on_three_different_occasions_from_1974_until_August_1997,_from_February_2004_to_June_2005,_and_from_April_2009_until_May_2015._Mr._Macricostas_is_a_former_director_of_RagingWire_Data_Centers,_Inc.,_(“RagingWire”)._Mr._Macricostas_is_the_father_of_George_Macricostas._Mr._Macricostas’_knowledge_of_the_Company_and_its_operations,_as_well_as,_the_industry_is_invaluable_to_the_Board_of_Directors_in_evaluating_and_directing_the_Company’s_future._Through_his_long_service_to_the_Company_and_experience_in_the_photomask_industry,_he_has_developed_extensive_knowledge_in_the_areas_of_leadership,_safety,_risk_oversight,_management,_and_corporate_governance,_each_of_which_provides_great_value_to_the_Board_of_Directors._Mr._Macricostas_is_a_member_of_the_Cyber_Security_Committee_of_the_Board.` <dbl> …
## $ `notes__-OTHER` <dbl> …
# Correlation for both categories of ceo_dismissal
data_correlation_0 <- data_binarized %>%
correlate(`ceo_dismissal__0`) # Correlation for one class
data_correlation_other <- data_binarized %>%
correlate(`ceo_dismissal__-OTHER`) # Correlation for the other class
## Warning: correlate(): [Data Imbalance Detected] Consider sampling to balance the classes more than 5%
## Column with imbalance: ceo_dismissal__-OTHER
# Display the correlation results
data_correlation_0
## # A tibble: 55 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal 0 1
## 2 ceo_dismissal -OTHER -1
## 3 departure_code 7 0.894
## 4 departure_code -OTHER -0.894
## 5 leftofc 1996-08-01T00:00:00Z -0.284
## 6 leftofc -OTHER 0.284
## 7 still_there 12jul2020 -0.268
## 8 fyear_gone -Inf_1998 -0.130
## 9 co_per_rol 4323_7354 -0.104
## 10 still_there 12sep2020 -0.0996
## # ℹ 45 more rows
data_correlation_other
## # A tibble: 55 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal 0 -1
## 2 ceo_dismissal -OTHER 1
## 3 departure_code 7 -0.894
## 4 departure_code -OTHER 0.894
## 5 leftofc 1996-08-01T00:00:00Z 0.284
## 6 leftofc -OTHER -0.284
## 7 still_there 12jul2020 0.268
## 8 fyear_gone -Inf_1998 0.130
## 9 co_per_rol 4323_7354 0.104
## 10 still_there 12sep2020 0.0996
## # ℹ 45 more rows
# Step 3: Plot the correlation funnel
data_correlation_0 %>%
correlationfunnel::plot_correlation_funnel() +
labs(title = "Correlation Funnel for CEO Dismissal (Class 0)")
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
data_correlation_other %>%
correlationfunnel::plot_correlation_funnel() +
labs(title = "Correlation Funnel for CEO Dismissal (Class Other)")
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps