# Load the CEO departures data set (TidyTuesday, 2021-04-27) from GitHub.
departures <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv"
)
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of the raw data (missingness, unique counts,
# distribution statistics) to decide which cleaning steps are needed.
skimr::skim(departures)
Name | departures |
Number of rows | 9423 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 10 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
Missing Values * interim_coceo, still_there, notes (string variable), sources, eight_ks, departure_code, ceo_dismissal, fyear_gone, cik
Converting Numeric Variables to Factors * tenure_no_ceodb, max_tenure_ceodb, fyear_gone
Zero-variance and identifier variables * _merge (only one unique value), gvkey, dismissal_dataset_id (identifiers, not zero variance)
Character Variables * coname, exec_fullname, sources, eight_ks, notes (string variable and not factor)
Unbalanced Target Variable * ceo_dismissal # address in the recipe section with the step_smote function
Handling ID Variables * dismissal_dataset_id #dataset primary key
# Clean data: build the analysis table with `ceo_dismissal` as the target.
departures_clean <- departures %>%
  # Clean the target variable: drop unlabelled rows, then recode 1 / other
  # as the factor levels "dismissed" / "not_dis"
  filter(!is.na(ceo_dismissal)) %>%
  mutate(
    ceo_dismissal = as.factor(
      if_else(ceo_dismissal == 1, "dismissed", "not_dis")
    )
  ) %>%
  select(
    # Remove variables with too many missing values
    -c(interim_coceo, still_there, eight_ks),
    # Remove irrelevant variables
    -`_merge`, -sources,
    # Remove variables with info that only becomes available after the
    # departure (NOTE(review): presumably departure_code leaks the
    # target -- confirm against the data dictionary)
    -departure_code,
    # Remove redundant variables
    -c(gvkey, cik, co_per_rol, leftofc, fyear)
  ) %>%
  # Remove duplicates in dismissal_dataset_id, our id variable; the first row
  # of each id is kept
  distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
  # Remove the implausible year 2997 in fyear_gone. Rows with a missing
  # fyear_gone are dropped as well: filter() already discards rows where the
  # comparison is NA, the is.na() check only makes that explicit.
  filter(!is.na(fyear_gone), fyear_gone < 2025) %>%
  # Convert factors that are incorrectly imported as numeric variables
  mutate(
    across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor)
  ) %>%
  # Convert the remaining character columns to factors, except `notes`,
  # which is free text and must stay character
  mutate(across(where(is.character) & !notes, as.factor))
# Re-run the column summary on the cleaned data to verify the cleaning steps.
skimr::skim(departures_clean)
Name | departures_clean |
Number of rows | 7475 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 6 |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
notes | 17 | 1 | 5 | 3117 | 0 | 7448 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
coname | 0 | 1 | FALSE | 3427 | BAR: 8, CLA: 8, FED: 8, GRE: 8 |
exec_fullname | 0 | 1 | FALSE | 6975 | Joh: 4, Mel: 4, Alb: 3, Ami: 3 |
ceo_dismissal | 0 | 1 | FALSE | 2 | not: 5992, dis: 1483 |
tenure_no_ceodb | 0 | 1 | FALSE | 3 | 1: 7289, 2: 179, 3: 7 |
max_tenure_ceodb | 0 | 1 | FALSE | 4 | 1: 7138, 2: 319, 3: 15, 4: 3 |
fyear_gone | 0 | 1 | FALSE | 34 | 200: 379, 199: 351, 200: 334, 200: 321 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1 | 5570.32 | 25757.33 | 1 | 2175.5 | 4326 | 6579.5 | 559044 | ▇▁▁▁▁ |
# Tabulate the number of rows in each target class to check class balance.
count(departures_clean, ceo_dismissal)
## # A tibble: 2 × 2
## ceo_dismissal n
## <fct> <int>
## 1 dismissed 1483
## 2 not_dis 5992
# Bar chart of the number of rows in each target class.
ggplot(departures_clean, aes(x = ceo_dismissal)) +
  geom_bar() +
  labs(title = "CEO Dismissal Count", x = "CEO Dismissal", y = "Count")
# Heat map of row counts for every combination of target class and
# tenure_no_ceodb; the tile fill encodes the count `n`.
departures_clean %>%
  count(ceo_dismissal, tenure_no_ceodb) %>%
  ggplot(aes(x = ceo_dismissal, y = tenure_no_ceodb, fill = n)) +
  geom_tile() +
  labs(
    title = "CEO Dismissal or Tenure",
    x = "CEO Dismissal",
    y = "CEO Tenure"
  )
# Discard the rows whose `notes` text is missing before binarizing.
departures_clean <- drop_na(departures_clean, notes)
# Step 1: Binarize the data
# Every column is turned into 0/1 indicator columns.
# NOTE(review): this also binarizes the id column, the free-text `notes`
# column and the high-cardinality name columns, which yields indicator
# columns of doubtful value -- consider excluding them before this step.
data_binarized <- binarize(departures_clean)
glimpse(data_binarized)
## Rows: 7,458
## Columns: 46
## $ `dismissal_dataset_id__-Inf_2170.25` <dbl> …
## $ dismissal_dataset_id__2170.25_4321.5 <dbl> …
## $ dismissal_dataset_id__4321.5_6575.75 <dbl> …
## $ dismissal_dataset_id__6575.75_Inf <dbl> …
## $ coname__BARRICK_GOLD_CORP <dbl> …
## $ `coname__-OTHER` <dbl> …
## $ exec_fullname__John_W._Rowe <dbl> …
## $ `exec_fullname__-OTHER` <dbl> …
## $ ceo_dismissal__dismissed <dbl> …
## $ ceo_dismissal__not_dis <dbl> …
## $ tenure_no_ceodb__1 <dbl> …
## $ tenure_no_ceodb__2 <dbl> …
## $ `tenure_no_ceodb__-OTHER` <dbl> …
## $ max_tenure_ceodb__1 <dbl> …
## $ max_tenure_ceodb__2 <dbl> …
## $ `max_tenure_ceodb__-OTHER` <dbl> …
## $ fyear_gone__1993 <dbl> …
## $ fyear_gone__1994 <dbl> …
## $ fyear_gone__1995 <dbl> …
## $ fyear_gone__1996 <dbl> …
## $ fyear_gone__1997 <dbl> …
## $ fyear_gone__1998 <dbl> …
## $ fyear_gone__1999 <dbl> …
## $ fyear_gone__2000 <dbl> …
## $ fyear_gone__2001 <dbl> …
## $ fyear_gone__2002 <dbl> …
## $ fyear_gone__2003 <dbl> …
## $ fyear_gone__2004 <dbl> …
## $ fyear_gone__2005 <dbl> …
## $ fyear_gone__2006 <dbl> …
## $ fyear_gone__2007 <dbl> …
## $ fyear_gone__2008 <dbl> …
## $ fyear_gone__2009 <dbl> …
## $ fyear_gone__2010 <dbl> …
## $ fyear_gone__2011 <dbl> …
## $ fyear_gone__2012 <dbl> …
## $ fyear_gone__2013 <dbl> …
## $ fyear_gone__2014 <dbl> …
## $ fyear_gone__2015 <dbl> …
## $ fyear_gone__2016 <dbl> …
## $ fyear_gone__2017 <dbl> …
## $ fyear_gone__2018 <dbl> …
## $ fyear_gone__2019 <dbl> …
## $ `fyear_gone__-OTHER` <dbl> …
## $ `notes__Constantine_S._Macricostas_is_Chairman_of_the_Board_and_founder_of_the_Company._Mr._Macricostas_was_Executive_Chairman_of_the_Company_until_January_20,_2018._Mr._Macricostas_previously_served_as_Chief_Executive_Officer_of_the_Company_on_three_different_occasions_from_1974_until_August_1997,_from_February_2004_to_June_2005,_and_from_April_2009_until_May_2015._Mr._Macricostas_is_a_former_director_of_RagingWire_Data_Centers,_Inc.,_(“RagingWire”)._Mr._Macricostas_is_the_father_of_George_Macricostas._Mr._Macricostas’_knowledge_of_the_Company_and_its_operations,_as_well_as,_the_industry_is_invaluable_to_the_Board_of_Directors_in_evaluating_and_directing_the_Company’s_future._Through_his_long_service_to_the_Company_and_experience_in_the_photomask_industry,_he_has_developed_extensive_knowledge_in_the_areas_of_leadership,_safety,_risk_oversight,_management,_and_corporate_governance,_each_of_which_provides_great_value_to_the_Board_of_Directors._Mr._Macricostas_is_a_member_of_the_Cyber_Security_Committee_of_the_Board.` <dbl> …
## $ `notes__-OTHER` <dbl> …
# Step 2: Correlation
# Correlate every binarized feature with each indicator column of the target.
data_correlation_dismissed <- correlate(
  data_binarized,
  ceo_dismissal__dismissed
)
data_correlation_notdis <- correlate(
  data_binarized,
  ceo_dismissal__not_dis
)
# Print the correlations against the "dismissed" indicator.
data_correlation_dismissed
## # A tibble: 46 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 ceo_dismissal dismissed 1
## 2 ceo_dismissal not_dis -1
## 3 max_tenure_ceodb 1 0.0577
## 4 max_tenure_ceodb 2 -0.0533
## 5 fyear_gone 1999 -0.0390
## 6 fyear_gone 2002 0.0378
## 7 fyear_gone 2003 0.0303
## 8 fyear_gone 2009 0.0292
## 9 fyear_gone 2008 0.0261
## 10 fyear_gone 1997 -0.0255
## # ℹ 36 more rows
# Step 3: Plot
# Correlation funnel of all features against the "dismissed" indicator.
correlationfunnel::plot_correlation_funnel(data_correlation_dismissed) +
  labs(title = "Correlation Funnel for CEO Dismissal")
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Correlation funnel of all features against the "not_dis" indicator.
correlationfunnel::plot_correlation_funnel(data_correlation_notdis) +
  labs(title = "Correlation Funnel for CEO not Dismissal")
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps