CEO Departures: The dataset documents the reasons for CEO departure in S&P 1500 firms from 2000 through 2018. Build a topic (clustering) model to discover major reasons (topics) for CEO departure (ceo_dismissal). Use the departures dataset.

Approach

In Julia’s screencast, her goal was to discover topics in in the lyrics of Spice Girls songs. Applying the case to the CEO departure data, let’s say our goal is to discover main departure reasons (topics) in the notes. In summary:

library(tidyverse)
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')

departures %>% skimr::skim()
Data summary
Name Piped data
Number of rows 9423
Number of columns 19
_______________________
Column type frequency:
character 8
numeric 10
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
coname 0 1.00 2 30 0 3860 0
exec_fullname 0 1.00 5 790 0 8701 0
interim_coceo 9105 0.03 6 7 0 6 0
still_there 7311 0.22 3 10 0 77 0
notes 1644 0.83 5 3117 0 7755 0
sources 1475 0.84 18 1843 0 7915 0
eight_ks 4499 0.52 69 3884 0 4914 0
_merge 0 1.00 11 11 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dismissal_dataset_id 0 1.00 5684.10 25005.46 1 2305.5 4593 6812.5 559044 ▇▁▁▁▁
gvkey 0 1.00 40132.48 53921.34 1004 7337.0 14385 60900.5 328795 ▇▁▁▁▁
fyear 0 1.00 2007.74 8.19 1987 2000.0 2008 2016.0 2020 ▁▆▅▅▇
co_per_rol 0 1.00 25580.22 18202.38 -1 8555.5 22980 39275.5 64602 ▇▆▅▃▃
departure_code 1667 0.82 5.20 1.53 1 5.0 5 7.0 9 ▁▃▇▅▁
ceo_dismissal 1813 0.81 0.20 0.40 0 0.0 0 0.0 1 ▇▁▁▁▂
tenure_no_ceodb 0 1.00 1.03 0.17 0 1.0 1 1.0 3 ▁▇▁▁▁
max_tenure_ceodb 0 1.00 1.05 0.24 1 1.0 1 1.0 4 ▇▁▁▁▁
fyear_gone 1802 0.81 2006.64 13.63 1980 2000.0 2007 2013.0 2997 ▇▁▁▁▁
cik 245 0.97 741469.17 486551.43 1750 106413.0 857323 1050375.8 1808065 ▆▁▇▂▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
leftofc 1802 0.81 1981-01-01 2998-04-27 2006-12-31 3627

Notes:

departures_clean <- departures %>% distinct(exec_fullname, .keep_all = TRUE)
library(tidytext)

tidy_departures <-
    departures_clean %>%
    unnest_tokens(word, notes) %>%
    anti_join(get_stopwords())
tidy_departures %>%
  count(word, sort = TRUE)
## # A tibble: 24,689 × 2
##    word          n
##    <chr>     <int>
##  1 company    6244
##  2 executive  5010
##  3 chief      4576
##  4 chairman   4424
##  5 ceo        4318
##  6 board      3979
##  7 officer    3897
##  8 president  3778
##  9 mr         3578
## 10 inc        2751
## # ℹ 24,679 more rows
tidy_departures %>%
  count(exec_fullname, word, sort = TRUE)
## # A tibble: 232,688 × 3
##    exec_fullname          word           n
##    <chr>                  <chr>      <int>
##  1 Peter Jeffrey Kight    checkfree     19
##  2 Stanley J. Bright      iowa          15
##  3 Donald Cameron Clark   household     14
##  4 James A. McGrady       dsw           13
##  5 John F. Antioco        mr            13
##  6 Thomas A. Andruskevich birks         13
##  7 Craig M. Bernfield     aviv          12
##  8 Frank D. Hickingbotham tcby          12
##  9 Paul G. Greig          merger        12
## 10 Ted Thomas Cecala      wilmington    12
## # ℹ 232,678 more rows

Train a topic model

departures_sparse <-
    tidy_departures %>%
    count(exec_fullname, word) %>%
    
    # Filter for words uniquely identify executives: tf_idf penalizes words appearing all cases of executives: For more info, see text mining with R (chapter 3)
    bind_tf_idf(word, exec_fullname, n) %>%
    slice_max(order_by = tf_idf, n = 1000) %>%
    
    cast_sparse(exec_fullname, word, n)

dim(departures_sparse)
## [1] 693 952
library(stm)
set.seed(123)
topic_model <- stm(departures_sparse, K = 3, verbose = FALSE)

summary(topic_model)
## A topic model with 3 topics, 693 documents and a 952 word dictionary.
## Topic 1 Top Words:
##       Highest Prob: checkfree, tcby, hsbc, roadway, circus, justin, inco 
##       FREX: checkfree, tcby, roadway, circus, justin, inco, tcf 
##       Lift: 75yrs, 49, checkfree, old, tcby, roadway, circus 
##       Score: checkfree, tcby, hsbc, roadway, circus, justin, inco 
## Topic 2 Top Words:
##       Highest Prob: regarding, appear, planned, execucomp, founder, gallagher, web 
##       FREX: planned, regarding, founder, appear, web, retires, commscope 
##       Lift: planned, premeditated, bailey's, retirement, advance, orderly, info 
##       Score: planned, retirement, premeditated, advance, bailey's, regarding, founder 
## Topic 3 Top Words:
##       Highest Prob: execucomp, shows, turnover, coceo, interim, shaw, heart 
##       FREX: execucomp, shows, turnover, interim, leaving, coceo, goes 
##       Lift: rusty, chip, goes, lovejoy, problem, person, listing 
##       Score: execucomp, goes, rusty, chip, shows, turnover, interim

Explore topic model results

word_topics <- tidy(topic_model, matrix = "beta")
word_topics
## # A tibble: 2,856 × 3
##    topic term             beta
##    <int> <chr>           <dbl>
##  1     1 info         4.62e- 5
##  2     2 info         1.25e- 3
##  3     3 info         2.58e- 4
##  4     1 fraud        2.74e- 4
##  5     2 fraud        1.83e- 3
##  6     3 fraud        1.51e- 3
##  7     1 75yrs        1.91e- 3
##  8     2 75yrs        3.02e-11
##  9     3 75yrs        5.76e-10
## 10     1 premeditated 1.74e-11
## # ℹ 2,846 more rows
word_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(topic = paste("Topic", topic)) %>%
  ggplot(aes(beta, reorder_within(term, beta, topic), fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(topic), scales = "free_y") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_reordered() +
  labs(x = expression(beta), y = NULL)

departures_topics <- tidy(topic_model,
  matrix = "gamma",
  document_names = rownames(departures_sparse)
)
departures_topics
## # A tibble: 2,079 × 3
##    document               topic  gamma
##    <chr>                  <int>  <dbl>
##  1 Larry Stephen Smith        1 0.0676
##  2 Anu D. Saad                1 0.134 
##  3 Thomas G. Deranek          1 0.921 
##  4 Wayne R. Weidner           1 0.0239
##  5 C. Lee Cox                 1 0.148 
##  6 John R. Frantz             1 0.0664
##  7 Kessel D. Stelling Jr.     1 0.139 
##  8 Richard E. Anthony         1 0.139 
##  9 Clifford W. Illig          1 0.0676
## 10 Russell A. Childs          1 0.0664
## # ℹ 2,069 more rows
departures_topics %>%
    
    # Filter for top 5 each topic in terms of gamma
    group_by(topic) %>%
    slice_max(order_by = gamma, n = 5) %>%
    ungroup() %>%
    
    mutate(
      notes = fct_reorder(document, gamma),
      topic = factor(topic)
    ) %>%
    ggplot(aes(gamma, topic, fill = topic)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(vars(notes), ncol = 3) +
    scale_x_continuous(expand = c(0, 0)) +
    labs(x = expression(gamma), y = "Topic")

You may stop here.