library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
departures_clean <- departures %>% distinct(exec_fullname, .keep_all =TRUE)
departures %>% distinct(coname, exec_fullname)
## # A tibble: 9,203 × 2
##    coname                       exec_fullname              
##    <chr>                        <chr>                      
##  1 SONICBLUE INC                L. Gregory Ballard         
##  2 AMERICAN AIRLINES GROUP INC  Robert L. Crandall         
##  3 AMERICAN AIRLINES GROUP INC  Donald J. Carty            
##  4 ABBOTT LABORATORIES          Duane L. Burnham           
##  5 ADVANCED MICRO DEVICES       Walter Jeremiah Sanders III
##  6 AETNA INC                    Ronald Edward Compton      
##  7 AHMANSON (H F) & CO          Richard H. Deihl           
##  8 AHMANSON (H F) & CO          Charles R. Rinehart        
##  9 AHMANSON (H F) & CO          Robert M. De Kruif         
## 10 AIR PRODUCTS & CHEMICALS INC Harold A. Wagner           
## # ℹ 9,193 more rows
library(tidytext)

tidy_departures <-
  departures %>%
  unnest_tokens(word, notes) %>%
  anti_join(get_stopwords())
## Joining with `by = join_by(word)`
tidy_departures %>%
  count(word, sort = TRUE)
## # A tibble: 25,553 × 2
##    word          n
##    <chr>     <int>
##  1 company    6713
##  2 executive  5447
##  3 chief      4946
##  4 chairman   4814
##  5 ceo        4744
##  6 board      4352
##  7 officer    4213
##  8 president  4042
##  9 mr         3845
## 10 inc        2945
## # ℹ 25,543 more rows
tidy_departures %>%
  count(exec_fullname, word, sort = TRUE)
## # A tibble: 247,733 × 3
##    exec_fullname              word            n
##    <chr>                      <chr>       <int>
##  1 Constantine S. Macricostas macricostas    24
##  2 Peter Jeffrey Kight        checkfree      19
##  3 Constantine S. Macricostas mr             18
##  4 Stanley J. Bright          iowa           16
##  5 Constantine S. Macricostas company        15
##  6 Donald Cameron Clark       household      14
##  7 Edward D. Breen            executive      14
##  8 Edward D. Breen            dupont         13
##  9 Jack Lawrence Howard       steel          13
## 10 James A. McGrady           dsw            13
## # ℹ 247,723 more rows
departures_sparse <-
  tidy_departures %>%
  count(exec_fullname, word) %>%
     bind_tf_idf(word, exec_fullname, n) %>%
    slice_max(order_by = tf_idf, n = 1000) %>%
  cast_sparse(exec_fullname, word, n)

dim(departures_sparse)
## [1] 651 923
library(stm)
## stm v1.3.6.1 successfully loaded. See ?stm for help. 
##  Papers, resources, and other materials at structuraltopicmodel.com
set.seed(123)
topic_model <- stm(departures_sparse, K = 3, verbose = FALSE)
summary(topic_model)
## A topic model with 3 topics, 651 documents and a 923 word dictionary.
## Topic 1 Top Words:
##       Highest Prob: checkfree, tcby, barker, roadway, circus, justin, motorola 
##       FREX: checkfree, tcby, barker, roadway, circus, justin, motorola 
##       Lift: 75yrs, 49, checkfree, tcby, barker, roadway, circus 
##       Score: checkfree, tcby, barker, roadway, circus, justin, motorola 
## Topic 2 Top Words:
##       Highest Prob: regarding, gallagher, planned, web, exec, callon, dismissal 
##       FREX: planned, regarding, web, information, retirement, founder, advance 
##       Lift: planned, premeditated, bailey's, retirement, advance, orderly, sungard 
##       Score: planned, regarding, gallagher, web, exec, callon, dismissal 
## Topic 3 Top Words:
##       Highest Prob: execucomp, shaw, sold, heart, age, interim, appear 
##       FREX: shaw, heart, age, died, attack, goes, casey’s 
##       Lift: rusty, chip, goes, covering, accused, paased, away 
##       Score: execucomp, shaw, age, heart, interim, sold, died
word_topics <- tidy(topic_model, matrix = "beta")
word_topics
## # A tibble: 2,769 × 3
##    topic term             beta
##    <int> <chr>           <dbl>
##  1     1 info         2.59e- 9
##  2     2 info         1.43e- 3
##  3     3 info         5.51e- 7
##  4     1 fraud        7.76e- 9
##  5     2 fraud        2.87e- 3
##  6     3 fraud        2.25e- 6
##  7     1 75yrs        2.08e- 3
##  8     2 75yrs        2.38e-15
##  9     3 75yrs        1.25e-13
## 10     1 premeditated 1.52e-16
## # ℹ 2,759 more rows
word_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(topic = paste("Topic", topic)) %>%
  ggplot(aes(beta, reorder_within(term, beta, topic), fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(topic), scales = "free_y") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_reordered() +
  labs(x = expression(beta), y = NULL)

departures_topics <- tidy(topic_model,
  matrix = "gamma",
  document_names = rownames(departures_sparse)
)
departures_topics
## # A tibble: 1,953 × 3
##    document                       topic  gamma
##    <chr>                          <int>  <dbl>
##  1 Larry Stephen Smith                1 0.0549
##  2 Anu D. Saad                        1 0.0549
##  3 Thomas G. Deranek                  1 0.784 
##  4 Wayne R. Weidner                   1 0.0393
##  5 C. Lee Cox                         1 0.0450
##  6 John R. Frantz                     1 0.0572
##  7 Clifford W. Illig                  1 0.0549
##  8 Russell A. Childs                  1 0.0572
##  9 Arif Shakeel                       1 0.0393
## 10 Christopher William Eccleshare     1 0.0393
## # ℹ 1,943 more rows
departures_topics %>%
    
    group_by(topic) %>%
    slice_max(order_by = gamma, n = 5) %>%
    ungroup() %>%
    
    mutate(
      notes = fct_reorder(document, gamma),
      topic = factor(topic)
    ) %>%
    ggplot(aes(gamma, topic, fill = topic)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(vars(notes), ncol = 3) +
    scale_x_continuous(expand = c(0, 0)) +
    labs(x = expression(gamma), y = "Topic")

  1. Question and Data
  1. What is the goal of the analysis? The goal of the analysis is to explore the notes section in the CEO departures data set. I am using the topic model to analyze the notes to discover if there is anything in there that indicates why the CEO may have been fired.
  2. Describe the data briefly. The data set contains data regarding CEO’s of many different companies. The data set contains variables such as coname, fyear, exec_fullname, departure_code, ceo_dismissal, notes and fyear_gone. Coname is the name of the company, exec_fullname is the name of the CEO, departure code is the code that signals if they left on their own terms or were fired. Ceo_dismissal indicates if the CEO was fired or not. Notes is a description of the CEO, such as when they started working at the company. Fyear is the year the CEO was hired. The data set contains 9,423 observations of 19 variables.
  3. What are the characteristics of the key variables used in the analysis? The key variables used in the analysis are exec_fullname and notes. These variables are important in the model as we use both to create the analysis. Notes is character data and so exec_fullname.
  1. Data Preparation
  1. Describe the differences between the original data and the data transformed from modeling. Why? The original data has 9,423 observations of 19 variables. The cleaned data has 8,701 observations of 19 variables. We used “unnest_tokens(word, notes)” to unnest and tokenize the notes column into individual words. This is helpful for our analysis as we are trying to see if any words in the notes section indicate if/why a CEO would be fired. I also used distinct() to filter the executive names to make sure there were no duplicates in the data set, as this would skew our analysis. I removed common words that do not contribute an actual meaning such as “like”, “the” “and” etc so that the analysis can focus on more meaningful words. This was done using “anti_join(get_stopwords())”. I used bind_tf_idf(word, exec_fullname, n) to compute the term frequency-inverse document frequency. This is needed because it helps identify which words are important. I also used cast_sparse(exec_fullname, word, n) to show the relationship between exec_fullname and important words. This is necassary for topic modeling. 3.Data preparation and modeling
  2. What is the type of model used in the analysis? The type of model used in this analysis is Structual Topic Model (stm).this allowes includsion of metadata in the modeling process. The model is good at finding relationships in topics of text data.
  3. How is the optimal value for k found? The optimal value of K was found through trial and error. At first I tried 4 because that is what was in the code along for this week. 4 did not seem to look good, so I tried 5-7 but did not like those either. I decided to try a number lower than 4 and found that 3 seemed to work well.
  1. Conclusion
  1. What are the findings from the analysis? My analysis has transformed raw executive departure data into a structured format for text analysis and topic modeling. By applying the STM, we can see topics that offer insights into executive departures, which were not immediately apparent in the raw data. Topic 1 contains words such as “checkfree” “circus” “diplomat” “fluke” and “sterling”. Topic 2 contains words such as “regarding” “planned” “web” “dismissal” and “information”. Topic 3 has many words such as “shaw” “heart” “sold” “age” and “interm” Topic one has 18 words, topic 2 has only 10 words, and topic 3 has around 50 words.