library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
departures_clean <- departures %>% distinct(exec_fullname, .keep_all =TRUE)
departures %>% distinct(coname, exec_fullname)
## # A tibble: 9,203 × 2
## coname exec_fullname
## <chr> <chr>
## 1 SONICBLUE INC L. Gregory Ballard
## 2 AMERICAN AIRLINES GROUP INC Robert L. Crandall
## 3 AMERICAN AIRLINES GROUP INC Donald J. Carty
## 4 ABBOTT LABORATORIES Duane L. Burnham
## 5 ADVANCED MICRO DEVICES Walter Jeremiah Sanders III
## 6 AETNA INC Ronald Edward Compton
## 7 AHMANSON (H F) & CO Richard H. Deihl
## 8 AHMANSON (H F) & CO Charles R. Rinehart
## 9 AHMANSON (H F) & CO Robert M. De Kruif
## 10 AIR PRODUCTS & CHEMICALS INC Harold A. Wagner
## # ℹ 9,193 more rows
library(tidytext)
tidy_departures <-
departures %>%
unnest_tokens(word, notes) %>%
anti_join(get_stopwords())
## Joining with `by = join_by(word)`
tidy_departures %>%
count(word, sort = TRUE)
## # A tibble: 25,553 × 2
## word n
## <chr> <int>
## 1 company 6713
## 2 executive 5447
## 3 chief 4946
## 4 chairman 4814
## 5 ceo 4744
## 6 board 4352
## 7 officer 4213
## 8 president 4042
## 9 mr 3845
## 10 inc 2945
## # ℹ 25,543 more rows
tidy_departures %>%
count(exec_fullname, word, sort = TRUE)
## # A tibble: 247,733 × 3
## exec_fullname word n
## <chr> <chr> <int>
## 1 Constantine S. Macricostas macricostas 24
## 2 Peter Jeffrey Kight checkfree 19
## 3 Constantine S. Macricostas mr 18
## 4 Stanley J. Bright iowa 16
## 5 Constantine S. Macricostas company 15
## 6 Donald Cameron Clark household 14
## 7 Edward D. Breen executive 14
## 8 Edward D. Breen dupont 13
## 9 Jack Lawrence Howard steel 13
## 10 James A. McGrady dsw 13
## # ℹ 247,723 more rows
departures_sparse <-
tidy_departures %>%
count(exec_fullname, word) %>%
bind_tf_idf(word, exec_fullname, n) %>%
slice_max(order_by = tf_idf, n = 1000) %>%
cast_sparse(exec_fullname, word, n)
dim(departures_sparse)
## [1] 651 923
library(stm)
## stm v1.3.6.1 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
set.seed(123)
topic_model <- stm(departures_sparse, K = 3, verbose = FALSE)
summary(topic_model)
## A topic model with 3 topics, 651 documents and a 923 word dictionary.
## Topic 1 Top Words:
## Highest Prob: checkfree, tcby, barker, roadway, circus, justin, motorola
## FREX: checkfree, tcby, barker, roadway, circus, justin, motorola
## Lift: 75yrs, 49, checkfree, tcby, barker, roadway, circus
## Score: checkfree, tcby, barker, roadway, circus, justin, motorola
## Topic 2 Top Words:
## Highest Prob: regarding, gallagher, planned, web, exec, callon, dismissal
## FREX: planned, regarding, web, information, retirement, founder, advance
## Lift: planned, premeditated, bailey's, retirement, advance, orderly, sungard
## Score: planned, regarding, gallagher, web, exec, callon, dismissal
## Topic 3 Top Words:
## Highest Prob: execucomp, shaw, sold, heart, age, interim, appear
## FREX: shaw, heart, age, died, attack, goes, casey’s
## Lift: rusty, chip, goes, covering, accused, paased, away
## Score: execucomp, shaw, age, heart, interim, sold, died
word_topics <- tidy(topic_model, matrix = "beta")
word_topics
## # A tibble: 2,769 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 info 2.59e- 9
## 2 2 info 1.43e- 3
## 3 3 info 5.51e- 7
## 4 1 fraud 7.76e- 9
## 5 2 fraud 2.87e- 3
## 6 3 fraud 2.25e- 6
## 7 1 75yrs 2.08e- 3
## 8 2 75yrs 2.38e-15
## 9 3 75yrs 1.25e-13
## 10 1 premeditated 1.52e-16
## # ℹ 2,759 more rows
word_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
mutate(topic = paste("Topic", topic)) %>%
ggplot(aes(beta, reorder_within(term, beta, topic), fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(topic), scales = "free_y") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_reordered() +
labs(x = expression(beta), y = NULL)

departures_topics <- tidy(topic_model,
matrix = "gamma",
document_names = rownames(departures_sparse)
)
departures_topics
## # A tibble: 1,953 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 Larry Stephen Smith 1 0.0549
## 2 Anu D. Saad 1 0.0549
## 3 Thomas G. Deranek 1 0.784
## 4 Wayne R. Weidner 1 0.0393
## 5 C. Lee Cox 1 0.0450
## 6 John R. Frantz 1 0.0572
## 7 Clifford W. Illig 1 0.0549
## 8 Russell A. Childs 1 0.0572
## 9 Arif Shakeel 1 0.0393
## 10 Christopher William Eccleshare 1 0.0393
## # ℹ 1,943 more rows
departures_topics %>%
group_by(topic) %>%
slice_max(order_by = gamma, n = 5) %>%
ungroup() %>%
mutate(
notes = fct_reorder(document, gamma),
topic = factor(topic)
) %>%
ggplot(aes(gamma, topic, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(notes), ncol = 3) +
scale_x_continuous(expand = c(0, 0)) +
labs(x = expression(gamma), y = "Topic")

- Question and Data
- What is the goal of the analysis? The goal of the analysis is to
explore the notes section in the CEO departures data set. I am using the
topic model to analyze the notes to discover if there is anything in
there that indicates why the CEO may have been fired.
- Describe the data briefly. The data set contains data regarding
CEO’s of many different companies. The data set contains variables such
as coname, fyear, exec_fullname, departure_code, ceo_dismissal, notes
and fyear_gone. Coname is the name of the company, exec_fullname is the
name of the CEO, departure code is the code that signals if they left on
their own terms or were fired. Ceo_dismissal indicates if the CEO was
fired or not. Notes is a description of the CEO, such as when they
started working at the company. Fyear is the year the CEO was hired. The
data set contains 9,423 observations of 19 variables.
- What are the characteristics of the key variables used in the
analysis? The key variables used in the analysis are exec_fullname and
notes. These variables are important in the model as we use both to
create the analysis. Notes is character data and so exec_fullname.
- Data Preparation
- Describe the differences between the original data and the data
transformed from modeling. Why? The original data has 9,423 observations
of 19 variables. The cleaned data has 8,701 observations of 19
variables. We used “unnest_tokens(word, notes)” to unnest and tokenize
the notes column into individual words. This is helpful for our analysis
as we are trying to see if any words in the notes section indicate
if/why a CEO would be fired. I also used distinct() to filter the
executive names to make sure there were no duplicates in the data set,
as this would skew our analysis. I removed common words that do not
contribute an actual meaning such as “like”, “the” “and” etc so that the
analysis can focus on more meaningful words. This was done using
“anti_join(get_stopwords())”. I used bind_tf_idf(word, exec_fullname, n)
to compute the term frequency-inverse document frequency. This is needed
because it helps identify which words are important. I also used
cast_sparse(exec_fullname, word, n) to show the relationship between
exec_fullname and important words. This is necassary for topic modeling.
3.Data preparation and modeling
- What is the type of model used in the analysis? The type of model
used in this analysis is Structual Topic Model (stm).this allowes
includsion of metadata in the modeling process. The model is good at
finding relationships in topics of text data.
- How is the optimal value for k found? The optimal value of K was
found through trial and error. At first I tried 4 because that is what
was in the code along for this week. 4 did not seem to look good, so I
tried 5-7 but did not like those either. I decided to try a number lower
than 4 and found that 3 seemed to work well.
- Conclusion
- What are the findings from the analysis? My analysis has transformed
raw executive departure data into a structured format for text analysis
and topic modeling. By applying the STM, we can see topics that offer
insights into executive departures, which were not immediately apparent
in the raw data. Topic 1 contains words such as “checkfree” “circus”
“diplomat” “fluke” and “sterling”. Topic 2 contains words such as
“regarding” “planned” “web” “dismissal” and “information”. Topic 3 has
many words such as “shaw” “heart” “sold” “age” and “interm” Topic one
has 18 words, topic 2 has only 10 words, and topic 3 has around 50
words.