Source of data: this dataset of 56 Trump speeches by Pedram Navid.
library(tidyverse)
library(tidytext)
library(rvest)
library(stringr)
# get dates of each speech
h <- read_html("http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000")

speech_dates <- data_frame(
  date = html_text(html_nodes(h, ".listdate:nth-child(2)")),
  title = html_text(html_nodes(h, ".listdate a"))
) %>%
  mutate(date = as.Date(date, "%B %d, %Y"))
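The "%B %d, %Y" format string matches dates written out like those on the page (this parsing assumes an English locale):

as.Date("November 7, 2016", "%B %d, %Y")
# [1] "2016-11-07"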
# I downloaded the speeches into ~/Repositories/trump_speeches/data/;
# each file has two lines: the title, then the full text of the speech
trump_speeches <- map_df(dir("~/Repositories/trump_speeches/data/", pattern = "speech_", full.names = TRUE),
                         function(f) data_frame(content = read_lines(f))) %>%
  mutate(speech = (row_number() + 1) %/% 2,
         type = c("text", "title")[row_number() %% 2 + 1]) %>%
  spread(type, content) %>%
  inner_join(speech_dates, by = "title") %>%
  mutate(title = str_trim(str_replace(title, "^Remarks (at( the| a)?|to the|in|on)?", "")))
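For example, here's how that title-cleaning regex behaves on a typical title (an illustrative string, not necessarily one from the dataset):

str_trim(str_replace("Remarks at the KI Convention Center in Green Bay, Wisconsin",
                     "^Remarks (at( the| a)?|to the|in|on)?", ""))
# [1] "KI Convention Center in Green Bay, Wisconsin"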
trump_words <- trump_speeches %>%
  # add a space after each period, so missing spaces don't fuse words across sentences
  mutate(text = str_replace_all(text, "\\.", ". ")) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)
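unnest_tokens lowercases the text, strips punctuation, and produces one row per word; the stop_words filter then drops common function words. A quick illustration on a made-up line:

data_frame(text = "Make America great again.") %>%
  unnest_tokens(word, text)
# one row per lowercased word: "make", "america", "great", "again"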
First, let's look at the most common words:
trump_words %>%
  count(word, sort = TRUE) %>%
  head(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ylab("# of uses in these 55 speeches")
Which speeches had the most dramatic average sentiment? Here I use the AFINN lexicon, which scores each word from -5 (most negative) to 5 (most positive).
trump_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(title) %>%
  summarize(average_sentiment = mean(score)) %>%
  top_n(20, abs(average_sentiment)) %>%
  mutate(title = reorder(title, average_sentiment)) %>%
  ggplot(aes(title, average_sentiment, fill = average_sentiment > 0)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  ylab("Average AFINN Sentiment") +
  ggtitle("Sentiment by Trump speech")
Which words contributed the most to these sentiment scores, weighting each word's AFINN score by how often it appears?

trump_words %>%
  count(word) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  mutate(total_score = score * n,
         word = reorder(word, total_score)) %>%
  top_n(40, abs(total_score)) %>%
  ggplot(aes(word, total_score, fill = total_score > 0)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  ylab("Frequency * AFINN sentiment score") +
  ggtitle("Words contributing the most to sentiment scores")
What about the most common words associated with each of the eight NRC sentiments?
trump_words %>%
  count(word) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("negative", "positive")) %>%
  mutate(word = reorder(word, n)) %>%
  group_by(sentiment) %>%
  top_n(8, n) %>%
  ggplot(aes(word, n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free", nrow = 4) +
  ylab("Frequency")
What words have changed in frequency over time, possibly reflecting a change in campaign focus? We fit a beta regression model to each common word's monthly frequency, measured as a share of all words used that month.
library(lubridate)
library(broom)
word_months <- trump_words %>%
  filter(date >= "2016-01-01") %>%
  mutate(month = round_date(date, "month"))

month_totals <- word_months %>%
  group_by(month) %>%
  summarize(total = n())

word_month_percent <- word_months %>%
  count(month, word) %>%
  ungroup() %>%
  complete(month, word, fill = list(n = 0)) %>%
  inner_join(month_totals, by = "month") %>%
  mutate(percent = n / total)
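The complete() call matters here: a word never used in a given month should appear with a count of 0 rather than be dropped, so a decline can be modeled. A toy illustration with made-up counts:

data_frame(month = c(1, 1, 2),
           word = c("jobs", "wall", "jobs"),
           n = c(5, 3, 2)) %>%
  complete(month, word, fill = list(n = 0))
# expands to all four month-word combinations, with month 2 / "wall" getting n = 0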
# add a pseudo-count of 1, since betareg requires values strictly within (0, 1)
word_month_models <- word_month_percent %>%
  mutate(percent = (n + 1) / (total + 1)) %>%
  mutate(month_number = month(month)) %>%
  group_by(word) %>%
  filter(sum(n) > 100) %>%
  ungroup() %>%
  nest(-word) %>%
  unnest(map(data, ~ tidy(betareg::betareg(percent ~ month_number, data = .)))) %>%
  filter(term == "month_number") %>%
  arrange(desc(estimate))
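To make the model concrete: beta regression treats each monthly frequency as a proportion in (0, 1) whose mean is linked to the predictor through a logit, so the month_number estimate is a slope on the log-odds scale. A minimal sketch on simulated data (the toy object is hypothetical, purely for illustration):

library(betareg)

set.seed(42)
# a proportion that drifts upward over ten months, with a little noise
toy <- data_frame(month_number = 1:10,
                  percent = plogis(-4 + .2 * month_number) + rnorm(10, 0, .002))
tidy(betareg(percent ~ month_number, data = toy))
# the month_number estimate should land near the true slope of .2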
Show the significantly increasing and decreasing terms (at a false discovery rate of 5%):
significant_terms <- word_month_models %>%
  mutate(fdr = p.adjust(p.value, method = "fdr")) %>%
  filter(fdr < .05) %>%
  mutate(direction = ifelse(estimate > 0, "Increasing", "Decreasing"))
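As a quick reminder of what the FDR correction does, p.adjust() with method = "fdr" applies the Benjamini-Hochberg step-up adjustment to a vector of p-values (toy values for illustration):

p.adjust(c(.001, .01, .04, .2), method = "fdr")
# adjusted p-values: 0.004, 0.02, 0.0533..., 0.2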
significant_terms %>%
  mutate(word = reorder(word, estimate)) %>%
  ggplot(aes(word, estimate, fill = direction)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  ylab("Growth or decline rate of word frequency over time") +
  coord_flip()
significant_terms %>%
  inner_join(word_month_percent, by = "word") %>%
  mutate(word = reorder(word, estimate)) %>%
  ggplot(aes(month, percent, color = direction)) +
  geom_line() +
  facet_wrap(~ word) +
  scale_y_continuous(labels = scales::percent_format()) +
  ylab("Frequency in Trump speeches")
Finally, which words tend to occur together in the same sentences? The widyr package makes it easy to compute pairwise correlations between words based on whether they co-occur within a sentence.

library(widyr)

by_sentence <- trump_speeches %>%
  unnest_tokens(sentence, text, token = "sentences") %>%
  mutate(sentence_id = row_number()) %>%
  unnest_tokens(word, sentence, token = "words") %>%
  filter(!word %in% stop_words$word)
word_cors <- by_sentence %>%
  # restrict to words used more than 30 times
  group_by(word) %>%
  mutate(frequency = n()) %>%
  ungroup() %>%
  filter(frequency > 30) %>%
  pairwise_cor(word, sentence_id, sort = TRUE)
word_cors
## # A tibble: 126,380 × 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 house white 0.8750040
## 2 white house 0.8750040
## 3 replace repeal 0.8322619
## 4 repeal replace 0.8322619
## 5 8th november 0.8155952
## 6 november 8th 0.8155952
## 7 east middle 0.8082144
## 8 middle east 0.8082144
## 9 court constitution 0.7690968
## 10 constitution court 0.7690968
## # ... with 126,370 more rows
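Since pairwise_cor() returns both orderings of every pair, we can pull out a single word's strongest associations by filtering one column; for example, "november", which shows up in the output above:

word_cors %>%
  filter(item1 == "november")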
We can visualize these correlations as a network of words:

library(ggraph)
library(igraph)

# set a seed so the force-directed layout is reproducible
set.seed(2016)

word_cors %>%
  filter(correlation > .3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = correlation)) +
  geom_node_point(color = "lightblue", size = 4) +
  geom_node_text(aes(label = name), repel = TRUE) +
  ggforce::theme_no_axes()