Source of data: this dataset of 56 Trump speeches by Pedram Navid.
library(tidyverse)
library(tidytext)
library(rvest)
library(stringr)
# get dates of each speech
h <- read_html("http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000")

speech_dates <- data_frame(
  date = html_text(html_nodes(h, ".listdate:nth-child(2)")),
  title = html_text(html_nodes(h, ".listdate a"))
) %>%
  mutate(date = as.Date(date, "%B %d, %Y"))
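The "%B %d, %Y" format string matches dates written out like those on the page (this parsing assumes an English locale):

as.Date("November 7, 2016", "%B %d, %Y")
# [1] "2016-11-07"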
# I downloaded the speeches into ~/Repositories/trump_speeches/data/;
# each file has two lines: the title, then the full text of the speech
trump_speeches <- map_df(dir("~/Repositories/trump_speeches/data/", pattern = "speech_", full.names = TRUE),
                         function(f) data_frame(content = read_lines(f))) %>%
  mutate(speech = (row_number() + 1) %/% 2,
         type = c("text", "title")[row_number() %% 2 + 1]) %>%
  spread(type, content) %>%
  inner_join(speech_dates, by = "title") %>%
  mutate(title = str_trim(str_replace(title, "^Remarks (at( the| a)?|to the|in|on)?", "")))
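For example, here's how that title-cleaning regex behaves on a typical title (an illustrative string, not necessarily one from the dataset):

str_trim(str_replace("Remarks at the KI Convention Center in Green Bay, Wisconsin",
                     "^Remarks (at( the| a)?|to the|in|on)?", ""))
# [1] "KI Convention Center in Green Bay, Wisconsin"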
trump_words <- trump_speeches %>%
  # add a space after each period, so missing spaces don't fuse words across sentences
  mutate(text = str_replace_all(text, "\\.", ". ")) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)
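unnest_tokens lowercases the text, strips punctuation, and produces one row per word; the stop_words filter then drops common function words. A quick illustration on a made-up line:

data_frame(text = "Make America great again.") %>%
  unnest_tokens(word, text)
# one row per lowercased word: "make", "america", "great", "again"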
First, let's look at the most common words:
trump_words %>%
  count(word, sort = TRUE) %>%
  head(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ylab("# of uses in these 55 speeches")
Which speeches had the most dramatic average sentiment? Here I use the AFINN lexicon, which scores each word from -5 (most negative) to 5 (most positive).
trump_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(title) %>%
  summarize(average_sentiment = mean(score)) %>%
  top_n(20, abs(average_sentiment)) %>%
  mutate(title = reorder(title, average_sentiment)) %>%
  ggplot(aes(title, average_sentiment, fill = average_sentiment > 0)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  ylab("Average AFINN Sentiment") +
  ggtitle("Sentiment by Trump speech")
Which words contributed the most to these sentiment scores, weighting each word's AFINN score by how often it appears?

trump_words %>%
  count(word) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  mutate(total_score = score * n,
         word = reorder(word, total_score)) %>%
  top_n(40, abs(total_score)) %>%
  ggplot(aes(word, total_score, fill = total_score > 0)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  ylab("Frequency * AFINN sentiment score") +
  ggtitle("Words contributing the most to sentiment scores")
What about the most common words associated with each of the eight NRC sentiments?
trump_words %>%
  count(word) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("negative", "positive")) %>%
  mutate(word = reorder(word, n)) %>%
  group_by(sentiment) %>%
  top_n(8, n) %>%
  ggplot(aes(word, n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free", nrow = 4) +
  ylab("Frequency")
What words have changed in frequency over time, possibly reflecting a change in campaign focus? We fit a beta regression model to each common word's monthly frequency, measured as a share of all words used that month.
library(lubridate)
library(broom)
word_months <- trump_words %>%
  filter(date >= "2016-01-01") %>%
  mutate(month = round_date(date, "month"))

month_totals <- word_months %>%
  group_by(month) %>%
  summarize(total = n())

word_month_percent <- word_months %>%
  count(month, word) %>%
  ungroup() %>%
  complete(month, word, fill = list(n = 0)) %>%
  inner_join(month_totals, by = "month") %>%
  mutate(percent = n / total)
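The complete() call matters here: a word never used in a given month should appear with a count of 0 rather than be dropped, so a decline can be modeled. A toy illustration with made-up counts:

data_frame(month = c(1, 1, 2),
           word = c("jobs", "wall", "jobs"),
           n = c(5, 3, 2)) %>%
  complete(month, word, fill = list(n = 0))
# expands to all four month-word combinations, with month 2 / "wall" getting n = 0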
# add a pseudo-count of 1, since betareg requires values strictly within (0, 1)
word_month_models <- word_month_percent %>%
  mutate(percent = (n + 1) / (total + 1)) %>%
  mutate(month_number = month(month)) %>%
  group_by(word) %>%
  filter(sum(n) > 100) %>%
  ungroup() %>%
  nest(-word) %>%
  unnest(map(data, ~ tidy(betareg::betareg(percent ~ month_number, data = .)))) %>%
  filter(term == "month_number") %>%
  arrange(desc(estimate))
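To make the model concrete: beta regression treats each monthly frequency as a proportion in (0, 1) whose mean is linked to the predictor through a logit, so the month_number estimate is a slope on the log-odds scale. A minimal sketch on simulated data (the toy object is hypothetical, purely for illustration):

library(betareg)

set.seed(42)
# a proportion that drifts upward over ten months, with a little noise
toy <- data_frame(month_number = 1:10,
                  percent = plogis(-4 + .2 * month_number) + rnorm(10, 0, .002))
tidy(betareg(percent ~ month_number, data = toy))
# the month_number estimate should land near the true slope of .2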
Show the significantly increasing and decreasing terms (at a false discovery rate of 5%):
significant_terms <- word_month_models %>%
  mutate(fdr = p.adjust(p.value, method = "fdr")) %>%
  filter(fdr < .05) %>%
  mutate(direction = ifelse(estimate > 0, "Increasing", "Decreasing"))
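As a quick reminder of what the FDR correction does, p.adjust() with method = "fdr" applies the Benjamini-Hochberg step-up adjustment to a vector of p-values (toy values for illustration):

p.adjust(c(.001, .01, .04, .2), method = "fdr")
# adjusted p-values: 0.004, 0.02, 0.0533..., 0.2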
significant_terms %>%
  mutate(word = reorder(word, estimate)) %>%
  ggplot(aes(word, estimate, fill = direction)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  ylab("Growth or decline rate of word frequency over time") +
  coord_flip()
significant_terms %>%
  inner_join(word_month_percent, by = "word") %>%
  mutate(word = reorder(word, estimate)) %>%
  ggplot(aes(month, percent, color = direction)) +
  geom_line() +
  facet_wrap(~ word) +
  scale_y_continuous(labels = scales::percent_format()) +
  ylab("Frequency in Trump speeches")
Finally, which words tend to occur together in the same sentences? The widyr package makes it easy to compute pairwise correlations between words based on whether they co-occur within a sentence.

library(widyr)

by_sentence <- trump_speeches %>%
  unnest_tokens(sentence, text, token = "sentences") %>%
  mutate(sentence_id = row_number()) %>%
  unnest_tokens(word, sentence, token = "words") %>%
  filter(!word %in% stop_words$word)
word_cors <- by_sentence %>%
  # restrict to words used more than 30 times
  group_by(word) %>%
  mutate(frequency = n()) %>%
  ungroup() %>%
  filter(frequency > 30) %>%
  pairwise_cor(word, sentence_id, sort = TRUE)
word_cors
## # A tibble: 126,380 × 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 house white 0.8750040
## 2 white house 0.8750040
## 3 replace repeal 0.8322619
## 4 repeal replace 0.8322619
## 5 8th november 0.8155952
## 6 november 8th 0.8155952
## 7 east middle 0.8082144
## 8 middle east 0.8082144
## 9 court constitution 0.7690968
## 10 constitution court 0.7690968
## # ... with 126,370 more rows
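Since pairwise_cor() returns both orderings of every pair, we can pull out a single word's strongest associations by filtering one column; for example, "november", which shows up in the output above:

word_cors %>%
  filter(item1 == "november")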
We can visualize these correlations as a network of words:

library(ggraph)
library(igraph)

# set a seed so the force-directed layout is reproducible
set.seed(2016)

word_cors %>%
  filter(correlation > .3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = correlation)) +
  geom_node_point(color = "lightblue", size = 4) +
  geom_node_text(aes(label = name), repel = TRUE) +
  ggforce::theme_no_axes()