Inspired by Kyle Scot Shank’s tweet:
@drob does this particular set of data interest you? https://t.co/nI9QTzwWDD
— Kyle Scot Shank (@KyleScotShank) September 25, 2017
library(tidyverse)
library(rvest)
library(lubridate)
# Scrape links to the individual interview transcripts (the <small> nodes have no
# href, so html_attr() returns NA for them and na.omit() drops those)
links <- read_html("https://factba.se/topic/howard-stern-interviews") %>%
  html_nodes("#timelineblock a , small") %>%
  html_attr("href") %>%
  na.omit() %>%
  unique()
# Download one transcript page, returning a data frame of speaker/text pairs
download_from_link <- function(link) {
  message(link)
  h <- read_html(link)

  text <- h %>%
    html_nodes("#resultsblock a") %>%
    html_text()

  speaker <- h %>%
    html_nodes(".speaker-label") %>%
    html_text()

  data_frame(text = text[text != ""],
             speaker = speaker)
}
# Download all transcripts; possibly() returns NULL for any link that fails to parse
transcripts <- data_frame(link = links) %>%
  mutate(transcripts = map(link, possibly(download_from_link, NULL)))

# Drop failed downloads, parse the interview date from the link, and expand to one row per turn
unnested_transcripts <- transcripts %>%
  filter(!map_lgl(transcripts, is.null)) %>%
  mutate(date = parse_date_time(link, "b!-d!-Y!")) %>%
  select(-link) %>%
  unnest(transcripts)
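As a quick sanity check (not part of the original pipeline), we could count how many links failed to download; possibly() stores NULL for those, which is what the filter above removes:

# How many links failed to download? (possibly() returned NULL for those)
sum(map_lgl(transcripts$transcripts, is.null))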
library(tidytext)
library(stringr)
transcript_words <- unnested_transcripts %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text)

# Trump's words only, with stop words removed; add year, word length, and a year bin
trump_words <- transcript_words %>%
  filter(speaker == "Donald Trump") %>%
  anti_join(stop_words, by = "word") %>%
  mutate(year = year(date),
         length = str_length(word),
         year_bin = cut(year, c(1990, 2000, 2005, 2010, 2015), dig.lab = 4))
# Change in word length over time? There are more sophisticated methods out there
# to measure complexity; a rough type-token ratio sketch follows the table below.
trump_words %>%
  group_by(year_bin) %>%
  summarize(words = n(),
            average_length = mean(length))
## # A tibble: 4 x 3
##      year_bin words average_length
##        <fctr> <int>          <dbl>
## 1 (1990,2000]  3116       5.822850
## 2 (2000,2005] 10511       5.802207
## 3 (2005,2010]  5263       5.692951
## 4 (2010,2015]  3232       5.862933
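One of those more sophisticated alternatives is vocabulary diversity. A minimal sketch, not used anywhere else in this post, is the type-token ratio per bin:

# Rough vocabulary diversity per bin: distinct words / total words (type-token ratio)
trump_words %>%
  group_by(year_bin) %>%
  summarize(types = n_distinct(word),
            words = n(),
            type_token_ratio = types / words)

Keep in mind the type-token ratio is sensitive to sample size, so bins with more words will tend to look less diverse.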
year_totals <- trump_words %>%
  group_by(year) %>%
  summarize(year_total = n())
Is he getting more negative over time? Maybe a little.
trump_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(year_bin) %>%
  summarize(words = n(),
            sentiment = mean(score))
## # A tibble: 4 x 3
##      year_bin words sentiment
##        <fctr> <int>     <dbl>
## 1 (1990,2000]   532 0.7481203
## 2 (2000,2005]  1860 0.3559140
## 3 (2005,2010]   910 0.2307692
## 4 (2010,2015]   586 0.4078498
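The five-year bins are coarse. As a quick sketch (not part of the original analysis), the same AFINN average could be plotted year by year; this assumes the AFINN column is named score, as in the tidytext version used here (newer versions call it value):

# Year-by-year average AFINN score of Trump's words
# (assumes the AFINN lexicon column is `score`, not `value`)
trump_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(year) %>%
  summarize(sentiment = mean(score)) %>%
  ggplot(aes(year, sentiment)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Year",
       y = "Average AFINN score per word")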
What words are changing in frequency over time?
library(scales)
theme_set(theme_minimal())
# Change in word usage over time; a word must be used at least 30 times in total to be included
word_year_counts <- trump_words %>%
  count(year, word) %>%
  add_count(word, wt = n) %>%
  filter(nn >= 30) %>%
  complete(word, year, fill = list(n = 0)) %>%
  inner_join(year_totals, by = "year")
# For each word, fit a logistic regression of its yearly count (relative to that
# year's total words) against year; the slope measures how fast usage grew or declined
models <- word_year_counts %>%
  nest(-word) %>%
  mutate(model = map(data, ~ glm(cbind(n, year_total) ~ year, data = ., family = "binomial")))
library(broom)
# Words whose frequency changed the most over time, keeping only FDR-significant slopes
models %>%
  unnest(map(model, tidy)) %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value, method = "fdr")) %>%
  filter(p.adjusted <= .05) %>%
  arrange(desc(estimate))
## # A tibble: 14 x 7
##          word  term    estimate  std.error statistic      p.value
##         <chr> <chr>       <dbl>      <dbl>     <dbl>        <dbl>
##  1    country  year  0.17619403 0.03072088  5.735319 9.732923e-09
##  2   remember  year  0.11414658 0.02892828  3.945848 7.951799e-05
##  3 apprentice  year  0.08889796 0.02476716  3.589349 3.315051e-04
##  4      robin  year -0.05097911 0.01814188 -2.810023 4.953790e-03
##  5     howard  year -0.06083035 0.01010550 -6.019532 1.749224e-09
##  6       book  year -0.07272925 0.02538425 -2.865133 4.168345e-03
##  7     pretty  year -0.08570490 0.03088437 -2.775025 5.519750e-03
##  8       john  year -0.09534569 0.03163167 -3.014248 2.576171e-03
##  9      fight  year -0.09822409 0.03094902 -3.173739 1.504890e-03
## 10      gonna  year -0.10004049 0.02491344 -4.015523 5.931413e-05
## 11      marla  year -0.12157735 0.03551339 -3.423423 6.183787e-04
## 12       city  year -0.12530446 0.03284574 -3.814938 1.362171e-04
## 13        met  year -0.13000653 0.03604161 -3.607123 3.096110e-04
## 14 girlfriend  year -0.14850059 0.03616443 -4.106261 4.021155e-05
## # ... with 1 more variables: p.adjusted <dbl>
word_year_counts %>%
  filter(word %in% c("country", "remember", "met", "girlfriend")) %>%
  ggplot(aes(year, n / year_total)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ word) +
  labs(y = "Frequency of word within this year")
Common co-occurring words:
library(widyr)
library(ggraph)
library(igraph)
# Correlation of word appearances across lines, for words used at least 20 times
cors <- trump_words %>%
  add_count(word) %>%
  filter(n >= 20) %>%
  pairwise_cor(word, line, sort = TRUE)
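Before plotting the whole network, the correlations can also be sliced for a single word. As an illustrative query (not in the original post), here is one for "apprentice", one of the words that changed most above; since cors is already sorted, the strongest associations come first:

# Words most strongly correlated with "apprentice" within the same line
cors %>%
  filter(item1 == "apprentice")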
set.seed(2017)
cors %>%
  top_n(250) %>%
  graph_from_data_frame() %>%
  ggraph() +
  geom_edge_link(aes(edge_alpha = correlation)) +
  geom_node_point(size = 4, color = "lightblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() +
  theme(legend.position = "none") +
  labs(title = "Co-occurrence network of Trump's words",
       subtitle = "Based on interviews with Howard Stern; only words with at least 20 uses.")