Inspired by Kyle Scot Shank’s tweet:

@drob does this particular set of data interest you? https://t.co/nI9QTzwWDD

— Kyle Scot Shank (@KyleScotShank) September 25, 2017
library(tidyverse)
library(rvest)
library(lubridate)

# scrape the links to each transcript from the factba.se topic page
links <- read_html("https://factba.se/topic/howard-stern-interviews") %>%
  html_nodes("#timelineblock a , small") %>%
  html_attr("href") %>%
  na.omit() %>%
  unique()

# scrape a single transcript page: one row of text per spoken line, with its speaker
download_from_link <- function(link) {
  message(link)
  h <- read_html(link)
  
  text <- h %>%
    html_nodes("#resultsblock a") %>%
    html_text()
  
  speaker <- h %>%
    html_nodes(".speaker-label") %>%
    html_text()
  
  data_frame(text = text[text != ""],
             speaker = speaker)

}
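
As a quick sanity check, one might scrape a single transcript before mapping over every link (a minimal sketch, assuming the first link points to a valid transcript page):

# try one transcript first; the download over all links follows below
download_from_link(links[1])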

# download every transcript, skipping any links that fail (possibly() returns NULL on error)
transcripts <- data_frame(link = links) %>%
  mutate(transcripts = map(link, possibly(download_from_link, NULL)))

# drop failed downloads, parse the interview date from each link, and unnest to one row per line
unnested_transcripts <- transcripts %>%
  filter(!map_lgl(transcripts, is.null)) %>%
  mutate(date = parse_date_time(link, "b!-d!-Y!")) %>%
  select(-link) %>%
  unnest(transcripts)
library(tidytext)
library(stringr)

# tokenize into one row per word, keeping track of which line each word came from
transcript_words <- unnested_transcripts %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text)

trump_words <- transcript_words %>%
  filter(speaker == "Donald Trump") %>%
  anti_join(stop_words, by = "word") %>%
  mutate(year = year(date),
         length = str_length(word),
         year_bin = cut(year, c(1990, 2000, 2005, 2010, 2015), dig.lab = 4))
# change in word length over time?
# There are more sophisticated methods out there to measure complexity;
# one simple alternative is sketched after the table below
trump_words %>%
  group_by(year_bin) %>%
  summarize(words = n(),
            average_length = mean(length))
## # A tibble: 4 x 3
##      year_bin words average_length
##        <fctr> <int>          <dbl>
## 1 (1990,2000]  3116       5.822850
## 2 (2000,2005] 10511       5.802207
## 3 (2005,2010]  5263       5.692951
## 4 (2010,2015]  3232       5.862933
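
Average word length is a crude proxy. One simple alternative sketch, using the same data, is lexical diversity: the share of distinct words used in each period. Note that this ratio is sensitive to how many total words fall in each bin, so it is only a rough comparison.

# type-token ratio per period: distinct words divided by total words
trump_words %>%
  group_by(year_bin) %>%
  summarize(words = n(),
            distinct_words = n_distinct(word),
            type_token_ratio = distinct_words / words)
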
# total words spoken by Trump in each year
year_totals <- trump_words %>%
  group_by(year) %>%
  summarize(year_total = n())

Is he getting more negative over time? Maybe a little.

trump_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(year_bin) %>%
  summarize(words = n(),
            sentiment = mean(score))
## # A tibble: 4 x 3
##      year_bin words sentiment
##        <fctr> <int>     <dbl>
## 1 (1990,2000]   532 0.7481203
## 2 (2000,2005]  1860 0.3559140
## 3 (2005,2010]   910 0.2307692
## 4 (2010,2015]   586 0.4078498
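
To see whether that decline is a steady trend or driven by particular years, a year-by-year average is a quick sketch (this assumes the AFINN lexicon column is still named score, as in the chunk above):

# average AFINN score per year; point size shows how many scored words each year contributes
trump_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(year) %>%
  summarize(words = n(),
            sentiment = mean(score)) %>%
  ggplot(aes(year, sentiment)) +
  geom_line() +
  geom_point(aes(size = words))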

What words are changing in frequency over time?

library(scales)
theme_set(theme_minimal())

# change in word usage. Must have at least 30 total uses to be included.
word_year_counts <- trump_words %>%
  count(year, word) %>%
  add_count(word, wt = n) %>%
  filter(nn >= 30) %>%
  complete(word, year, fill = list(n = 0)) %>%
  inner_join(year_totals, by = "year")

# fit a binomial regression for each word: does its frequency change with year?
models <- word_year_counts %>%
  nest(-word) %>%
  mutate(model = map(data, ~ glm(cbind(n, year_total) ~ year, data = ., family = "binomial")))

library(broom)

# most changed over time:
models %>%
  unnest(map(model, tidy)) %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value, method = "fdr")) %>%
  filter(p.adjusted <= .05) %>%
  arrange(desc(estimate))
## # A tibble: 14 x 7
##          word  term    estimate  std.error statistic      p.value
##         <chr> <chr>       <dbl>      <dbl>     <dbl>        <dbl>
##  1    country  year  0.17619403 0.03072088  5.735319 9.732923e-09
##  2   remember  year  0.11414658 0.02892828  3.945848 7.951799e-05
##  3 apprentice  year  0.08889796 0.02476716  3.589349 3.315051e-04
##  4      robin  year -0.05097911 0.01814188 -2.810023 4.953790e-03
##  5     howard  year -0.06083035 0.01010550 -6.019532 1.749224e-09
##  6       book  year -0.07272925 0.02538425 -2.865133 4.168345e-03
##  7     pretty  year -0.08570490 0.03088437 -2.775025 5.519750e-03
##  8       john  year -0.09534569 0.03163167 -3.014248 2.576171e-03
##  9      fight  year -0.09822409 0.03094902 -3.173739 1.504890e-03
## 10      gonna  year -0.10004049 0.02491344 -4.015523 5.931413e-05
## 11      marla  year -0.12157735 0.03551339 -3.423423 6.183787e-04
## 12       city  year -0.12530446 0.03284574 -3.814938 1.362171e-04
## 13        met  year -0.13000653 0.03604161 -3.607123 3.096110e-04
## 14 girlfriend  year -0.14850059 0.03616443 -4.106261 4.021155e-05
## # ... with 1 more variables: p.adjusted <dbl>
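
The year estimates are on the log-odds scale, which isn't very intuitive. As a rough interpretation sketch, a single fitted model can be converted back to frequencies ("country" is taken from the table above); given how the model is specified, the fitted probability is approximately the word's share of all words spoken that year, since each count is small relative to the yearly total.

# fitted rate of "country" in 1995 vs 2010
country_model <- models$model[[which(models$word == "country")]]
predict(country_model, newdata = data.frame(year = c(1995, 2010)), type = "response")
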
word_year_counts %>%
  filter(word %in% c("country", "remember", "met", "girlfriend")) %>%
  ggplot(aes(year, n / year_total)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ word) +
  labs(y = "Frequency of word within this year")

Common co-occurring words:

library(widyr)
library(ggraph)
library(igraph)

# correlations between words that appear within the same line
cors <- trump_words %>%
  add_count(word) %>%
  filter(n >= 20) %>%
  pairwise_cor(word, line, sort = TRUE)

set.seed(2017)
cors %>%
  top_n(250) %>%
  graph_from_data_frame() %>%
  ggraph() +
  geom_edge_link(aes(edge_alpha = correlation)) +
  geom_node_point(size = 4, color = "lightblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() +
  theme(legend.position = "none") +
  labs(title = "Co-occurrence network of Trump's words",
       subtitle = "Based on interviews with Howard Stern; only words with at least 20 uses.")