Assignment: Select another news organization and grab 5000 of their tweets. Some possibilities (name: Twitter handle): Billings Gazette headlines: billingsgazette; Wall Street Journal: wsj; Washington Post: washingtonpost; CNN: cnn; CNN breaking news: cnnbrk; USA Today: usatoday.

  1. Unnest the words of the tweets, remove stop words and weird web “words”, and create a table and a word cloud of the top words.
  2. Conduct a sentiment analysis using bing, remove any errors like trump = positive, and create a graph of the words that contribute most to each sentiment.
  3. Do the same as above but with the nrc sentiment lexicon.
  4. Unnest the tweets as bigrams, remove stop words and errors, and create a table and word cloud of the most common bigrams.
  5. Using the bigrams, look for the most common words that follow two different words. You may choose trump and pelosi, or choose your own.
# Download the account's timeline. n = 5000 is the number of tweets requested.
# NOTE(review): the Twitter API is believed to cap a user timeline at about
# 3,200 tweets, so fewer rows than requested may come back -- confirm.
news_tweets <- get_timeline("billingsgazette", n = 5000)

# One row per word: tokenise the tweet text and keep only the account name
# and the word.
news_words <- news_tweets %>%
  unnest_tokens(word, text) %>%
  select(screen_name, word)

# Raw word frequencies, most frequent first.
news_words %>%
  count(word, sort = TRUE)

# Word frequencies after dropping the stop words returned by get_stopwords().
# The join key is left implicit, so dplyr reports the column it joined on.
news_words %>%
  anti_join(get_stopwords()) %>%
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies with stop words removed and the URL fragments "https" and
# "t.co" (left over from tokenised tweet links) filtered out.
news_words %>%
  anti_join(get_stopwords()) %>%
  filter(word != "https", word != "t.co") %>%
  count(word, sort = TRUE)
Joining, by = "word"
# Word cloud of the 200 most frequent words, after removing stop words and
# the URL fragments "https" and "t.co".
news_words %>%
  anti_join(get_stopwords()) %>%
  filter(word != "https", word != "t.co") %>%
  count(word, sort = TRUE) %>%
  top_n(200) %>%          # no weight given: ranks by the last column, n
  wordcloud2(size = .5)
Joining, by = "word"
Selecting by n
# Load the bing sentiment lexicon and print it for inspection.
bing <- get_sentiments(lexicon = "bing")
bing

# Keep only the words that appear in the lexicon, then tally each
# word/sentiment pair, most frequent first.
inner_join(news_words, bing) %>%
  count(word, sentiment, sort = TRUE)
Joining, by = "word"
# Same tally as before, but with "trump" removed: the lexicon scores it as a
# sentiment word, whereas in headlines it is a surname.
inner_join(news_words, bing) %>%
  filter(word != "trump") %>%
  count(word, sentiment, sort = TRUE)
Joining, by = "word"
# Bar chart of the ten most frequent words within each bing sentiment.
inner_join(news_words, bing) %>%
  filter(word != "trump") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%                         # top ten per sentiment, ranked by n
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%   # order the bars by frequency
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~sentiment, scales = "free") +
  labs(
    x = NULL,
    y = "News headlines: Words that contribute the most to each sentiment"
  ) +
  theme_minimal()
Joining, by = "word"
Selecting by n

# Load the nrc sentiment lexicon and print it for inspection.
nrc <- get_sentiments(lexicon = "nrc")
nrc

# List the sentiment categories the lexicon defines.
distinct(nrc, sentiment)

# Bar chart of the five most frequent words within each nrc sentiment.
inner_join(news_words, nrc) %>%
  filter(word != "trump") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(5) %>%                          # top five per sentiment, ranked by n
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%   # order the bars by frequency
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~sentiment, scales = "free") +
  labs(
    x = NULL,
    y = "News headlines: Words that contribute the most to each sentiment"
  ) +
  theme_minimal()
Joining, by = "word"
Selecting by n

# Tokenise the tweet text into bigrams (n = 2), one bigram per row.
# Only the text column is passed on.
unnest_tokens(
  select(news_tweets, text),
  words,
  text,
  token = "ngrams",
  n = 2
)
NA
# Frequency of every raw bigram (stop words and URL fragments still
# included), most frequent first.
news_tweets %>%
  select(text) %>%
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  count(words, sort = TRUE)
# Bigrams in which neither word is a stop word.
news_tweets %>%
  select(text) %>%                                       # just the tweet text
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  # Tweets too short to form a bigram come back as NA. Without this filter
  # they pass both %in% checks and are re-united as the string "NA NA".
  filter(!is.na(words)) %>%
  separate(words, c("word1", "word2"), sep = " ") %>%    # split temporarily
  filter(
    !word1 %in% stop_words$word,                         # first word is a stop word
    !word2 %in% stop_words$word                          # second word is a stop word
  ) %>%
  unite(words, word1, word2, sep = " ")                  # put them back together
# URL fragments that tokenising tweet links leaves behind.
remove_words <- c("https", "t.co")

# Bigrams in which neither word is a stop word or a URL fragment.
news_tweets %>%
  select(text) %>%
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  # Tweets too short to form a bigram come back as NA. Without this filter
  # they pass every %in% check and are re-united as the string "NA NA".
  filter(!is.na(words)) %>%
  separate(words, c("word1", "word2"), sep = " ") %>%    # split temporarily
  filter(
    !word1 %in% stop_words$word,                         # first word is a stop word
    !word2 %in% stop_words$word                          # second word is a stop word
  ) %>%
  filter(
    !word1 %in% remove_words,                            # drop our remove_words
    !word2 %in% remove_words
  ) %>%
  unite(words, word1, word2, sep = " ")                  # put them back together
# URL fragments that tokenising tweet links leaves behind.
remove_words <- c("https", "t.co")

# Cleaned bigrams, one per row in a single `words` column: neither word is a
# stop word or a URL fragment. Stored for the bigram analyses that follow.
news_bigrams <- news_tweets %>%
  select(text) %>%
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  # Tweets too short to form a bigram come back as NA. Without this filter
  # they pass every %in% check and are re-united as the string "NA NA".
  filter(!is.na(words)) %>%
  separate(words, c("word1", "word2"), sep = " ") %>%    # split temporarily
  filter(
    !word1 %in% stop_words$word,                         # first word is a stop word
    !word2 %in% stop_words$word                          # second word is a stop word
  ) %>%
  filter(
    !word1 %in% remove_words,                            # drop our remove_words
    !word2 %in% remove_words
  ) %>%
  unite(words, word1, word2, sep = " ")                  # put them back together

# Most common cleaned bigrams.
news_bigrams %>%
  count(words, sort = TRUE)
# Word cloud of the 100 most frequent cleaned bigrams.
news_bigrams %>%
  count(words, sort = TRUE) %>%
  top_n(100) %>%          # no weight given: ranks by the last column, n
  wordcloud2(size = .5)
Selecting by n

NA
# Leading words whose followers we want to inspect.
first_word <- c("trump", "covid19")    # these need to be lowercase

# Table of the words that most often follow each entry in first_word.
news_bigrams %>%
  count(words, sort = TRUE) %>%
  separate(words, c("word1", "word2"), sep = " ") %>%   # separate the two words
  filter(word1 %in% first_word) %>%                      # keep first words from our list
  # Sum the per-bigram counts and name the result directly. Renaming from the
  # auto-generated `nn` fails when count() calls the new column `n` instead.
  count(word1, word2, wt = n, sort = TRUE, name = "total")
# Leading words whose followers we want to inspect.
first_word <- c("trump", "covid19")    # these need to be lowercase

# Faceted bar chart of the five most common words following each first_word.
news_bigrams %>%
  count(words, sort = TRUE) %>%
  separate(words, c("word1", "word2"), sep = " ") %>%   # separate the two words
  filter(word1 %in% first_word) %>%                      # keep first words from our list
  # Sum the per-bigram counts and name the result directly. Renaming from the
  # auto-generated `nn` fails when count() calls the new column `n` instead.
  count(word1, word2, wt = n, sort = TRUE, name = "total") %>%
  mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>%   # put the words in order
  group_by(word1) %>%
  top_n(5) %>%                 # top five per first word, ranked by total
  ungroup() %>%                # do not pass grouped data on to the plot
  ggplot(aes(word2, total, fill = word1)) +
  scale_fill_viridis_d() +                                         # set the color palette
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  coord_flip()
Selecting by total

