Assignment: Select another news organization and grab 5000 of their tweets. Some possibilities (organization: Twitter handle): Billings Gazette headlines: billingsgazette; Wall Street Journal: wsj; Washington Post: washingtonpost; CNN: cnn; CNN breaking news: cnnbrk; USA Today: usatoday.
- Unnest the words of the tweets, remove stop words and weird web “words”, and create a table and a word cloud of the top words.
- Conduct a sentiment analysis using bing, remove any errors like trump = positive, and create a graph of the words that contribute most to each sentiment.
- Do the same as above but with the nrc sentiment lexicon.
- Unnest the tweets as bigrams, remove stop words and errors, and create a table and word cloud of the most common bigrams.
- Using the bigrams, look for the most common words that follow two different words. You may choose trump and pelosi, or choose your own.
# Request up to 5000 tweets from the Billings Gazette timeline.
# NOTE(review): Twitter's timeline endpoint has historically capped results
# at roughly 3,200 tweets, so fewer than 5000 rows may come back -- confirm.
news_tweets <- get_timeline("billingsgazette", n = 5000)

# One row per word: split the tweet text into tokens and keep only the
# account name and the token itself.
news_words <- news_tweets %>%
  unnest_tokens(word, text) %>%
  select(screen_name, word)

# Raw word frequencies, most common first.
news_words %>%
  count(word, sort = TRUE)

# Same table after dropping every word in the default get_stopwords() list.
news_words %>%
  anti_join(get_stopwords()) %>%
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies with stop words and the URL fragments "https" / "t.co"
# removed, most common first.
news_words %>%
  anti_join(get_stopwords()) %>%
  filter(
    word != "https",
    word != "t.co"
  ) %>%
  count(word, sort = TRUE)
Joining, by = "word"
# Word cloud of the 200 most frequent words, after removing stop words and
# the URL fragments "https" / "t.co".
news_words %>%
  anti_join(get_stopwords()) %>%
  filter(
    word != "https",
    word != "t.co"
  ) %>%
  count(word, sort = TRUE) %>%
  top_n(200) %>% # no wt given, so top_n() ranks by the last column (n)
  wordcloud2(size = 0.5)
Joining, by = "word"
Selecting by n
# Load the bing sentiment lexicon and print it for inspection.
bing <- get_sentiments("bing")
bing

# Keep only the tweet words that appear in the lexicon, then tally each
# word/sentiment pair from most to least frequent.
inner_join(news_words, bing) %>%
  count(word, sentiment, sort = TRUE)
Joining, by = "word"
# Same bing tally, but with "trump" excluded so the surname is not scored
# as a sentiment word.
inner_join(news_words, bing) %>%
  filter(word != "trump") %>%
  count(word, sentiment, sort = TRUE)
Joining, by = "word"
# Bar chart of the ten most frequent words within each bing sentiment,
# with "trump" excluded; one facet per sentiment.
inner_join(news_words, bing) %>%
  filter(word != "trump") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>% # ranks by the last column (n) within each sentiment
  ungroup() %>%
  mutate(word = reorder(word, n)) %>% # order bars by frequency
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip() + # words on the vertical axis so the labels stay readable
  labs(
    x = NULL,
    y = "News headlines: Words that contribute the most to each sentiment"
  ) +
  theme_minimal()
Joining, by = "word"
Selecting by n

# Load the nrc sentiment lexicon and print it for inspection.
nrc <- get_sentiments("nrc")
nrc

# List the sentiment categories the lexicon contains.
distinct(nrc, sentiment)

# Bar chart of the five most frequent words within each nrc sentiment,
# with "trump" excluded; one facet per sentiment.
inner_join(news_words, nrc) %>%
  filter(word != "trump") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(5) %>% # ranks by the last column (n) within each sentiment
  ungroup() %>%
  mutate(word = reorder(word, n)) %>% # order bars by frequency
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip() + # words on the vertical axis so the labels stay readable
  labs(
    x = NULL,
    y = "News headlines: Words that contribute the most to each sentiment"
  ) +
  theme_minimal()
Joining, by = "word"
Selecting by n

# Split the tweet text into bigrams (pairs of consecutive words), one per row.
select(news_tweets, text) %>% # keep just the text of the tweets
  unnest_tokens(
    words,
    text,
    token = "ngrams",
    n = 2
  )
NA
# Raw bigram frequencies, most common first.
news_tweets %>%
  select(text) %>%
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  count(words, sort = TRUE)

# Bigrams in which neither word is a stop word.
news_tweets %>%
  select(text) %>% # this selects just the text of the tweets
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  # A tweet with fewer than two words can yield an NA bigram (see the
  # tidytext docs). NA passes the %in% filters below and unite() would turn
  # it into the literal string "NA NA", so drop it first.
  filter(!is.na(words)) %>%
  separate(words, c("word1", "word2"), sep = " ") %>% # separate them temporarily
  filter(!word1 %in% stop_words$word) %>% # remove if first word is a stop word
  filter(!word2 %in% stop_words$word) %>% # remove if second word is a stop word
  unite(words, word1, word2, sep = " ") # put them back together
# URL fragments that survive tokenization and should not count as words.
remove_words <- c("https", "t.co")

# Bigrams in which neither word is a stop word or a URL fragment.
news_tweets %>%
  select(text) %>%
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  # A tweet with fewer than two words can yield an NA bigram (see the
  # tidytext docs). NA passes the %in% filters below and unite() would turn
  # it into the literal string "NA NA", so drop it first.
  filter(!is.na(words)) %>%
  separate(words, c("word1", "word2"), sep = " ") %>% # separate them temporarily
  filter(!word1 %in% stop_words$word) %>% # remove if first word is a stop word
  filter(!word2 %in% stop_words$word) %>% # remove if second word is a stop word
  filter(!word1 %in% remove_words) %>% # these two lines remove our remove_words
  filter(!word2 %in% remove_words) %>%
  unite(words, word1, word2, sep = " ") # put them back together
# URL fragments that survive tokenization and should not count as words.
remove_words <- c("https", "t.co")

# Cleaned bigrams: one row per bigram in which neither word is a stop word
# or a URL fragment. Stored so the table, word cloud, and later
# "following word" analysis all share the same data.
news_bigrams <- news_tweets %>%
  select(text) %>%
  unnest_tokens(words, text, token = "ngrams", n = 2) %>%
  # A tweet with fewer than two words can yield an NA bigram (see the
  # tidytext docs). NA passes the %in% filters below and unite() would turn
  # it into the literal string "NA NA", so drop it first.
  filter(!is.na(words)) %>%
  separate(words, c("word1", "word2"), sep = " ") %>% # separate them temporarily
  filter(!word1 %in% stop_words$word) %>% # remove if first word is a stop word
  filter(!word2 %in% stop_words$word) %>% # remove if second word is a stop word
  filter(!word1 %in% remove_words) %>% # these two lines remove our remove_words
  filter(!word2 %in% remove_words) %>%
  unite(words, word1, word2, sep = " ") # put them back together

# Table of the most common bigrams.
news_bigrams %>%
  count(words, sort = TRUE)

# Word cloud of the 100 most common bigrams.
news_bigrams %>%
  count(words, sort = TRUE) %>%
  top_n(100) %>% # no wt given, so top_n() ranks by the last column (n)
  wordcloud2(size = 0.5)
Selecting by n
NA
# Words whose followers we want to inspect.
first_word <- c("trump", "covid19") # these need to be lowercase

# For each chosen first word, count how often every second word follows it.
# The output column is named directly via `name = "total"`: the column that
# count() produces when the input already has an `n` column is called `nn`
# on old dplyr releases but `n` on dplyr >= 1.0, so renaming `nn` afterwards
# breaks on current versions.
following_counts <- news_bigrams %>%
  count(words, sort = TRUE) %>%
  separate(words, c("word1", "word2"), sep = " ") %>% # separate the two words
  filter(word1 %in% first_word) %>% # find first words from our list
  count(word1, word2, wt = n, sort = TRUE, name = "total")

# Table of the following words.
following_counts

# Bar chart of the five most common words following each chosen first word.
following_counts %>%
  mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>% # put the words in order
  group_by(word1) %>%
  top_n(5) %>% # ranks by the last column (total) within each first word
  ungroup() %>%
  ggplot(aes(word2, total, fill = word1)) +
  scale_fill_viridis_d() + # set the color palette
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  coord_flip()
Selecting by total

