# Download each outlet's timeline, requesting 5000 tweets per account.
# NOTE(review): get_timeline() is presumably rtweet's; the number of tweets
# actually returned per account is not checked here.
nyt_tweets     <- get_timeline(user = "nytimes",        n = 5000)
wsj_tweets     <- get_timeline(user = "wsj",            n = 5000)
wapo_tweets    <- get_timeline(user = "washingtonpost", n = 5000)
cnn_tweets     <- get_timeline(user = "cnn",            n = 5000)
usa_tweets     <- get_timeline(user = "usatoday",       n = 5000)
fox_tweets     <- get_timeline(user = "foxnews",        n = 5000)
ap_tweets      <- get_timeline(user = "ap",             n = 5000)
aje_tweets     <- get_timeline(user = "ajenglish",      n = 5000)
bbc_tweets     <- get_timeline(user = "bbcworld",       n = 5000)
reuters_tweets <- get_timeline(user = "reuters",        n = 5000)
# Turn one outlet's tweets into a one-row-per-word table.
#
# `tweets` must contain the columns `text`, `screen_name` and `created_at`.
# The `text` column is tokenized into a `word` column, and only the account
# name, the word and the tweet timestamp are kept.
tokenize_tweets <- function(tweets) {
  tweets %>%
    unnest_tokens(word, text) %>%
    select(screen_name, word, created_at)
}

# Same tokenization applied to every outlet's timeline.
nyt_words     <- tokenize_tweets(nyt_tweets)
wsj_words     <- tokenize_tweets(wsj_tweets)
wapo_words    <- tokenize_tweets(wapo_tweets)
cnn_words     <- tokenize_tweets(cnn_tweets)
usa_words     <- tokenize_tweets(usa_tweets)
fox_words     <- tokenize_tweets(fox_tweets)
ap_words      <- tokenize_tweets(ap_tweets)
aje_words     <- tokenize_tweets(aje_tweets)
bbc_words     <- tokenize_tweets(bbc_tweets)
reuters_words <- tokenize_tweets(reuters_tweets)
First, I requested the 5,000 most recent tweets from each of ten news organizations, both domestic and international, and split them into individual words so that the wording could be matched to sentiments.
# Word frequencies for the New York Times tweets, most frequent first.
nyt_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the Wall Street Journal tweets, most frequent first.
wsj_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the CNN tweets, most frequent first.
cnn_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the Fox News tweets, most frequent first.
fox_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the Washington Post tweets, most frequent first.
wapo_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the USA Today tweets, most frequent first.
usa_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the AP tweets, most frequent first.
ap_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the Al Jazeera English tweets, most frequent first.
aje_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the BBC World tweets, most frequent first.
bbc_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
# Word frequencies for the Reuters tweets, most frequent first.
reuters_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
These tables show the words used by each news organization, sorted from most to least frequent.
# Merge the ten per-outlet word tables into one table, folding full_join()
# over them from left to right. No `by` is given, so every join matches on
# all shared columns (screen_name, word, created_at).
# NOTE(review): if the intent is only to stack the tables, bind_rows() would
# do that without join semantics -- confirm before changing.
news_words <- Reduce(
  full_join,
  list(
    cnn_words, fox_words, nyt_words, usa_words, wapo_words,
    wsj_words, aje_words, ap_words, bbc_words, reuters_words
  )
)
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
Joining, by = c("screen_name", "word", "created_at")
I combined the words from all ten organizations into a single data set in order to analyze them as a whole.
# Word frequencies across the combined table of all outlets, most frequent
# first.
news_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE)
Joining, by = "word"
This table shows the most frequently used words, ranked by count. Coronavirus is (unsurprisingly) at the top.
# Word cloud of the most frequent words in the combined table.
news_words %>%
  # Remove every word found in the stop-word table; the join key is inferred
  # from the shared `word` column.
  anti_join(get_stopwords()) %>%
  # Drop the "https" and "t.co" tokens, which come from links, not wording.
  filter(word != "https", word != "t.co") %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(word, sort = TRUE) %>%
  # No weighting column is named, so top_n() ranks by the count column `n`.
  top_n(1000) %>%
  wordcloud2(size = 0.6)
Joining, by = "word"
Selecting by n
This word cloud shows the 1,000 words most frequently used by the news organizations.
# Sentiment lexicon used to label words.
bing_words <- get_sentiments("bing")

# Horizontal bar chart, one facet per sentiment, of the words that occur most
# often among those present in the lexicon.
news_words %>%
  # Keep only words that appear in the lexicon (joined on `word`).
  inner_join(bing_words) %>%
  # NOTE(review): "trump" and "like" are excluded, presumably because they are
  # used here as a name and a filler word rather than as sentiment -- confirm.
  filter(word != "trump", word != "like") %>%
  count(word, sentiment, sort = TRUE) %>%
  # Top 10 by count within each sentiment.
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  # Order the bars by count.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  coord_flip() +
  labs(
    x = NULL,
    y = "Bing Sentiment Contribution by Word (Major News Outlet Tweets)"
  ) +
  theme_minimal()
Joining, by = "word"
Selecting by n

Here are the most frequent words within each Bing sentiment category. The pandemic is terrifying; here is the proof.
# Sentiment lexicon used to label words.
nrc_words <- get_sentiments("nrc")

# Per-outlet counts of lexicon words, one row per word / sentiment /
# screen_name combination.
nrc_sentiments <- news_words %>%
  # Keep only words that appear in the lexicon (joined on `word`).
  inner_join(nrc_words) %>%
  filter(word != "trump") %>%
  count(word, sentiment, screen_name, sort = TRUE) %>%
  # Keep the 1000 highest-count rows within each sentiment.
  group_by(sentiment) %>%
  top_n(1000) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
Joining, by = "word"
Selecting by n
# Label geometry for a circular bar chart: one label per row of
# nrc_sentiments, rotated to follow its bar around the circle.
sentiments_labeled <- nrc_sentiments
number_of_bar <- nrow(sentiments_labeled)
# The angle must come from each bar's position (1..number_of_bar), not from its
# count `n`; otherwise the angles are not spread evenly over 360 degrees.
# Subtracting 0.5 centres the label on its bar.
angle <- 90 - 360 * (seq_len(number_of_bar) - 0.5) / number_of_bar
# Labels on the left half of the circle are right-aligned and rotated by 180
# degrees so they are not drawn upside down.
sentiments_labeled$hjust <- ifelse(angle < -90, 1, 0)
sentiments_labeled$angle <- ifelse(angle < -90, angle + 180, angle)
# Stacked bar per account showing how its counted words are split across
# sentiments; position = "fill" scales every bar to the same height so the
# segments read as proportions.
nrc_sentiments %>%
  ggplot(aes(x = screen_name, y = n, fill = sentiment)) +
  geom_col(position = "fill") +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "NRC Sentiment Contribution by Word",
    x = "Screen Name",
    y = "",
    fill = "Sentiments"
  ) +
  theme_gray() +
  # Rotate the account names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90))

This chart shows how the words used by each of the ten news organizations are distributed across the sentiments they convey. The angriest and also the most trusting organization is Fox: “Most Watched, Most Trusted.”
CNN narrowly beats out USA Today as the least angry.
# Horizontal bar chart, one facet per sentiment, of the words that occur most
# often among those present in nrc_words.
news_words %>%
  # Keep only words that appear in the lexicon (joined on `word`).
  inner_join(nrc_words) %>%
  filter(word != "trump") %>%
  count(word, sentiment, sort = TRUE) %>%
  # Top 5 by count within each sentiment.
  group_by(sentiment) %>%
  top_n(5) %>%
  ungroup() %>%
  # Order the bars by count.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  coord_flip() +
  labs(
    x = NULL,
    y = "NRC Sentiment Contribution by Word (Major News Network Tweets)"
  ) +
  theme_minimal()
Joining, by = "word"
Selecting by n

This is a more detailed view of which words are being used across news media, showing the most frequent words within each NRC sentiment.
