library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(wordcloud2)
library(pluralize)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
library(topicmodels)
library(rtweet)
##
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
##
## flatten
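# get_timeline() needs an authorized Twitter token. A minimal sketch of one way to set that up,
# assuming rtweet 0.7-style authentication with credentials stored in environment variables
# (the app name and variable names below are placeholders, not values used in this analysis):
token <- create_token(
  app             = "my_app_name",
  consumer_key    = Sys.getenv("TWITTER_CONSUMER_KEY"),
  consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
  access_token    = Sys.getenv("TWITTER_ACCESS_TOKEN"),
  access_secret   = Sys.getenv("TWITTER_ACCESS_SECRET")
)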
# Pull the most recent 2,500 tweets from the @JoeBiden timeline
tweet <- get_timeline("JoeBiden", n = 2500)

# Raw term frequencies, before removing stop words and link fragments
term_freq_w_junk <- tweet %>%
  unnest_tokens(word, text) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

term_freq_w_junk %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
# Peek at the default stop-word lexicon provided by tidytext
get_stopwords()
# All cleaned terms, sorted by frequency (the top 100 are selected for the cloud below)
Top_100 <- tweet %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word, "^\\d")) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
# Word cloud of the top 100 terms
Top_100 %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
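# The same junk-token filters ("https", "t.co", leading digits) are repeated in every chunk below.
# A minimal sketch of an alternative: extend stop_words once with a custom lexicon and anti_join
# against that instead (custom_stop_words is a name introduced here for illustration only):
custom_stop_words <- bind_rows(
  stop_words,
  tibble(word = c("https", "t.co"), lexicon = "custom")
)
tweet %>%
  unnest_tokens(word, text) %>%
  anti_join(custom_stop_words, by = "word") %>%
  filter(!str_detect(word, "^\\d")) %>%
  count(word, sort = TRUE)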
# Terms from tweets posted in April 2021
Top_100_April <- tweet %>%
  filter(created_at > "2021-04-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word, "^\\d")) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
# Terms from tweets posted in March 2021
Top_100_March <- tweet %>%
  filter(created_at > "2021-03-01", created_at < "2021-04-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word, "^\\d")) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
# Terms from tweets posted in February 2021
Top_100_Feb <- tweet %>%
  filter(created_at > "2021-02-01", created_at < "2021-03-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word, "^\\d")) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
# Terms from tweets posted in January 2021
Top_100_Jan <- tweet %>%
  filter(created_at > "2021-01-01", created_at < "2021-02-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word, "^\\d")) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
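# The four monthly chunks above differ only in their date window, so they could be wrapped in a
# small helper. A minimal sketch (monthly_terms is a name introduced here for illustration):
monthly_terms <- function(tweets, start, end) {
  tweets %>%
    filter(created_at > start, created_at < end) %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!word %in% c("https", "t.co")) %>%
    filter(!str_detect(word, "^\\d")) %>%
    count(word, sort = TRUE)
}
# e.g. Top_100_March could be rebuilt as monthly_terms(tweet, "2021-03-01", "2021-04-01")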
# Word clouds of the top 100 terms in January, February, March, and April
Top_100_Jan %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
Top_100_Feb %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
Top_100_March %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
Top_100_April %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
# The word clouds shift somewhat from month to month, but the dominant themes stay the same over time.
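# One way to check that claim: count how many of each earlier month's top 20 terms also appear in
# April's top 20. A minimal sketch (top_overlap is a helper name introduced here for illustration);
# each monthly table is already sorted by descending n, so head() returns its top terms:
top_overlap <- function(a, b, k = 20) {
  length(intersect(head(a, k)$word, head(b, k)$word))
}
top_overlap(Top_100_Jan, Top_100_April)
top_overlap(Top_100_Feb, Top_100_April)
top_overlap(Top_100_March, Top_100_April)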
# First pass at bigram counts (stop words, link fragments, and leading digits removed)
bigram_freq <-
  tweet %>%
  unnest_tokens(bigram, text,
                token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  anti_join(stop_words, by = c("word1" = "word")) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% c("https", "t.co")) %>%
  filter(!word2 %in% c("t.co")) %>%
  filter(!str_detect(word2, "^\\d")) %>%
  filter(!str_detect(word1, "^\\d")) %>%
  count(word1, word2, sort = TRUE)
bigram_freq %>%
  head(20)
# Second pass: singularize word1 and keep the combined bigram string for plotting
bigram_freq <-
  tweet %>%
  unnest_tokens(bigram, text,
                token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% c("https", "t.co")) %>%
  filter(!word2 %in% c("t.co")) %>%
  filter(!str_detect(word1, "^\\d")) %>%
  filter(!str_detect(word2, "^\\d")) %>%
  mutate(word1 = singularize(word1)) %>%
  mutate(bigram = str_c(word1, word2, sep = " ")) %>%
  count(bigram, word1, word2, sort = TRUE)
# Word cloud of the top 100 bigrams
bigram_freq %>%
  top_n(100) %>%
  mutate(word1 = singularize(word1)) %>%
  mutate(word2 = singularize(word2)) %>%
  filter(word1 != word2) %>%
  mutate(bigram = str_c(word1, word2, sep = " ")) %>%
  group_by(bigram) %>%
  summarise(n = sum(n)) %>%
  select(bigram, n) %>%
  arrange(desc(n)) %>%
  wordcloud2(size = 0.5)
## Selecting by n
# Top bigrams that start with "american"
bigram_freq %>%
  filter(word1 == "american") %>%
  mutate(word1 = singularize(word1)) %>%
  mutate(word2 = singularize(word2)) %>%
  filter(word1 != word2) %>%
  top_n(10) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 10 words following \"american\"",
       x = "american + word",
       y = "count")
## Selecting by n
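# Another common view of these pairs is a bigram network. A minimal sketch, assuming the igraph
# and ggraph packages are installed (neither is loaded above); the n > 5 cutoff is an arbitrary
# example threshold:
library(igraph)
library(ggraph)
bigram_freq %>%
  filter(n > 5) %>%
  select(word1, word2, n) %>%   # graph_from_data_frame() expects the edge endpoints first
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n)) +
  geom_node_point() +
  geom_node_text(aes(label = name), repel = TRUE)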

nrc <- get_sentiments("nrc")

# Join NRC sentiment labels onto the cleaned term counts; words with no NRC entry are treated as neutral
tweet_sentiment <- Top_100 %>%
  left_join(nrc) %>%
  mutate(sentiment = if_else(is.na(sentiment), "neutral", sentiment)) %>%
  filter(sentiment %in% c("positive", "negative", "neutral"))
## Joining, by = "word"
tweet_sentiment %>%
  top_n(5, n) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(title = "Top 5 words, by sentiment (NRC)",
       y = "count",
       x = "word")

tweet_sentiment %>%
  group_by(sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(title = "Top 5 words within each sentiment (NRC)",
       y = "count",
       x = "word")

# Diverging version: drop neutral words and plot negative counts below zero
tweet_sentiment %>%
  filter(sentiment != "neutral") %>%
  group_by(sentiment) %>%
  top_n(5, n) %>%
  mutate(n = if_else(sentiment == "positive", n, -n)) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(title = "Top 5 positive and negative words by sentiment",
       y = "count",
       x = "word")
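# Summing the positive and negative counts gives a rough overall balance for the corpus.
# A minimal sketch using the tweet_sentiment table built above:
tweet_sentiment %>%
  filter(sentiment != "neutral") %>%
  group_by(sentiment) %>%
  summarise(total = sum(n)) %>%
  mutate(share = total / sum(total))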

# Build per-tweet word counts for the document-term matrix
tweet_dtf <- tweet %>%
  mutate(document = row_number()) %>%
  unnest_tokens(word, text) %>%
  anti_join(get_stopwords(language = "en")) %>%
  filter(!str_detect(word, "^\\d")) %>%
  filter(!word %in% c("t.co", "http", "https")) %>%
  filter(str_length(word) > 1) %>%
  mutate(word = str_squish(word),
         word = gsub("\\.", "", word)) %>%
  mutate(word = singularize(word)) %>%
  count(document, word, sort = TRUE)
## Joining, by = "word"
tweet_dtf %>%
  arrange(-n)

# Cast the counts into a DocumentTermMatrix for topicmodels
tweet_dtm <- tweet_dtf %>%
  cast_dtm(document, word, n)
tweet_dtm
## <<DocumentTermMatrix (documents: 999, terms: 3412)>>
## Non-/sparse entries: 15234/3393354
## Sparsity : 100%
## Maximal term length: 24
## Weighting : term frequency (tf)
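# The matrix is almost entirely sparse, so rare terms could be trimmed before fitting LDA.
# A minimal sketch, assuming the tm package is installed; 0.995 is an arbitrary example
# threshold (terms must appear in at least ~0.5% of documents to survive):
library(tm)
tweet_dtm_trimmed <- removeSparseTerms(tweet_dtm, sparse = 0.995)
tweet_dtm_trimmed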
# Fit LDA topic models with 5 and 10 topics (Gibbs sampling, fixed seed)
tweet_topics_5 <- LDA(tweet_dtm, k = 5, method = "Gibbs",
                      control = list(seed = 1234))
tweet_topics_5
## A LDA_Gibbs topic model with 5 topics.
tweet_topics_10 <- LDA(tweet_dtm, k = 10, method = "Gibbs",
                       control = list(seed = 1234))
tweet_topics_10
## A LDA_Gibbs topic model with 10 topics.
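# k = 5 and k = 10 are picked by hand here. A minimal sketch of a data-driven check on the number
# of topics, assuming the ldatuning package is installed (it is not loaded above):
library(ldatuning)
k_search <- FindTopicsNumber(
  tweet_dtm,
  topics  = seq(2, 20, by = 2),
  metrics = c("CaoJuan2009", "Deveaud2014"),
  method  = "Gibbs",
  control = list(seed = 1234)
)
FindTopicsNumber_plot(k_search)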
options(scipen = 999)
# Per-topic word probabilities (beta) in tidy form
tidy_tweet_topics10 <- tidy(tweet_topics_10)
tidy_tweet_topics10
tidy_tweet_topics5 <- tidy(tweet_topics_5)
tidy_tweet_topics5 %>%
  arrange(desc(beta))
# Top 4 terms per topic for each model, after dropping a few low-information words
tweet_topic_terms10 <- tidy_tweet_topics10 %>%
  filter(!term %in% c("got", "get", "lot", "try")) %>%
  group_by(topic) %>%
  top_n(4, beta)
tweet_topic_terms10 %>%
  ungroup() %>%
  arrange(topic, -beta)

tweet_topic_terms5 <- tidy_tweet_topics5 %>%
  filter(!term %in% c("got", "get", "lot", "try")) %>%
  group_by(topic) %>%
  top_n(4, beta)
tweet_topic_terms5 %>%
  ungroup() %>%
  arrange(topic, -beta)
tweet_topic_terms10 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  labs(title = "Biden Twitter topics (k = 10)",
       x = "Topic terms",
       y = "Beta")

tweet_topic_terms5 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  labs(title = "Biden Twitter topics (k = 5)",
       x = "Topic terms",
       y = "Beta")

# I believe k = 5 produced more concise topics; however, the topics themselves remained very similar across the two models.
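# One way to back this up: compare how concentrated the per-document topic weights (gamma) are
# under each model. A minimal sketch using tidytext's tidy() on the fitted models
# (dominant_gamma is a helper name introduced here for illustration):
dominant_gamma <- function(model) {
  tidy(model, matrix = "gamma") %>%
    group_by(document) %>%
    summarise(max_gamma = max(gamma)) %>%
    summarise(mean_max_gamma = mean(max_gamma))
}
dominant_gamma(tweet_topics_5)    # higher mean max gamma = documents load more cleanly on one topic
dominant_gamma(tweet_topics_10)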