library(tidyverse)  
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)   
library(wordcloud2) 
library(pluralize)  
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(topicmodels)
library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
## 
##     flatten
tweet <- get_timeline("JoeBiden",  n = 2500)
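# get_timeline() needs an authorized rtweet token, and the result changes with
# every pull, so caching the data frame keeps the rest of the analysis
# reproducible. A minimal sketch (the file name is illustrative):
write_rds(tweet, "biden_tweets.rds")
# tweet <- read_rds("biden_tweets.rds")  # reload on later runs instead of re-querying
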
term_freq_w_junk <- tweet %>%
  unnest_tokens(word, text) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

term_freq_w_junk %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
get_stopwords()  # snowball lexicon; the joins below use tidytext's larger stop_words dataset
Top_100 <- tweet %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word,"^\\d")) %>% 
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
# Word Cloud of the top 100 terms

Top_100 %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
Top_100_April <- tweet %>%
  filter(created_at > "2021-04-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word,"^\\d")) %>% 
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
Top_100_March <- tweet %>%
  filter(created_at > "2021-03-01", created_at < "2021-04-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word,"^\\d")) %>% 
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
Top_100_Feb <- tweet %>%
  filter(created_at > "2021-02-01", created_at < "2021-03-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word,"^\\d")) %>% 
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
Top_100_Jan <- tweet %>%
  filter(created_at > "2021-01-01", created_at < "2021-02-01") %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("https", "t.co")) %>%
  filter(!str_detect(word,"^\\d")) %>% 
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
## Joining, by = "word"
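# The four monthly tables above differ only in their date window; a small helper
# keeps the cleaning rules in one place. A sketch (it uses >= / < month bounds,
# so it only roughly reproduces the > comparisons above):
count_words_in_month <- function(tweets, month_start) {
  month_start <- as.Date(month_start)
  tweets %>%
    filter(as.Date(created_at) >= month_start,
           as.Date(created_at) < month_start %m+% months(1)) %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!word %in% c("https", "t.co"),
           !str_detect(word, "^\\d")) %>%
    count(word, sort = TRUE)
}
# e.g. count_words_in_month(tweet, "2021-03-01") corresponds to Top_100_March
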
# Word Cloud of the top 100 terms in January, February, March, and April

Top_100_Jan %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
Top_100_Feb %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
Top_100_March %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
Top_100_April %>%
  top_n(100) %>%
  wordcloud2()
## Selecting by n
# The word clouds do change over time; however, the overall theme remains the same.
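# One way to check that impression is tf-idf across the monthly tables: terms
# with a high tf-idf are the ones that set a month apart from the others.
# A sketch reusing the tables built above:
bind_rows(
  Jan = Top_100_Jan, Feb = Top_100_Feb,
  Mar = Top_100_March, Apr = Top_100_April,
  .id = "month"
) %>%
  bind_tf_idf(word, month, n) %>%
  group_by(month) %>%
  slice_max(tf_idf, n = 10)
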
bigram_freq <- 
  tweet %>%
    unnest_tokens(bigram, text, 
            token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  anti_join(stop_words, by= c("word1" = "word")) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% c("https","t.co")) %>%
  filter(!word2 %in% c("t.co")) %>%
  filter(!str_detect(word2,"^\\d")) %>% 
  filter(!str_detect(word1,"^\\d")) %>% 
  count(word1, word2, sort=TRUE)

bigram_freq %>%
  head(20)
bigram_freq <- 
  tweet %>%
    unnest_tokens(bigram, text, 
            token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% c("https","t.co")) %>%
  filter(!word2 %in% c("t.co")) %>%
  filter(!str_detect(word1,"^\\d")) %>% 
  filter(!str_detect(word2,"^\\d")) %>% 
  mutate(word1 = singularize(word1)) %>%
  mutate(bigram = str_c( word1, word2, sep = " ")) %>%
  count(bigram, word1, word2, sort=TRUE) 

# Word Cloud of the top 100 bi-grams

bigram_freq %>%
  top_n(100) %>%
  mutate(word1 = singularize(word1)) %>%
  mutate(word2 = singularize(word2)) %>%
  filter(word1 != word2) %>%
  mutate(bigram = str_c( word1, word2, sep = " ")) %>%
  group_by(bigram) %>%
  summarise(n = sum(n)) %>%
  select(bigram, n) %>%
  arrange(desc(n)) %>%
  wordcloud2(size=0.5)
## Selecting by n
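# A bigram network is another way to read these pairs. This sketch assumes the
# igraph and ggraph packages are installed (neither is loaded above); the
# n > 20 cutoff is an arbitrary choice:
library(ggraph)

bigram_freq %>%
  filter(n > 20) %>%
  select(word1, word2, n) %>%
  igraph::graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n)) +
  geom_node_point() +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
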
bigram_freq %>%
  filter(word1 == "american") %>%
  mutate(word1 = singularize(word1)) %>%
  mutate(word2 = singularize(word2)) %>%
  filter(word1 != word2) %>%
  # ungroup() %>%
  top_n(10) %>%
  ggplot(aes(x=reorder(bigram, n),n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 10 words following 'american'",
       x = "american + word",
       y = "count")
## Selecting by n

nrc <- get_sentiments("nrc") 

tweet_sentiment <- Top_100 %>%
  left_join(nrc) %>%
  mutate(sentiment = if_else(is.na(sentiment), "neutral", sentiment)) %>%
  filter(sentiment %in% c("positive", "negative", "neutral")) 
## Joining, by = "word"
tweet_sentiment %>%
  top_n(5,n) %>%
  ggplot(aes(reorder(word,n),n, fill=sentiment)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(title = "Top 5 words by sentiment - NRC",
       y = "count",
       x = "word")

tweet_sentiment %>%
  group_by(sentiment) %>%
  top_n(5,n) %>%
  ggplot(aes(reorder(word,n),n, fill=sentiment)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(title = "Top 5 words per sentiment - NRC",
       y = "count",
       x = "word")

tweet_sentiment %>%
  group_by(sentiment) %>%
  top_n(5,n) %>%
  mutate(n = if_else(sentiment == "positive", n, -n)) %>%
  ggplot(aes(reorder(word,n),n, fill=sentiment)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(title = "Top 5 words per sentiment, diverging - NRC",
       y = "count",
       x = "word")
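
# Sentiment can also be tracked over time; this sketch rolls the tweets up by
# month and uses the bing lexicon (an assumption - the analysis above uses NRC):
tweet %>%
  mutate(month = floor_date(created_at, "month")) %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(month, sentiment) %>%
  ggplot(aes(month, n, fill = sentiment)) +
  geom_col(position = "dodge") +
  labs(title = "Tweet sentiment by month - Bing lexicon",
       x = "month", y = "count")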

tweet_dtf <- tweet %>%
  mutate(document = row_number()) %>%
  unnest_tokens(word,text) %>%
  anti_join(get_stopwords(language="en")) %>%
  filter(!str_detect(word,"^\\d")) %>% 
  filter(!word %in% c("t.co", "http", "https")) %>%
  filter(str_length(word) > 1) %>%
  mutate(word = str_squish(word),
         word = gsub("\\.", "", word)) %>%
  mutate(word = singularize(word)) %>%
  count(document, word, sort=TRUE) 
## Joining, by = "word"
tweet_dtf %>%
  arrange(-n)
tweet_dtm <- tweet_dtf %>%
  cast_dtm(document, word, n)

tweet_dtm
## <<DocumentTermMatrix (documents: 999, terms: 3412)>>
## Non-/sparse entries: 15234/3393354
## Sparsity           : 100%
## Maximal term length: 24
## Weighting          : term frequency (tf)
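# With sparsity this high, most of the vocabulary consists of very rare terms;
# trimming them can speed up the LDA fits below. A sketch using the tm package
# (a topicmodels dependency; the 0.99 threshold and the object name are
# arbitrary choices):
tweet_dtm_trimmed <- tm::removeSparseTerms(tweet_dtm, 0.99)
tweet_dtm_trimmed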
tweet_topics_5 <- LDA(tweet_dtm, k=5, method = "Gibbs", 
            control = list(seed = 1234) )
tweet_topics_5
## A LDA_Gibbs topic model with 5 topics.
tweet_topics_10 <- LDA(tweet_dtm, k=10, method = "Gibbs", 
            control = list(seed = 1234) )
tweet_topics_10
## A LDA_Gibbs topic model with 10 topics.
options(scipen = 999)
tidy_tweet_topics10 <- tidy(tweet_topics_10)
tidy_tweet_topics10 
tidy_tweet_topics5 <- tidy(tweet_topics_5)
tidy_tweet_topics5 %>%
  arrange(desc(beta))
tweet_topic_terms10 <- tidy_tweet_topics10 %>%
  filter(!term %in% c("got", "get", "lot", "try")) %>%
  group_by(topic) %>%
  top_n(4,beta)

tweet_topic_terms10 %>%
  ungroup() %>%
  arrange(topic, -beta)
tweet_topic_terms5 <- tidy_tweet_topics5 %>%
  filter(!term %in% c("got", "get", "lot", "try")) %>%
  group_by(topic) %>%
  top_n(4,beta)

tweet_topic_terms5 %>%
  ungroup() %>%
  arrange(topic, -beta)
tweet_topic_terms10 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) + 
  facet_wrap( ~ topic, scales = "free") + 
  coord_flip() +
  labs(title = "Biden Twitter Topics",
       x = "Topic Terms",
       y = "Beta")

tweet_topic_terms5 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) + 
  facet_wrap( ~ topic, scales = "free") + 
  coord_flip() +
  labs(title = "Biden Twitter Topics",
       x = "Topic Terms",
       y = "Beta")

# I believe k = 5 was more concise; however, the topics remained very similar.
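# One way to probe that similarity is the per-document topic mix (gamma): if
# most tweets load on a single dominant topic under both models, the extra
# topics in k = 10 add little. A sketch for the k = 5 model:
tidy(tweet_topics_5, matrix = "gamma") %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%  # dominant topic per tweet
  ungroup() %>%
  count(topic, sort = TRUE)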