Review

  • I went back to the drawing board because I did not exclude retweets when we scraped the data. Because of that, there were a lot of duplicates, which skewed the results of the text analysis. I grabbed a fresh set of tweets (N = 2500), this time excluding retweets; a sketch of that pull is below.
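  • A minimal sketch of that pull with rtweet (the query, language filter, and file name here are placeholders, not the exact call I ran):
# Hypothetical pull: search_tweets() with include_rts = FALSE drops retweets at collection time
wealth_tweets <- rtweet::search_tweets(
  q = "wealth",          # placeholder query
  n = 2500,
  include_rts = FALSE,   # exclude retweets to avoid duplicate text
  lang = "en"            # assumed language filter
)
saveRDS(wealth_tweets, "wealth_tweets.rds")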

Load Libraries

pacman::p_load(
  rtweet, 
  quanteda, 
  ggplot2, 
  tidyverse, 
  tidytext, 
  tm, 
  topicmodels, 
  textcat, 
  SnowballC, 
  stringr, 
  data.table,
  stm
)
wealth_tweets <- readRDS("wealth_tweets.rds")

Preprocessing Text

wealth_tweets_clean <- wealth_tweets %>%
  mutate(text = text %>%
           iconv("latin1", "ASCII", sub = "") %>%                # drop non-ASCII characters
           str_replace_all("https://t.co/[A-Za-z0-9]*", "") %>%  # strip all t.co links, not just the first
           str_replace_all("&amp;", "")) %>%                     # strip HTML-escaped ampersands
  filter(textcat(text) == "english")                             # keep tweets classified as English

TidyText Method

wealth_tweets_tidy <- wealth_tweets_clean %>% 
  select(created_at, is_retweet, screen_name, text) %>%
  unnest_tokens("word", text) %>% 
  anti_join(stop_words)
## Joining, by = "word"
wealth_tweets_tidy <- wealth_tweets_tidy[-grep("\\b\\d+\\b", wealth_tweets_tidy$word),] # drop purely numeric tokens
wealth_tweets_tidy$word <- gsub("\\s+", "", wealth_tweets_tidy$word) # strip stray whitespace
wealth_tweets_tidy$word <- gsub("’", "", wealth_tweets_tidy$word)    # strip curly apostrophes
wealth_tweets_tidy <- wealth_tweets_tidy %>%
  mutate(word = wordStem(word, language = "en")) # stem tokens (replaces the deprecated funs() call)
top_words <- wealth_tweets_tidy %>%
  filter(!(word == "https" |
           word == "rt" |
           word == "t.co" |
           word == "amp")) %>%
  count(word) %>%
  arrange(desc(n))
top_words %>%
  slice(1:20) %>%
  ggplot(aes(x = reorder(word, -n), y = n, fill = word)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 13),
        plot.title = element_text(hjust = 0.5, size = 18)) +
  ylab("Frequency") +
  xlab("") +
  ggtitle("Most Frequent Words in Wealth Tweets") +
  guides(fill = FALSE)

wealth_tweets_tfidf <- wealth_tweets_tidy %>%
  filter(!(word == "https" |
           word == "rt" |
           word == "t.co" |
           word == "amp")) %>%
  count(word, created_at) %>%
  bind_tf_idf(word, created_at, n)

top_words_tfidf <- wealth_tweets_tfidf %>%
  arrange(desc(tf_idf))


top_words_tfidf %>%
  slice(1:20) %>%
  ggplot(aes(x = reorder(word, -tf_idf), y = tf_idf, fill = word)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 13),
        plot.title = element_text(hjust = 0.5, size = 18)) +
  ylab("TF-IDF") +
  xlab("") +
  ggtitle("Most Important Words in Wealth Tweets") +
  guides(fill = FALSE)
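
  • One caveat on the TF-IDF step above: created_at is a per-second timestamp, so tweets only share a "document" when they were posted in the same second. If a coarser grouping is wanted, the same call can be made after flooring the timestamp to the day; a minimal sketch (not run here):
# Sketch: day-level TF-IDF instead of per-timestamp grouping (the `day` column is new here)
wealth_tfidf_daily <- wealth_tweets_tidy %>%
  filter(!(word %in% c("https", "rt", "t.co", "amp"))) %>%
  mutate(day = as.Date(created_at)) %>%
  count(word, day) %>%
  bind_tf_idf(word, day, n) %>%
  arrange(desc(tf_idf))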

Corpus Creation and DTM

wealth_corpus <- Corpus(VectorSource(as.vector(wealth_tweets_clean$text))) 
wealth_corpus_clean <- wealth_corpus %>% 
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeNumbers)) %>%
  tm_map(content_transformer(stripWhitespace)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(content_transformer(stemDocument), language = "en")
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("en")): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., content_transformer(removePunctuation)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., content_transformer(removeNumbers)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., content_transformer(stripWhitespace)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., content_transformer(stemDocument), language =
## "en"): transformation drops documents
wealth_DTM <- DocumentTermMatrix(wealth_corpus_clean, control = list(wordLengths = c(2, Inf)))
rowTotals <- apply(wealth_DTM, 1, sum) # find the sum of words in each document
wealth_DTM <- wealth_DTM[rowTotals > 0, ] # drop documents that are empty after cleaning

##wealth_DTM_weighted <- weightTfIdf(wealth_DTM)

topic_model <- LDA(wealth_DTM, k = 10, control = list(seed = 321))


topics <- tidy(topic_model, matrix = "beta")

topics_terms <- 
  topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
get_terms(topic_model, 10)
##       Topic 1   Topic 2  Topic 3  Topic 4  Topic 5   Topic 6   Topic 7 
##  [1,] "wealth"  "wealth" "wealth" "wealth" "wealth"  "wealth"  "will"  
##  [2,] "the"     "get"    "the"    "the"    "peopl"   "the"     "us"    
##  [3,] "will"    "peopl"  "can"    "get"    "tax"     "peopl"   "wealth"
##  [4,] "one"     "just"   "money"  "can"    "will"    "generat" "famili"
##  [5,] "billion" "the"    "time"   "you"    "money"   "get"     "you"   
##  [6,] "work"    "need"   "it"     "rich"   "need"    "can"     "want"  
##  [7,] "invest"  "year"   "us"     "black"  "the"     "rich"    "care"  
##  [8,] "build"   "work"   "peopl"  "make"   "like"    "invest"  "peopl" 
##  [9,] "make"    "manag"  "famili" "also"   "just"    "money"   "work"  
## [10,] "way"     "health" "just"   "that"   "generat" "famili"  "rich"  
##       Topic 8  Topic 9  Topic 10 
##  [1,] "tax"    "wealth" "wealth" 
##  [2,] "peopl"  "us"     "peopl"  
##  [3,] "can"    "name"   "will"   
##  [4,] "rich"   "dean"   "generat"
##  [5,] "we"     "scuff"  "it"     
##  [6,] "money"  "will"   "work"   
##  [7,] "one"    "give"   "get"    
##  [8,] "like"   "and"    "time"   
##  [9,] "wealth" "in"     "us"     
## [10,] "say"    "pray"   "tax"
topics_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
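
  • The per-document topic proportions (the gamma matrix) can be tidied the same way as the betas; a short sketch using the same fitted model (not run here):
# Sketch: document-topic proportions from the same LDA fit
topic_gamma <- tidy(topic_model, matrix = "gamma")
topic_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>% # most likely topic per document
  ungroup() %>%
  count(topic, sort = TRUE)   # how many documents land in each topic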

Dictionary-Based Analysis

  • Unfortunately, hashtags live in a separate column when tweets are downloaded with rtweet. I created a dictionary from the terms we discussed and searched for them in the tweet text; a sketch for counting the hashtags themselves follows the output below.
dictionary <- c("wealth gap", "black wealth", "poverty", "global wealth", "economic mobility")

dic_tweets <- wealth_tweets_clean[str_detect(wealth_tweets_clean$text, paste(dictionary, collapse = "|")), ]

head(dic_tweets)
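
  • If the hashtags themselves are needed, they sit in a list-column of the rtweet output (the column name hashtags is an assumption based on the rtweet version used here); a minimal sketch for counting them:
# Sketch: count hashtags from the rtweet list-column (column name `hashtags` is assumed)
hashtag_counts <- wealth_tweets_clean %>%
  select(status_id, hashtags) %>%
  tidyr::unnest(cols = hashtags) %>%
  filter(!is.na(hashtags)) %>%
  count(hashtags, sort = TRUE)
head(hashtag_counts, 10)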

Analysis with Quanteda

  • I used quanteda to dig into some n-gram analysis (weighted by TF-IDF).
library(quanteda)

wealth_tweets_fc <- wealth_tweets_clean %>%
  rename(docid = status_id) ## renaming status_id to docid to create a corpus with the whole data frame and retain metadata. 
  
corpus <- quanteda::corpus(wealth_tweets_fc) ## create corpus

token <- tokens(corpus, remove_punct = T, remove_symbols = T, remove_url = T,
                remove_separators = T, remove_numbers = T, include_docvars = T) %>%
  tokens_remove(stopwords("english")) ## create tokens, removing punctuation, symbols, URLs, separators, and numbers, then drop English stopwords
n_grams <- tokens_ngrams(token, n = 1:2, concatenator = " ") ## unigram and bigram tokens
dfm <- dfm(n_grams) ## document-feature matrix
dfm_weighted <- dfm_tfidf(dfm) ## TF-IDF weighting
Grams_imm <- topfeatures(dfm_weighted, 20) # top 20 unigrams and bigrams (TF-IDF weighted)
top_20_ngrams <- setDT(as.data.frame(Grams_imm), keep.rownames = T)
top_20_ngrams %>%
  ggplot(aes(x = reorder(rn, -Grams_imm), y = Grams_imm)) +
  geom_col(width = .8) +
  coord_flip() +
  xlab("n-gram") +
  ylab("TF-IDF") +
  theme_bw() + # apply theme_bw() first so the font setting below isn't overridden
  theme(text = element_text(size = 12, family = "Times New Roman", colour = "black"))
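
  • If a bigram-only ranking is wanted, the same pipeline can be rerun with n = 2; a minimal sketch (not run here):
# Sketch: TF-IDF-weighted bigrams only, reusing the tokens object from above
bigrams <- tokens_ngrams(token, n = 2, concatenator = " ")
dfm_bigrams <- dfm_tfidf(dfm(bigrams))
topfeatures(dfm_bigrams, 20)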

Structural Topic Modeling

  • I created the STM with topic prevalence modeled on whether the tweet was a retweet and on engagement (i.e., retweet count). I also removed duplicate tweets for this analysis. A sketch of how the covariate effects could be examined follows the model below.
for_STM <- wealth_tweets_clean %>% 
  dplyr::rename(documents = text) %>% ## change name of text column to documents
  dplyr::distinct(documents, .keep_all = TRUE)

processed <- textProcessor(for_STM$documents, metadata = for_STM)
## Building corpus... 
## Converting to Lower Case... 
## Removing punctuation... 
## Removing stopwords... 
## Removing numbers... 
## Stemming... 
## Creating Output...
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
## Removing 20407 of 30919 terms (20407 of 244300 tokens) due to frequency 
## Removing 12 Documents with No Words 
## Your corpus now has 14663 documents, 10512 terms and 223893 tokens.
docs <- out$documents
vocab <- out$vocab
meta <-out$meta
First_STM <- stm(documents = out$documents, vocab = out$vocab,
              K = 20, prevalence =~ is_retweet + s(retweet_count),
              max.em.its = 75, data = out$meta,
              init.type = "Spectral", verbose = FALSE)
plot(First_STM, n = 20)
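
  • A sketch of how the prevalence covariates could be examined afterward with stm's estimateEffect(), using the same formula as the model above (not run here):
# Sketch: regress topic prevalence on the covariates used in First_STM
prep <- estimateEffect(1:20 ~ is_retweet + s(retweet_count),
                       First_STM, metadata = out$meta, uncertainty = "Global")
summary(prep, topics = 1) # covariate effects for topic 1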