wealth_tweets_tidy <- wealth_tweets_clean %>%
  select(created_at, is_retweet, screen_name, text) %>%
  unnest_tokens("word", text) %>%
  anti_join(stop_words, by = "word") # join key made explicit, which also silences the 'Joining, by = "word"' message
wealth_tweets_tidy <- wealth_tweets_tidy[!grepl("\\b\\d+\\b", wealth_tweets_tidy$word), ] # drop purely numeric tokens (grepl() avoids the empty-match pitfall of -grep())
wealth_tweets_tidy$word <- gsub("\\s+", "", wealth_tweets_tidy$word) # strip stray whitespace
wealth_tweets_tidy$word <- gsub("’", "", wealth_tweets_tidy$word) # strip curly apostrophes
wealth_tweets_tidy <- wealth_tweets_tidy %>%
  mutate(word = wordStem(word, language = "en")) # SnowballC stemmer; replaces the mutate_at()/funs() idiom deprecated since dplyr 0.8.0
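As a quick sanity check on the stemmer (my addition, not part of the original analysis), SnowballC::wordStem() can be called directly on a few sample words:

library(SnowballC)
wordStem(c("wealthy", "families", "investing"), language = "en") # expect stems like "wealthi", "famili", "invest"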
top_words <-
  wealth_tweets_tidy %>%
  filter(!word %in% c("https", "rt", "t.co", "amp")) %>% # drop URL and retweet artifacts
  count(word) %>%
  arrange(desc(n))

top_words %>%
slice(1:20) %>%
ggplot(aes(x=reorder(word, -n), y=n, fill=word))+
geom_bar(stat="identity")+
theme_minimal()+
theme(axis.text.x =
element_text(angle = 60, hjust = 1, size=13))+
theme(plot.title =
element_text(hjust = 0.5, size=18))+
ylab("Frequency")+
xlab("")+
ggtitle("Most Frequent Words in Wealth Tweets")+
guides(fill = "none") # guides(fill = FALSE) is deprecated in current ggplot2

wealth_tweets_tfidf <- wealth_tweets_tidy %>%
  filter(!word %in% c("https", "rt", "t.co", "amp")) %>%
  count(word, created_at) %>%
  bind_tf_idf(word, created_at, n)
top_words_tfidf <-
  wealth_tweets_tfidf %>%
  arrange(desc(tf_idf))
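Note that created_at serves as the document id in bind_tf_idf(), so tweets posted at different timestamps count as separate documents. If coarser documents are preferable, a sketch that groups by day instead (calling as.Date() on the rtweet timestamp is my assumption):

wealth_tweets_tfidf_daily <- wealth_tweets_tidy %>%
  filter(!word %in% c("https", "rt", "t.co", "amp")) %>%
  mutate(day = as.Date(created_at)) %>%
  count(word, day) %>%
  bind_tf_idf(word, day, n)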
top_words_tfidf %>%
  slice(1:20) %>%
  ggplot(aes(x = reorder(word, -tf_idf), y = tf_idf, fill = word)) + # plot the tf-idf scores themselves, not raw counts
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(axis.text.x =
          element_text(angle = 60, hjust = 1, size = 13)) +
  theme(plot.title =
          element_text(hjust = 0.5, size = 18)) +
  ylab("TF-IDF") +
  xlab("") +
  ggtitle("Most Important Words in Wealth Tweets") +
  guides(fill = "none")

# Corpus Creation and DTM
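The tm pipeline below assumes wealth_corpus already exists. If it was not built earlier in the script, a minimal sketch from the cleaned tweets (Corpus() on a VectorSource yields the SimpleCorpus referenced in the note below) would be:

library(tm)
wealth_corpus <- Corpus(VectorSource(wealth_tweets_clean$text))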
wealth_corpus_clean <- wealth_corpus %>%
  tm_map(content_transformer(tolower)) %>% # lowercase first so capitalized stopwords ("The", "And") also match
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeNumbers)) %>%
  tm_map(content_transformer(stripWhitespace)) %>%
  tm_map(content_transformer(stemDocument), language = "en")
## Each tm_map() call on a SimpleCorpus raises a "transformation drops documents"
## warning; for SimpleCorpus objects this is a known false alarm and can be ignored.
wealth_DTM <- DocumentTermMatrix(wealth_corpus_clean, control = list(wordLengths = c(2, Inf)))
rowTotals <- apply(wealth_DTM, 1, sum) # total words in each document
wealth_DTM <- wealth_DTM[rowTotals > 0, ] # drop empty documents; LDA requires every row to contain at least one term
##wealth_DTM_weighted <- weightTfIdf(wealth_DTM)
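Before fitting the topic model, two quick sanity checks on the DTM can help (my addition; both are standard tm helpers):

dim(wealth_DTM) # number of documents and terms
findFreqTerms(wealth_DTM, lowfreq = 100) # terms that appear at least 100 times across the corpus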
topic_model <- LDA(wealth_DTM, k = 10, control = list(seed = 321))
topics <- tidy(topic_model, matrix = "beta")
topics_terms <-
topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)

terms(topic_model, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
## [1,] "wealth" "wealth" "wealth" "wealth" "wealth" "wealth" "will"
## [2,] "the" "get" "the" "the" "peopl" "the" "us"
## [3,] "will" "peopl" "can" "get" "tax" "peopl" "wealth"
## [4,] "one" "just" "money" "can" "will" "generat" "famili"
## [5,] "billion" "the" "time" "you" "money" "get" "you"
## [6,] "work" "need" "it" "rich" "need" "can" "want"
## [7,] "invest" "year" "us" "black" "the" "rich" "care"
## [8,] "build" "work" "peopl" "make" "like" "invest" "peopl"
## [9,] "make" "manag" "famili" "also" "just" "money" "work"
## [10,] "way" "health" "just" "that" "generat" "famili" "rich"
## Topic 8 Topic 9 Topic 10
## [1,] "tax" "wealth" "wealth"
## [2,] "peopl" "us" "peopl"
## [3,] "can" "name" "will"
## [4,] "rich" "dean" "generat"
## [5,] "we" "scuff" "it"
## [6,] "money" "will" "work"
## [7,] "one" "give" "get"
## [8,] "like" "and" "time"
## [9,] "wealth" "in" "us"
## [10,] "say" "pray" "tax"
topics_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()

Next, I created a dictionary using the words we discussed and searched for them in the tweets collected with rtweet.

dictionary <- c("wealth gap", "black wealth", "poverty", "global wealth", "economic mobility")
dic_tweets <- wealth_tweets_clean[str_detect(wealth_tweets_clean$text, paste(dictionary, collapse = "|")), ]
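Before inspecting the matches, a per-term tally shows how often each dictionary entry fires (my addition):

sapply(dictionary, function(term) sum(str_detect(wealth_tweets_clean$text, term)))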
head(dic_tweets)

I then used quanteda to dig into some n-gram analysis (weighted by TF-IDF).

library(quanteda)
wealth_tweets_fc <- wealth_tweets_clean %>%
rename(docid = status_id) ## renaming status_id to docid to create a corpus with the whole data frame and retain metadata.
corpus <- quanteda::corpus(wealth_tweets_fc) ## create corpus
token <- tokens_remove(
  tokens(corpus, remove_punct = TRUE, remove_symbols = TRUE, remove_url = TRUE,
         remove_separators = TRUE, remove_numbers = TRUE, include_docvars = TRUE),
  stopwords("english")) ## create tokens, removing punctuation, symbols, URLs, separators, and numbers, then drop English stopwords

n_grams <- tokens_ngrams(token, n = 1:2, concatenator = " ") ## unigram and bigram tokens
dfm <- dfm(n_grams) ## document-feature matrix
dfm_weighted <- dfm_tfidf(dfm)
Grams_imm <- topfeatures(dfm_weighted, 20) #top 20 unigrams and bigrams (weighted)
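topfeatures() returns a named numeric vector; the next line converts it to a data frame with data.table::setDT (which requires library(data.table)). If data.table is not loaded, an equivalent tibble sketch would be:

## top_20_ngrams <- tibble::enframe(Grams_imm, name = "rn", value = "Grams_imm")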
top_20_ngrams <- setDT(as.data.frame(Grams_imm), keep.rownames = TRUE)

top_20_ngrams %>%
  ggplot(aes(x = reorder(rn, -Grams_imm), y = Grams_imm)) +
  geom_col(width = .8) +
  coord_flip() +
  xlab("n-gram") +
  ylab("TF-IDF") +
  theme_bw() + # apply the complete theme first so it does not override the text settings below
  theme(text = element_text(size = 12, family = "Times New Roman", colour = "black"))

for_STM <- wealth_tweets_clean %>%
dplyr::rename(documents = text) %>% ## change name of text column to documents
dplyr::distinct(documents, .keep_all = TRUE)
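textProcessor applies the standard lowercasing, punctuation, stopword, number, and stemming steps reported in its output below. Twitter-specific noise could optionally be dropped at this stage too via its customstopwords argument; a hedged variant (the stopword list is my choice):

## processed <- textProcessor(for_STM$documents, metadata = for_STM,
##                            customstopwords = c("rt", "amp", "https", "t.co"))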
processed <- textProcessor(for_STM$documents, metadata = for_STM)
## Building corpus...
## Converting to Lower Case...
## Removing punctuation...
## Removing stopwords...
## Removing numbers...
## Stemming...
## Creating Output...
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
## Removing 20407 of 30919 terms (20407 of 244300 tokens) due to frequency
## Removing 12 Documents with No Words
## Your corpus now has 14663 documents, 10512 terms and 223893 tokens.
First_STM <- stm(documents = out$documents, vocab = out$vocab,
K = 20, prevalence =~ is_retweet + s(retweet_count),
max.em.its = 75, data = out$meta,
init.type = "Spectral", verbose = FALSE)