Loading libraries
library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(tidyr)
library(tidytext)
library(quanteda.textplots)
##
## Attaching package: 'quanteda.textplots'
## The following objects are masked from 'package:quanteda':
##
## as.igraph, as.network, textplot_keyness, textplot_network,
## textplot_wordcloud, textplot_xray
library(stringr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
##
## meta, meta<-
##
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
##
## as.DocumentTermMatrix, stopwords
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
library(skimr)
library(tokenizers)
library(topicmodels)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(textdata)
Creating Corpus and DFM
text <- corpus(c(tweets$text, tweets$retweet_text))
## Warning: NA is replaced by empty string
text <- dfm(tokens(text, remove_punct=TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
tokens_select(pattern=stopwords("en"),
selection="remove"))
text <- dfm_trim(text, min_docfreq = 25)
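# Before hand-picking terms to drop, it can help to eyeball the most frequent
# features left in the dfm (a quick check that was not in the original
# write-up); foreign stop words and ticker symbols tend to surface near the
# top of this list.
topfeatures(text, 40)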
# Back-up cleaning of the dfm to make sure everything is removed; there are still a lot of foreign stop words left, mostly French and German.
text <- dfm_remove(text, c(
  # French and German stop words that survived the English stop word filter
  "der", "für", "die", "du", "et", "den", "im", "en", "un", "ne", "est",
  "des", "und", "à", "il", "une", "sur", "pas", "se", "qui", "que", "la",
  "de", "l", "au", "le", "pour", "les", "á", "milliards", "mais", "plus",
  "guerre",
  # stock tickers and other useless strings
  "amzn", "twtr", "dwac", "baba", "aapl", "ba", "tsla", "fb", "amd",
  "nvda", "nflx", "msft", "qqq", "s", "bn", "will"
))
# After a great deal of struggle I was able to remove each of the foreign stop words and useless character strings through dfm_remove. I included the word "will" because it seemed to confuse the topic modeling in the analysis that follows.
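# A more systematic alternative (a sketch, not part of the original cleaning):
# drop the full French and German stop word lists bundled with quanteda. The
# quanteda:: prefix avoids the masking by tm::stopwords noted when the
# packages were loaded; text_fr_de is a hypothetical name for the result.
text_fr_de <- dfm_remove(text, c(quanteda::stopwords("fr"),
                                 quanteda::stopwords("de")))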
dtm <- convert(text, to = "topicmodels")
m <- LDA(dtm, method = "Gibbs", k = 10, control = list(alpha = 0.1))
terms(m, 5)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
## [1,] "war" "spy" "people" "billion" "ukraine" "sending" "new"
## [2,] "us" "world" "get" "money" "war" "money" "covid"
## [3,] "end" "now" "world" "wants" "russia" "dollars" "check"
## [4,] "ukraine" "news" "right" "support" "putin" "dont" "dans"
## [5,] "putin" "twitter" "make" "us" "nuclear" "joe" "vs"
## Topic 8 Topic 9 Topic 10
## [1,] "biden" "war" "billion"
## [2,] "us" "us" "aid"
## [3,] "president" "russian" "congress"
## [4,] "joe" "ukraine" "military"
## [5,] "ukraine" "ukrainian" "ukraine"
# Now that all of the useless words have been removed, the topic models make much more sense. There is a clear urgency for aid, specifically money and military support. I find topic #1 especially striking, as it expresses a strong desire for the war to end.
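# A quick sanity check that is not in the original analysis:
# topicmodels::posterior() returns the per-document topic distributions,
# and averaging them shows how the corpus splits across the ten topics.
topic_props <- posterior(m)$topics   # documents x topics matrix
round(sort(colMeans(topic_props), decreasing = TRUE), 3)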
wordcloud(tweets$text, scale = c(4, 0.5), max.words = 75,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

# This wordcloud is a good example of how my data was skewed by foreign stop words. For sentiment analysis, the word "die" is especially misleading, since it is also the German word for "the".
textplot_wordcloud(dfm_trim(text, min_termfreq = 10, verbose = FALSE),
                   color = c('red', 'green', 'purple', 'orange', 'blue'))

# This is how the wordcloud is meant to look. As expected, Ukraine, war, and Biden are among the most prominent words, though I'm surprised billions is as prominent as it is. After doing some research, I found these tweets were collected just after Biden had asked Congress for $33 billion. That was the biggest step the United States had made in support of Ukraine, and the figure would later be raised to $40 billion. These actions came about a month after the administration offered $800 million, which many Ukrainians and Americans thought wasn't enough given how much the US has given to allies in the past.
Plotting co-occurrence network
# build a feature co-occurrence matrix from the cleaned dfm, then pull the top features
text_fcm <- fcm(text)
myFeatures <- names(topfeatures(text_fcm, 70))
# retain only those top features as part of our matrix
even_text_fcm <- fcm_select(text_fcm, pattern = myFeatures, selection = "keep")
# check dimensions
dim(even_text_fcm)
## [1] 70 70
# compute size weight for vertices in network
size <- log(colSums(even_text_fcm))
# create plot
textplot_network(even_text_fcm, vertex_size = size / max(size) * 3)

# This co-occurrence network gives us some valuable insight into the public discourse around Biden's response. It presents a clear call for additional aid and an overall sense of struggle. It validates some of the prior analysis in that Americans are looking to Biden to take more steps to protect Ukraine.
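# The fcm above counts co-occurrences within whole tweets. A sketch of a
# tighter, window-based alternative (assuming the tokens are rebuilt the same
# way as for the dfm; toks and window_fcm are hypothetical names): two words
# only co-occur if they appear within five tokens of each other.
toks <- tokens(corpus(c(tweets$text, tweets$retweet_text)),
               remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(quanteda::stopwords("en"))
window_fcm <- fcm(toks, context = "window", window = 5)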
Plotting topic model
m_topics <- tidy(m, matrix = "beta")
m_top_terms <- m_topics %>%
  group_by(topic) %>%
  top_n(8, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
m_top_terms %>%
  mutate(topic = paste("Topic #", topic)) %>%
  # reorder_within() keeps bars sorted within each facet even when a
  # term appears in several topics
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_x_reordered() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 18)) +
  labs(title = "Topic Model of #Ukraine & #Biden Tweets",
       caption = "Top Terms by Topic") +
  ylab("") +
  xlab("") +
  coord_flip()

# Given the limitations of the data, some terms repeat across multiple topics, like war and Ukraine; still, the LDA model presents clear themes. Topics #2 and #6 specifically reaffirm the urgency for aid, including military aid, seen in the previous analysis. Other topics, such as #3, #8 and #9, express the desire for an end to the war.
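# The beta matrix plotted above gives term weights per topic; the companion
# gamma matrix gives topic proportions per document. A sketch (not part of
# the original analysis) of summarising which topics dominate the corpus,
# using the same tidy() interface:
m_gamma <- tidy(m, matrix = "gamma")
m_gamma %>%
  group_by(topic) %>%
  summarise(mean_gamma = mean(gamma)) %>%
  arrange(desc(mean_gamma))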