Loading plotting and text analysis libraries

library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(tidyr)
library(tidytext)
library(quanteda.textplots)
## 
## Attaching package: 'quanteda.textplots'
## The following objects are masked from 'package:quanteda':
## 
##     as.igraph, as.network, textplot_keyness, textplot_network,
##     textplot_wordcloud, textplot_xray
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
## 
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
## 
##     as.DocumentTermMatrix, stopwords
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
library(skimr)
library(tokenizers)
library(topicmodels)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(textdata)

Reading in tweets

tweets5 <- read.csv("~/Desktop/tweets5.csv")

tweets4 <- read.csv("~/Desktop/tweets4.csv")

Combining two collections of tweets

tweets <- rbind(tweets5, tweets4)
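
rbind() works here because both exports share the same columns; if the two collections ever differed, dplyr::bind_rows() would be a more forgiving drop-in, sketched below.

# Sketch: bind_rows() matches columns by name and fills any column
# missing from one export with NA, so mismatched exports still combine.
tweets <- bind_rows(tweets5, tweets4)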

Cleaning tweets

clean <- function(text) {
  # Remove URLs
  str_remove_all(text, " ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)") %>%
    # Remove mentions
    str_remove_all("@[[:alnum:]_]*") %>%
    # Remove hash tags
    str_remove_all("#[[:alnum:]_]+") %>%
    # Replace the "&amp;" character reference with "and"
    str_replace_all("&amp;", "and") %>%
    # Remove punctuation, using a standard character class
    str_remove_all("[[:punct:]]") %>%
    # Remove digits
    str_remove_all("[[:digit:]]") %>%
    # Remove "RT: " from the beginning of retweets
    str_remove_all("^RT:? ") %>%
    # Replace any newline characters with a space
    str_replace_all("\\\n|\\\r", " ") %>%
    # Remove strings like "<U+0001F9F5>"
    str_remove_all("<.*?>") %>%
    # Remove emojis
    str_remove_all("[[:emoji:]]") %>%
    # Make everything lowercase
    str_to_lower() %>%
    # Trim leading/trailing whitespace and collapse repeated spaces
    str_squish()
}
tweets$text <- clean(tweets$text)
tweets$retweet_text <- clean(tweets$retweet_text)

# Removing duplicated tweets
tweets <- tweets[!duplicated(tweets$text),]

# This initial pre-processing of the text successfully removed most of the undesired material; however, I've noticed there are a fair number of tweets written in other languages, which I will exclude from my analysis.
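
One way to identify the non-English tweets, sketched below, would be automatic language detection; this assumes the cld3 package is installed and stores the result in a separate tweets_en object (an illustrative name) rather than overwriting tweets, since the foreign-language terms are also handled further down with dfm_remove().

# Sketch: keep only tweets whose detected language is English.
# cld3::detect_language() returns ISO 639 codes such as "en",
# or NA when the language cannot be identified.
tweets_en <- tweets[which(cld3::detect_language(tweets$text) == "en"), ]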

Creating Corpus and DFM

text <- corpus(c(tweets$text, tweets$retweet_text))
## Warning: NA is replaced by empty string
text <- dfm(tokens(text, remove_punct=TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
             tokens_select(pattern=stopwords("en"),
                            selection="remove"))

text <- dfm_trim(text, min_docfreq = 25)

# Back-up cleaning of the dfm to make sure everything is removed; however, there are still a lot of foreign stop words, mostly French and German it seems
text <- dfm_remove(text, c(
  # French and German stop words and other foreign-language tokens
  "der", "für", "die", "une", "sur", "pas", "se", "qui", "que", "la", "de",
  "l", "au", "le", "pour", "les", "á", "à", "milliards", "mais", "plus",
  "guerre", "du", "et", "den", "im", "en", "un", "ne", "est", "des", "und",
  "il",
  # Stock tickers and other leftover noise
  "amzn", "twtr", "dwac", "baba", "aapl", "ba", "tsla", "fb", "amd", "nvda",
  "nflx", "msft", "qqq", "s", "bn",
  # "will" confuses the topic model below
  "will"))

# After a great deal of struggle I was able to find a way to remove each of the foreign stop words and useless character strings through dfm_remove. I included the word "will" because it seemed to confuse the topic modeling in the analysis that follows.
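
For reference, quanteda's stopwords() also ships German and French lists, so much of the manual list above could be replaced with the sketch below (text_nostop is just an illustrative name; the ticker symbols and "will" would still need to be listed by hand).

# Sketch: drop the built-in German and French stop word lists in one
# call instead of enumerating each word manually.
text_nostop <- dfm_remove(text, c(stopwords("de"), stopwords("fr")))
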
dtm <- convert(text, to = "topicmodels")

m <- LDA(dtm, method = "Gibbs", k = 10, control = list(alpha = 0.1))
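
Gibbs sampling is stochastic, so the topics can shift between runs. A minimal sketch of a reproducible fit (m_seeded is just an illustrative name and the seed value is arbitrary):

# Sketch: pass a seed through the Gibbs control list so the same topics
# are recovered on every knit.
m_seeded <- LDA(dtm, method = "Gibbs", k = 10,
                control = list(alpha = 0.1, seed = 1234))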

terms(m, 5)
##      Topic 1   Topic 2   Topic 3  Topic 4   Topic 5   Topic 6   Topic 7
## [1,] "war"     "spy"     "people" "billion" "ukraine" "sending" "new"  
## [2,] "us"      "world"   "get"    "money"   "war"     "money"   "covid"
## [3,] "end"     "now"     "world"  "wants"   "russia"  "dollars" "check"
## [4,] "ukraine" "news"    "right"  "support" "putin"   "dont"    "dans" 
## [5,] "putin"   "twitter" "make"   "us"      "nuclear" "joe"     "vs"   
##      Topic 8     Topic 9     Topic 10  
## [1,] "biden"     "war"       "billion" 
## [2,] "us"        "us"        "aid"     
## [3,] "president" "russian"   "congress"
## [4,] "joe"       "ukraine"   "military"
## [5,] "ukraine"   "ukrainian" "ukraine"
# Now that all of the useless words have been removed, the topic models are making much more sense. There is a clear urgency for aid, specifically money and military support. I find topic #1 especially striking, as it expresses a strong desire for the war to end.
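
To see which tweets actually load on a given topic (topic #1, for example), the per-document topic proportions can be pulled out with tidytext as well; a sketch, with m_docs as an illustrative name:

# Sketch: per-document topic proportions ("gamma"); sorting topic 1 by
# gamma surfaces the documents most dominated by the "end the war" theme.
m_docs <- tidy(m, matrix = "gamma")
m_docs %>%
  filter(topic == 1) %>%
  arrange(desc(gamma)) %>%
  head(10)
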
wordcloud(tweets$text, scale=c(4,0.5), max.words=75, 
           random.order=FALSE, rot.per=0.35,colors=brewer.pal(8,"Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

# This wordcloud is a good example of how my data was skewed by foreign stop words. In terms of sentiment analysis, the word "die" is especially concerning, since it also means "the" in German.
textplot_wordcloud(dfm_trim(text, min_termfreq = 10, verbose = FALSE),
                   colors = c('red', 'green', 'purple', 'orange', 'blue'))

# This is how the wordcloud is meant to look. As can be expected, Ukraine, war, and Biden are some of the most prominent words, though I'm surprised "billions" is as prominent as it is. After doing some research, these tweets were collected just after Biden had asked Congress for $33 billion. This would be the biggest step the United States had made in support of Ukraine, and it would later even be raised to $40 billion. These actions came about a month after the administration offered $800 million, which many Ukrainians and Americans thought wasn't enough given how much the US has given to allies in the past.

It seems the DFM still contains words that don't pertain to my research, so I will trim it down to only the most frequent terms before building the co-occurrence matrix

text_dfm <- dfm_trim(text, min_termfreq = .3, docfreq_type = "prop")
## Warning in dfm_trim.dfm(text, min_termfreq = 0.3, docfreq_type = "prop"): use
## termfreq_type = 'prop' for fractional term frequency
# create fcm from dfm
text_fcm <- fcm(text_dfm)

# check the dimensions (i.e., the number of rows and the number of columns)
# of the matrix we created
dim(text_fcm)
## [1] 110 110
head(text_fcm, 10)
## Feature co-occurrence matrix of: 10 by 6 features.
##          features
## features  now news asking live white house
##   now       2    8      1    1     1     2
##   news      0    0      0    5     3     3
##   asking    0    0      1    0     1     1
##   live      0    0      0    1     2     2
##   white     0    0      0    0     1    18
##   house     0    0      0    0     0     0
##   covid     0    0      0    0     0     0
##   ukraine   0    0      0    0     0     0
##   country   0    0      0    0     0     0
##   america   0    0      0    0     0     0

Plotting co-occurrence network

# pull the top features
myFeatures <- names(topfeatures(text_fcm, 70))

# retain only those top features as part of our matrix
even_text_fcm <- fcm_select(text_fcm, pattern = myFeatures, selection = "keep")

# check dimensions
dim(even_text_fcm)
## [1] 70 70
# compute size weight for vertices in network
size <- log(colSums(even_text_fcm))

# create plot
textplot_network(even_text_fcm, vertex_size = size / max(size) * 3)

# This co-occurrence network gives us some valuable insight into the public discourse around Biden's response. It presents a clear need for additional aid and an overall sense of struggle. It validates some of the prior analysis in that Americans are looking to Biden to take more steps to protect Ukraine.

Plotting topic model

m_topics <- tidy(m, matrix = "beta")

m_top_terms <-
  m_topics %>%
  group_by(topic) %>%
  top_n(8, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
library(ggplot2)

m_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  mutate(topic = paste("Topic #", topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) + 
  facet_wrap(~ topic, scales = "free") +
  theme_minimal() +
  theme(plot.title = 
          element_text(hjust = 0.5, size = 18)) +
  labs(
    title = "Topic Model of #Ukraine & #Biden Tweets",
    caption = "Top Terms by Topic"
  ) + 
  ylab("") +
  xlab("") +
  coord_flip()
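
One limitation of reorder() here is that it sorts terms globally, so the bars within each facet are not guaranteed to be in order; tidytext's reorder_within() and scale_x_reordered() handle per-facet ordering, a minimal sketch:

# Sketch: order terms within each topic's facet rather than globally.
m_top_terms %>%
  mutate(topic = paste("Topic #", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_x_reordered() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Topic Model of #Ukraine & #Biden Tweets",
       caption = "Top Terms by Topic", x = NULL, y = NULL)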

# Given the limitations of the data, some terms are repeated across multiple topics, like "war" and "Ukraine"; however, there are still clear themes presented in the LDA model. Topics #2 and #6 specifically reaffirm the urgency for aid, including military aid, seen in the previous analysis. There are also topics that express the desire for an end to the war, such as topics #3, #8 and #9.