library(tidyverse)       # core data wrangling and ggplot2 (also attaches lubridate in >= 2.0)
library(magrittr)        # %<>% assignment pipe
library(RedditExtractoR) # scrape Reddit thread URLs and comments
library(anytime)         # flexible date-time parsing
library(httr)            # HTTP requests
library(tidytext)        # tokenization and stop_words
library(igraph)          # graph objects
library(ggraph)          # graph plotting
library(wordcloud2)      # interactive word clouds
library(textdata)        # sentiment lexicons
library(here)            # project-relative paths
library(stringi)         # encoding checks (stri_enc_isascii)
library(ggdark)          # dark ggplot themes
library(sentimentr)      # dictionary-based sentiment scoring

Step 1. Describe in one sentence what you aim to examine using user-generated text data and sentiment analysis.

I aim to examine public sentiment toward Elon Musk in Reddit discussions during the 2024 U.S. presidential election.

Step 2. Search Reddit threads using a keyword of your choice

threads <- find_thread_urls(keywords = "elon musk",
                            sort_by = "relevance",
                            period = "all") %>% 
  drop_na()

save(threads, file = "/Users/seungjaelieu/GaTech Dropbox/Seung Jae Lieu/GT 2-1/3_CP8883 Intro to UA/major_3/elon_threads.RData")
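As a quick sanity check before saving, the structure of the returned data frame can be inspected; per the RedditExtractoR documentation, find_thread_urls() returns one row per thread with date_utc, timestamp, title, text, subreddit, comments, and url columns (exact fields may vary by package version):

# Inspect the scraped threads
glimpse(threads)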

Step 3. Clean text data and then tokenize it

load("/Users/seungjaelieu/GaTech Dropbox/Seung Jae Lieu/GT 2-1/3_CP8883 Intro to UA/major_3/elon_threads.RData")

# Tokenize the raw text first so we can measure how much the cleaning removes
words <- threads %>% 
  unnest_tokens(output = word, input = text, token = "words")

data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads %>% 
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "words") %>% 
  anti_join(stop_words, by = "word") %>%  # drop stop words
  filter(str_detect(word, "[a-z]"))       # keep only tokens containing a letter

print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 1783, After: 824
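As a check that the cleaning worked, the most frequent remaining tokens can be printed; a minimal sketch using the objects above:

# Ten most frequent tokens after cleaning
words_clean %>% 
  count(word, sort = TRUE) %>% 
  slice_head(n = 10)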

Step 4. Generate a word cloud

set.seed(42) # added assumption: fix the RNG so the palette is reproducible
n <- 20
h <- runif(n, 0, 1)     # any hue
s <- runif(n, 0.6, 1)   # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
# Colors are assigned in frequency order: the 20 most frequent words get the
# vivid colors, everything else falls back to grey
pal <- c(pal, rep("grey", 10000))

words_clean %>% 
  count(word, sort = TRUE) %>%
  # Drop the search keywords themselves; tokens are single words, so only
  # "elon" and "musk" (never the bigram "elon musk") can appear
  filter(!word %in% c("elon", "musk")) %>%
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)
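Note that wordcloud2() returns an htmlwidget, which may not render in a static export. One workaround, assuming the htmlwidgets package is available (it is a dependency of wordcloud2) and using an illustrative file name, is to save the widget as standalone HTML:

# Save the word cloud for embedding or manual screenshots
wc <- words_clean %>% 
  count(word, sort = TRUE) %>% 
  filter(!word %in% c("elon", "musk")) %>% 
  wordcloud2(color = pal)
htmlwidgets::saveWidget(wc, "elon_wordcloud.html", selfcontained = TRUE)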

Step 5. Conduct a tri-gram analysis

# Extract tri-grams from your text data
words_ngram <- threads %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)

words_ngram_pair <- words_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# Remove tri-grams containing stop words or non-alphabetic terms
words_ngram_pair_filtered <- words_ngram_pair %>%
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>% 
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]")  & str_detect(word3, "[a-z]"))

# Keep only pure-ASCII tri-grams (drops emoji and mangled encodings)
words_ngram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))

words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

# Present the frequency of tri-grams in a table
head(words_counts, 20) %>% 
  knitr::kable()
| word1      | word2        | word3        |  n|
|:-----------|:-------------|:-------------|--:|
| budget     | trump        | rally        |  2|
| elect      | donald       | trump        |  2|
| elon       | musk         | ukraine      |  2|
| federal    | budget       | trump        |  2|
| madison    | square       | garden       |  2|
| musk       | ukraine      | president    |  2|
| president  | elect        | donald       |  2|
| president  | kamala       | harris       |  2|
| president  | zelenskyy    | phone        |  2|
| trillion   | federal      | budget       |  2|
| trump      | elon         | musk         |  2|
| ukraine    | president    | zelenskyy    |  2|
| vice       | president    | kamala       |  2|
| zelenskyy  | phone        | call         |  2|
| 1098format | pjpgauto     | webps        |  1|
| 10k        | 15k          | max          |  1|
| 20federal  | 20government | 20spent      |  1|
| 20the      | 20federal    | 20government |  1|
| 2c         | 20the        | 20federal    |  1|
| 6am        | est          | tomorrow     |  1|
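The residue rows such as “20federal 20government 20spent” and “1098format pjpgauto webps” are percent-encoded URL fragments (e.g., “%20federal”, “format=pjpg&auto=webp”) that slipped past replace_reg, which only matches URLs built from letters, digits, slashes, and dots. A broader pattern, offered as a sketch rather than a verified drop-in replacement, would consume everything up to the next whitespace:

# Assumed broader cleaner: strips whole URLs, including query strings and
# percent-escapes, plus the same HTML entities as before
replace_reg2 <- "http[s]?://\\S+|www\\.\\S+|&amp;|&lt;|&gt;"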

Discuss any noteworthy tri-grams you come across:

“elon musk ukraine” and “musk ukraine president”: These tri-grams indicate a focus on Musk’s involvement or commentary regarding Ukraine and possibly geopolitical issues.

“zelenskyy phone call” and “ukraine president zelenskyy”: These further emphasize the connection between Musk and Ukraine.

“budget trump rally” and “federal budget trump”: Together with “trillion federal budget” and “madison square garden”, these likely refer to Musk’s pledge at Trump’s Madison Square Garden rally to cut trillions from the federal budget.

“president elect donald” and “trump elon musk”: These highlight associations between Elon Musk and Donald Trump, likely reflecting discussions about political alignment or involvement.

Step 6. Perform a sentiment analysis

# I used a pre-trained BERT (Bidirectional Encoder Representations from Transformers) sentiment model in Colab and exported the predictions as "elon_sample_bert.csv"

reddit_sentiment <- read_csv("/Users/seungjaelieu/GaTech Dropbox/Seung Jae Lieu/GT 2-1/3_CP8883 Intro to UA/major_3/elon_sample_bert.csv")
reddit_sentiment %<>% drop_na(bert_label)  # keep only rows the BERT model labeled

# Combine title and body into a single text field for sentiment scoring
reddit_sentiment %<>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))

# Dictionary-based sentiment (sentimentr): average sentiment per thread
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)
# Each BERT label starts with its star rating digit, so str_sub() yields a 1-5 score
reddit_sentiment %<>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric())
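Before sampling texts, a quick tabulation (a sketch using the columns created above) confirms that the extracted digits line up with the raw BERT labels:

# Cross-check the parsed numeric score against the original label
reddit_sentiment %>% count(bert_label, bert_label_numeric)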

Step 7. Display 10 sample texts

# Two highest-confidence examples per star label
bert_example <- reddit_sentiment %>%
  filter(bert_label_numeric %in% 1:5) %>%
  group_by(bert_label) %>%
  arrange(desc(bert_score), .by_group = TRUE) %>%
  slice_head(n = 2) %>%
  ungroup()

bert_example %>%
  select(title_text, bert_label_numeric, sentiment_dict) %>%
  print()
## # A tibble: 10 × 3
##    title_text                                  bert_label_numeric sentiment_dict
##    <chr>                                                    <dbl>          <dbl>
##  1 "Elon Musk\u0019s false and misleading ele…                  1       -0.413  
##  2 "Illegal immigrant Elon Musk skipping like…                  1       -0.530  
##  3 "Elon Musk's big X interview with Trump ki…                  2        0.0139 
##  4 "Elon Musk doesn\u0019t love his kids, and…                  2        0.401  
##  5 "Elon Musk may already be overstaying his …                  3        0.151  
##  6 "Elon Musk says people should worry less a…                  3       -0.188  
##  7 "A young Elon Musk and his brother Kimbal …                  4        0.184  
##  8 "Elon Musk reacts to Taylor Swift's recent…                  4        0      
##  9 "The absolute master of pissing off Elon M…                  5        0.05   
## 10 "Tim Walz just called Elon Musk a \"dipshi…                  5        0.00536
cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict)
## [1] 0.2357682
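Because the BERT labels are ordinal (1-5 stars), a rank-based correlation is arguably the more appropriate summary; as a sketch:

# Spearman's rank correlation treats the star labels as ordinal
cor(reddit_sentiment$bert_label_numeric,
    reddit_sentiment$sentiment_dict,
    method = "spearman")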
ggplot(data = reddit_sentiment, aes(x = bert_label_numeric, y = sentiment_dict)) +
  geom_jitter(width = 0.1, height = 0) +
  geom_hline(yintercept = 0, color = '#FFD700', lwd = 1, linetype = 'dashed') +
  dark_theme_grey()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

The correlation between the sentiment values from the two methods is 0.24, which implies a weak positive relationship. While this suggests some agreement, the scatterplot reveals inconsistencies, particularly for high bert_label_numeric values (4-5 stars), which often correspond to neutral or negative dictionary-based sentiment.

On qualitative review, “Illegal immigrant Elon Musk skipping like a dipshit.” received a bert_label_numeric of 1 (very negative), aligning with its dictionary score of -0.530; this agreement adds credibility. However, “Tim Walz just called Elon Musk a ‘dipshit’ - definite dad/coach vibes. Love it.” received a bert_label_numeric of 5 (very positive) despite its ambiguous tone, while its dictionary score of 0.00536 is essentially neutral. This points to a likely misclassification by the BERT model, which appears to key on the positive closing phrase (“Love it”) while missing the insult.

Step 8. Discuss intriguing insights

### First Plot: Words appearing in positive and negative threads
# Remove comment-count outliers within each label using the 1.5 * IQR fence
reddit_sentiment_rm_outlier <- reddit_sentiment %>%
  group_by(bert_label) %>%
  filter(
    between(
      comments,
      quantile(comments, 0.25) - 1.5 * IQR(comments),
      quantile(comments, 0.75) + 1.5 * IQR(comments))) %>%
  ungroup()
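# Quick check (sketch): how many threads the 1.5 * IQR fence removes
nrow(reddit_sentiment) - nrow(reddit_sentiment_rm_outlier)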

# Stop word removal and tokenization on the outlier-filtered threads
reddit_sentiment_clean <- reddit_sentiment_rm_outlier %>%
  mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
  unnest_tokens(word, title_text, token = 'words') %>%
  anti_join(stop_words, by = "word") %>% 
  filter(str_detect(word, "[a-z]")) %>%
  # tokens are single words, so only "elon" and "musk" can occur
  filter(!word %in% c('elon', 'musk'))

# Negative threads (1-2 stars)
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(1, 2))
# Positive threads (4-5 stars)
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(4, 5))

# Remove words that are commonly seen in both negative and positive threads
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
  anti_join(reddit_sentiment_clean_positive, by = 'word')
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
  anti_join(reddit_sentiment_clean_negative, by = 'word')
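# Sketch: preview the top distinctive words on each side before plotting
bind_rows(
  negative = count(reddit_sentiment_clean_negative_unique, word, sort = TRUE),
  positive = count(reddit_sentiment_clean_positive_unique, word, sort = TRUE),
  .id = "side"
) %>%
  group_by(side) %>%
  slice_head(n = 10)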

# Word clouds reuse the custom HSV palette (`pal`) built in Step 4 rather than
# regenerating a new random set of 20 vivid colors

# negative
reddit_sentiment_clean_negative_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = -pi/6,
       maxRotation = -pi/6,
       rotateRatio = 1)
# positive
reddit_sentiment_clean_positive_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)
### Second Plot: Sentiment by year using a stacked bar plot
reddit_sentiment %<>%
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date)) %>%
  mutate(year = year(date))

reddit_sentiment %>%
  ggplot(aes(x = year, fill = bert_label)) +
  geom_bar(position = 'stack') +  # years are discrete, so count bars per year
  scale_x_continuous(breaks = seq(min(reddit_sentiment$year),
                                  max(reddit_sentiment$year),
                                  by = 1)) +
  scale_fill_brewer(palette = 'PuRd', direction = -1) +
  dark_theme_grey()
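The counts behind the stacked bars can also be printed directly as a sanity check; a minimal sketch:

# Threads per year and sentiment label
reddit_sentiment %>% count(year, bert_label)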

Visualizing the BERT results, I found that words such as “voter,” “accused,” “fraud,” “illegal,” and “maga” dominate the negative threads. These words point to political controversies and allegations, likely tied to elections and debates involving Elon Musk. Other terms such as “biden,” “joe,” and “win” reflect a focus on U.S. political figures and election outcomes, suggesting that negative mentions of Musk are bound up with divisive political narratives.

In positive threads, words such as “males,” “suggests,” “efficiency,” and “questions” point to more constructive discussions, perhaps around innovation, leadership, or achievements. Names like “Cuban” and “Vivek” indicate comparisons or collaborations with other prominent figures in contexts where Musk is framed positively; for example, one widely shared headline read “Elon Musk and Vivek Ramaswamy will lead new ‘Department of Government Efficiency’ in Trump administration.”

Lastly, the sentiment-by-year plot shows a pronounced spike in mentions in 2024, aligning with the U.S. presidential election year and confirming that Elon Musk became a focal point of political and public discourse during this period.