library(tidyverse)       # core data wrangling and ggplot2 (also attaches lubridate in >= 2.0)
library(magrittr)        # %<>% assignment pipe
library(RedditExtractoR) # scrape Reddit thread URLs and comments
library(anytime)         # flexible date-time parsing
library(httr)            # HTTP requests
library(tidytext)        # tokenization and stop_words
library(igraph)          # graph objects
library(ggraph)          # graph plotting
library(wordcloud2)      # interactive word clouds
library(textdata)        # sentiment lexicons
library(here)            # project-relative paths
library(stringi)         # encoding checks (stri_enc_isascii)
library(ggdark)          # dark ggplot themes
library(sentimentr)      # dictionary-based sentiment scoring

Step 1. Describe in one sentence what you aim to examine using user-generated text data and sentiment analysis.

I aim to examine public sentiment toward Elon Musk in Reddit discussions during the 2024 U.S. presidential election.

Step 2. Search Reddit threads using a keyword of your choice

threads <- find_thread_urls(keywords = "elon musk",
                            sort_by = "relevance",
                            period = "all") %>% 
  drop_na()

save(threads, file = "/Users/seungjaelieu/GaTech Dropbox/Seung Jae Lieu/GT 2-1/3_CP8883 Intro to UA/major_3/elon_threads.RData")
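As a quick sanity check before saving, the structure of the returned data frame can be inspected; per the RedditExtractoR documentation, find_thread_urls() returns one row per thread with date_utc, timestamp, title, text, subreddit, comments, and url columns (exact fields may vary by package version):

# Inspect the scraped threads
glimpse(threads)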

Step 3. Clean text data and then tokenize it

load("/Users/seungjaelieu/GaTech Dropbox/Seung Jae Lieu/GT 2-1/3_CP8883 Intro to UA/major_3/elon_threads.RData")

# Tokenize the raw text first so we can measure how much the cleaning removes
words <- threads %>% 
  unnest_tokens(output = word, input = text, token = "words")

data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads %>% 
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "words") %>% 
  anti_join(stop_words, by = "word") %>%  # drop stop words
  filter(str_detect(word, "[a-z]"))       # keep only tokens containing a letter

print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 1783, After: 824
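As a check that the cleaning worked, the most frequent remaining tokens can be printed; a minimal sketch using the objects above:

# Ten most frequent tokens after cleaning
words_clean %>% 
  count(word, sort = TRUE) %>% 
  slice_head(n = 10)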

Step 4. Generate a word cloud

set.seed(42) # added assumption: fix the RNG so the palette is reproducible
n <- 20
h <- runif(n, 0, 1)     # any hue
s <- runif(n, 0.6, 1)   # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
# Colors are assigned in frequency order: the 20 most frequent words get the
# vivid colors, everything else falls back to grey
pal <- c(pal, rep("grey", 10000))

words_clean %>% 
  count(word, sort = TRUE) %>%
  # Drop the search keywords themselves; tokens are single words, so only
  # "elon" and "musk" (never the bigram "elon musk") can appear
  filter(!word %in% c("elon", "musk")) %>%
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)
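Note that wordcloud2() returns an htmlwidget, which may not render in a static export. One workaround, assuming the htmlwidgets package is available (it is a dependency of wordcloud2) and using an illustrative file name, is to save the widget as standalone HTML:

# Save the word cloud for embedding or manual screenshots
wc <- words_clean %>% 
  count(word, sort = TRUE) %>% 
  filter(!word %in% c("elon", "musk")) %>% 
  wordcloud2(color = pal)
htmlwidgets::saveWidget(wc, "elon_wordcloud.html", selfcontained = TRUE)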

Step 5. Conduct a tri-gram analysis

# Extract tri-grams from your text data
words_ngram <- threads %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)

words_ngram_pair <- words_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# Remove tri-grams containing stop words or non-alphabetic terms
words_ngram_pair_filtered <- words_ngram_pair %>%
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>% 
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]")  & str_detect(word3, "[a-z]"))

# Keep only pure-ASCII tri-grams (drops emoji and mangled encodings)
words_ngram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))

words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

# Present the frequency of tri-grams in a table
head(words_counts, 20) %>% 
  knitr::kable()
| word1      | word2        | word3        |  n|
|:-----------|:-------------|:-------------|--:|
| budget     | trump        | rally        |  2|
| elect      | donald       | trump        |  2|
| elon       | musk         | ukraine      |  2|
| federal    | budget       | trump        |  2|
| madison    | square       | garden       |  2|
| musk       | ukraine      | president    |  2|
| president  | elect        | donald       |  2|
| president  | kamala       | harris       |  2|
| president  | zelenskyy    | phone        |  2|
| trillion   | federal      | budget       |  2|
| trump      | elon         | musk         |  2|
| ukraine    | president    | zelenskyy    |  2|
| vice       | president    | kamala       |  2|
| zelenskyy  | phone        | call         |  2|
| 1098format | pjpgauto     | webps        |  1|
| 10k        | 15k          | max          |  1|
| 20federal  | 20government | 20spent      |  1|
| 20the      | 20federal    | 20government |  1|
| 2c         | 20the        | 20federal    |  1|
| 6am        | est          | tomorrow     |  1|
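The residue rows such as “20federal 20government 20spent” and “1098format pjpgauto webps” are percent-encoded URL fragments (e.g., “%20federal”, “format=pjpg&auto=webp”) that slipped past replace_reg, which only matches URLs built from letters, digits, slashes, and dots. A broader pattern, offered as a sketch rather than a verified drop-in replacement, would consume everything up to the next whitespace:

# Assumed broader cleaner: strips whole URLs, including query strings and
# percent-escapes, plus the same HTML entities as before
replace_reg2 <- "http[s]?://\\S+|www\\.\\S+|&amp;|&lt;|&gt;"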

Discuss any noteworthy tri-grams you come across:

“elon musk ukraine” and “musk ukraine president”: These tri-grams indicate a focus on Musk’s involvement or commentary regarding Ukraine and possibly geopolitical issues.

“zelenskyy phone call” and “ukraine president zelenskyy”: These further emphasize the connection between Musk and Ukraine.

“budget trump rally” and “federal budget trump”: Together with “trillion federal budget” and “madison square garden”, these likely refer to Musk’s pledge at Trump’s Madison Square Garden rally to cut trillions from the federal budget.

“president elect donald” and “trump elon musk”: These highlight associations between Elon Musk and Donald Trump, likely reflecting discussions about political alignment or involvement.

Step 6. Perform a sentiment analysis

# I used a pre-trained BERT (Bidirectional Encoder Representations from Transformers) sentiment model in Colab and exported the predictions as "elon_sample_bert.csv"

reddit_sentiment <- read_csv("/Users/seungjaelieu/GaTech Dropbox/Seung Jae Lieu/GT 2-1/3_CP8883 Intro to UA/major_3/elon_sample_bert.csv")
reddit_sentiment %<>% drop_na(bert_label)  # keep only rows the BERT model labeled

# Combine title and body into a single text field for sentiment scoring
reddit_sentiment %<>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))

# Dictionary-based sentiment (sentimentr): average sentiment per thread
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)
# Each BERT label starts with its star rating digit, so str_sub() yields a 1-5 score
reddit_sentiment %<>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric())
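Before sampling texts, a quick tabulation (a sketch using the columns created above) confirms that the extracted digits line up with the raw BERT labels:

# Cross-check the parsed numeric score against the original label
reddit_sentiment %>% count(bert_label, bert_label_numeric)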

Step 7. Display 10 sample texts

# Two highest-confidence examples per star label
bert_example <- reddit_sentiment %>%
  filter(bert_label_numeric %in% 1:5) %>%
  group_by(bert_label) %>%
  arrange(desc(bert_score), .by_group = TRUE) %>%
  slice_head(n = 2) %>%
  ungroup()

bert_example %>%
  select(title_text, bert_label_numeric, sentiment_dict) %>%
  print()
## # A tibble: 10 × 3
##    title_text                                  bert_label_numeric sentiment_dict
##    <chr>                                                    <dbl>          <dbl>
##  1 "Elon Musk\u0019s false and misleading ele…                  1       -0.413  
##  2 "Illegal immigrant Elon Musk skipping like…                  1       -0.530  
##  3 "Elon Musk's big X interview with Trump ki…                  2        0.0139 
##  4 "Elon Musk doesn\u0019t love his kids, and…                  2        0.401  
##  5 "Elon Musk may already be overstaying his …                  3        0.151  
##  6 "Elon Musk says people should worry less a…                  3       -0.188  
##  7 "A young Elon Musk and his brother Kimbal …                  4        0.184  
##  8 "Elon Musk reacts to Taylor Swift's recent…                  4        0      
##  9 "The absolute master of pissing off Elon M…                  5        0.05   
## 10 "Tim Walz just called Elon Musk a \"dipshi…                  5        0.00536
cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict)
## [1] 0.2357682
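Because the BERT labels are ordinal (1-5 stars), a rank-based correlation is arguably the more appropriate summary; as a sketch:

# Spearman's rank correlation treats the star labels as ordinal
cor(reddit_sentiment$bert_label_numeric,
    reddit_sentiment$sentiment_dict,
    method = "spearman")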
ggplot(data = reddit_sentiment, aes(x = bert_label_numeric, y = sentiment_dict)) +
  geom_jitter(width = 0.1, height = 0) +
  geom_hline(yintercept = 0, color = '#FFD700', lwd = 1, linetype = 'dashed') +
  dark_theme_grey()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

The correlation between the sentiment values from the two methods is 0.24, which implies a weak positive relationship. While this suggests some agreement, the scatterplot reveals inconsistencies, particularly for high bert_label_numeric values (4-5 stars), which often correspond to neutral or negative dictionary-based sentiment.

On qualitative review, “Illegal immigrant Elon Musk skipping like a dipshit.” received a bert_label_numeric of 1 (very negative), aligning with its dictionary score of -0.530; this agreement adds credibility. However, “Tim Walz just called Elon Musk a ‘dipshit’ - definite dad/coach vibes. Love it.” received a bert_label_numeric of 5 (very positive) despite its ambiguous tone, while its dictionary score of 0.00536 is essentially neutral. This points to a likely misclassification by the BERT model, which appears to key on the positive closing phrase (“Love it”) while missing the insult.

Step 8. Discuss intriguing insights

### First Plot: Words appearing in positive and negative threads
# Remove comment-count outliers within each label using the 1.5 * IQR fence
reddit_sentiment_rm_outlier <- reddit_sentiment %>%
  group_by(bert_label) %>%
  filter(
    between(
      comments,
      quantile(comments, 0.25) - 1.5 * IQR(comments),
      quantile(comments, 0.75) + 1.5 * IQR(comments))) %>%
  ungroup()
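# Quick check (sketch): how many threads the 1.5 * IQR fence removes
nrow(reddit_sentiment) - nrow(reddit_sentiment_rm_outlier)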

# Stop word removal and tokenization on the outlier-filtered threads
reddit_sentiment_clean <- reddit_sentiment_rm_outlier %>%
  mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
  unnest_tokens(word, title_text, token = 'words') %>%
  anti_join(stop_words, by = "word") %>% 
  filter(str_detect(word, "[a-z]")) %>%
  # tokens are single words, so only "elon" and "musk" can occur
  filter(!word %in% c('elon', 'musk'))

# Negative threads (1-2 stars)
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(1, 2))
# Positive threads (4-5 stars)
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(4, 5))

# Remove words that are commonly seen in both negative and positive threads
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
  anti_join(reddit_sentiment_clean_positive, by = 'word')
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
  anti_join(reddit_sentiment_clean_negative, by = 'word')
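# Sketch: preview the top distinctive words on each side before plotting
bind_rows(
  negative = count(reddit_sentiment_clean_negative_unique, word, sort = TRUE),
  positive = count(reddit_sentiment_clean_positive_unique, word, sort = TRUE),
  .id = "side"
) %>%
  group_by(side) %>%
  slice_head(n = 10)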

# Word clouds reuse the custom HSV palette (`pal`) built in Step 4 rather than
# regenerating a new random set of 20 vivid colors

# negative
reddit_sentiment_clean_negative_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = -pi/6,
       maxRotation = -pi/6,
       rotateRatio = 1)
# positive
reddit_sentiment_clean_positive_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)
### Second Plot: Sentiment by year using a stacked bar plot
reddit_sentiment %<>%
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date)) %>%
  mutate(year = year(date))

reddit_sentiment %>%
  ggplot(aes(x = year, fill = bert_label)) +
  geom_bar(position = 'stack') +  # years are discrete, so count bars per year
  scale_x_continuous(breaks = seq(min(reddit_sentiment$year),
                                  max(reddit_sentiment$year),
                                  by = 1)) +
  scale_fill_brewer(palette = 'PuRd', direction = -1) +
  dark_theme_grey()
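The counts behind the stacked bars can also be printed directly as a sanity check; a minimal sketch:

# Threads per year and sentiment label
reddit_sentiment %>% count(year, bert_label)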

Visualizing the BERT results, I found that words such as “voter,” “accused,” “fraud,” “illegal,” and “maga” dominate the negative threads. These words point to political controversies and allegations, likely tied to elections and debates involving Elon Musk. Other terms such as “biden,” “joe,” and “win” reflect a focus on U.S. political figures and election outcomes, suggesting that negative mentions of Musk are bound up with divisive political narratives.

In positive threads, words such as “males,” “suggests,” “efficiency,” and “questions” point to more constructive discussions, perhaps around innovation, leadership, or achievements. Names like “Cuban” and “Vivek” indicate comparisons or collaborations with other prominent figures in contexts where Musk is framed positively; for example, one widely shared headline read “Elon Musk and Vivek Ramaswamy will lead new ‘Department of Government Efficiency’ in Trump administration.”

Lastly, the sentiment-by-year plot shows a pronounced spike in mentions in 2024, aligning with the U.S. presidential election year and confirming that Elon Musk became a focal point of political and public discourse during this period.