Major 3

Reddit data focus:

I will be looking at changes in sentiment towards FEMA in the past 12 months given the large impacts of flooding and hurricanes this year.

# Package names
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "sf", "tmap", "here", "ggdark", "syuzhet", "sentimentr", "lubridate")

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}

# Load packages
invisible(lapply(packages, library, character.only = TRUE))

Reddit threads

# using keyword
threads_1 <- find_thread_urls(keywords = 'FEMA', 
                              sort_by = 'relevance', 
                              period = 'year') %>% 
  drop_na()

## parsing URLs on page 1...
## parsing URLs on page 2...
## parsing URLs on page 3...

rownames(threads_1) <- NULL

colnames(threads_1)

## [1] "date_utc"  "timestamp" "title"     "text"      "subreddit" "comments" 
## [7] "url"

head(threads_1, 3) %>% knitr::kable()

date_utc	timestamp	title	subreddit	comments	url
2024-09-19	1726786161	FEMA accidentally declared a ‘Major Disaster’ for Texas over Saints’ thumping of Cowboys	nottheonion	21	https://www.reddit.com/r/nottheonion/comments/1fkxfl6/fema_accidentally_declared_a_major_disaster_for/
2024-10-10	1728588573	Harris campaign names Republicans who voted against FEMA funding	minnesota	51	https://www.reddit.com/r/minnesota/comments/1g0r0rp/harris_campaign_names_republicans_who_voted/
2024-10-14	1728908894	Analysts found that 33 posts containing claims debunked by FEMA, the White House and the US government had together generated more than 160 million views as of October 7	skeptic	58	https://www.reddit.com/r/skeptic/comments/1g3eir8/analysts_found_that_33_posts_containing_claims/

write_csv(threads_1,"threads_1.csv")

Clean Data and Tokenize

# Word tokenization
words <- threads_1 %>% 
  tidytext::unnest_tokens(output = word, input = text, token = "words") 

words %>%
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")

## Selecting by n

# Stop words
# load list of stop words - from the tidytext package
data("stop_words")
# view random 50 words
print(stop_words$word[sample(1:nrow(stop_words), 100)])

##   [1] "as"           "you're"       "one"          "c's"          "longer"      
##   [6] "nowhere"      "yours"        "they"         "theres"       "here's"      
##  [11] "a"            "is"           "getting"      "show"         "ever"        
##  [16] "came"         "which"        "say"          "gotten"       "quite"       
##  [21] "give"         "places"       "w"            "cause"        "nearly"      
##  [26] "perhaps"      "somebody"     "noone"        "just"         "and"         
##  [31] "so"           "down"         "known"        "done"         "backing"     
##  [36] "his"          "sure"         "que"          "per"          "aren't"      
##  [41] "onto"         "soon"         "why"          "last"         "highest"     
##  [46] "end"          "wouldn't"     "ever"         "around"       "different"   
##  [51] "you're"       "each"         "out"          "or"           "not"         
##  [56] "knows"        "or"           "again"        "q"            "under"       
##  [61] "nevertheless" "here"         "turns"        "been"         "via"         
##  [66] "pointing"     "you'd"        "nothing"      "some"         "i"           
##  [71] "and"          "now"          "wherein"      "k"            "therefore"   
##  [76] "we"           "it"           "aside"        "alone"        "after"       
##  [81] "do"           "says"         "problem"      "every"        "area"        
##  [86] "couldn't"     "below"        "an"           "already"      "then"        
##  [91] "wherever"     "rd"           "youngest"     "but"          "we're"       
##  [96] "states"       "even"         "room"         "few"          "nowhere"

# Regex that matches URL-type string
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads_1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # drop non-alphabet-only strings
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removal of the stop words. There should be fewer words now
print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)

## Before: 7027, After: 2695

words_clean %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")

Word Cloud

n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark or bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

words_clean <- words_clean %>%
  filter(word != "fema")

word_cloud <- words_clean %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

word_cloud

Tri-gram

# Extract tri-grams from your text data
words_ngram <- threads_1 %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)

# Show ngrams with sorted values
words_ngram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()

paired_words	n
NA	172
r tx rep	9
if you have	6
a lot of	5
i can t	5
a group of	4
helene rumor response	4
in the area	4
the federal government	4
the fema disaster	4
there is no	4
all domestic terrorists	3
any kind of	3
are all domestic	3
assistance you may	3
carolina reported threats	3
disaster fund to	3
domestic terrorists cpac	3
emergency management agency	3
even if you	3

# Remove stop words 
#separate the paired words into two columns
words_ngram_pair <- words_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# filter rows where there are stop words under word 1 column and word 2 column
words_ngram_pair_filtered <- words_ngram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>% 
  # drop non-alphabet-only strings
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]") & str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
words_ngram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))

# Sort the new tri-gram counts:
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

head(words_counts, 20) %>% 
  knitr::kable()

word1	word2	word3	n
helene	rumor	response	4
carolina	reported	threats	3
domestic	terrorists	cpac	3
emergency	management	agency	3
federal	emergency	management	3
fema	disaster	relief	3
helene	north	carolina	3
helene	relief	efforts	3
north	carolina	reported	3
fema	disaster	fund	2
fema	relief	money	2
local	fire	departments	2
militia	hunting	fema	2
relief	efforts	fema	2
1027format	pjpgauto	webps	1
2nd	district	representative	1
3carter	county	armed	1
5th	district	representative	1
6th	district	senator	1
accuses	president	biden	1

Review of tri-grams There are some very interested tri-grams here as people were clearly discussing the rumors about FEMA aid that were circulated by the Trump administration (“helene rumor response”; “militia hunting fema”, “accuses president biden”). There also seem to have just been discussions around finding resources (“fema disaster relief”, “fema disaster fund”, “fema relief money”) but these may have also had to do with the issues around rumors about aid. It looks like most of the specific focus was on hurricane Helene, even though there were other hurricanes and disasters throughout the year.

Sentiment Analysis: Dictionary Method

# Join thread title and text.
reddit_sentiment <- threads_1 %>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))

# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)

reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)

Sample texts and sentiment scores

sentimentr_example <- reddit_sentiment %>%
  distinct(title_text, .keep_all = TRUE) %>%
  mutate(sentimentr_abs = abs(sentiment_dict),
         sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>%
  group_by(sentimentr_binary) %>%
  arrange(desc(sentimentr_abs)) %>%
  slice_head(n = 5) %>%
  ungroup() %>%
  arrange(sentiment_dict)

# Create a gt table
library(gt)

## Warning: package 'gt' was built under R version 4.3.3

# Filter only the required columns for display
sentimentr_example_filtered <- sentimentr_example %>%
  select(title_text, sentiment_dict, sentimentr_binary)

# create table 
sentimentr_example_filtered %>%
  gt() %>%
  tab_header(
    title = "10 Reddit Posts by Sentiment") %>%
  cols_label(
    title_text = "Title",
    sentiment_dict = "Sentiment Score",
    sentimentr_binary = "Sentiment"
  ) %>%
  tab_style(
    style = cell_fill(color = "lightblue"),
    locations = cells_column_labels(columns = c(sentiment_dict, sentimentr_binary))
  ) %>%
  tab_style(
    style = cell_text(weight = "bold", color = "black"),
    locations = cells_body(columns = c(title_text))
  ) %>%
  opt_table_font(font = "Arial") %>%
  opt_table_lines()

Title	Sentiment Score	Sentiment
10 Reddit Posts by Sentiment
As Trump lies (and lies and lies) about FEMA, GOP gets called out on Hurricane Helene hypocrisy.	-1.1884246	negative
Desperate man says hurricane-hit father-in-law refusing FEMA help after falling for Trump lie.	-0.9125000	negative
Trump refuses to denounce threats to FEMA, doubles down on falsehoods.	-0.7236272	negative
Tuberville criticizes FEMAs Hurricane Helene response; agency says claims false.	-0.6633250	negative
I hate the federal government. Fuck FEMA..	-0.6330619	negative
Florida GOP Rep. Who Voted Against FEMA Funding Calls for Aid Before Milton.	0.2773501	positive
FEMA Reports They Spent All The Taxpayer Funding On A Sweet PowerPoint Presentation About Racial Equity.	0.3750000	positive
Why are Republicans against FEMA?. Why are Republicans against FEMA? Genuinely curious.	0.4142906	positive
Republicans don't forget to REFUSE those FEMA checks!.	0.4419417	positive
MAGA congressman says top priority is shutting down FEMA like the Department of Education.	0.5612486	positive

Dictionary Method Review The dictionary method does not seem very credible for this analysis. The text that comes up as negative seems to actually be negative, however, the text identified as positive does not seem to be positive at all. Therefore, I tried using the deep learning model.

# 2. Deep learning model
# import the data
reddit_sentiment <- read_csv("sample_reddit_bert.csv")

## New names:
## Rows: 237 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): title, text, subreddit, url, bert_label dbl (4): ...1, timestamp,
## comments, bert_score date (1): date_utc
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# drop NAs
reddit_sentiment %<>% drop_na('bert_label')

reddit_sentiment %<>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))

# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)

reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)

reddit_sentiment %<>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric())

cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict)

## [1] 0.1192125

bert_example <- reddit_sentiment %>%
  filter(bert_label_numeric %in% c(1,5)) %>%
  group_by(bert_label) %>%
  arrange(desc(bert_score)) %>%
  slice_head(n = 3) %>%
  ungroup()

# Filter only the required columns for display
bert_example_filtered <- bert_example %>%
  select(title_text, bert_label_numeric, bert_label)

# create table 
bert_example_filtered %>%
  gt() %>%
  tab_header(
    title = "10 Reddit Posts by Sentiment") %>%
  cols_label(
    title_text = "Title",
    bert_label_numeric = "Sentiment Score",
    bert_label = "Sentiment"
  ) %>%
  tab_style(
    style = cell_fill(color = "lightblue"),
    locations = cells_column_labels(columns = c(bert_label_numeric, bert_label))
  ) %>%
  tab_style(
    style = cell_text(weight = "bold", color = "black"),
    locations = cells_body(columns = c(title_text))
  ) %>%
  opt_table_font(font = "Arial") %>%
  opt_table_lines()

Title	Sentiment Score	Sentiment
10 Reddit Posts by Sentiment
'The worst I have ever seen': Disinformation chaos hammers FEMA.	1	1 star
FEMA just released us. And 250 other ambulances from north Florida. When we got there some folks said they were in camp for a week or more. Zero missions zero, back fill, and excess of poor communication. Boondoggle	1	1 star
Frankly ridiculous: FEMA administrator slams Trump for boosting false Helene recovery claims.	1	1 star
Thank you, FEMA. I wouldn't have shelter if not for you.. I can't recall her full name, but a FEMA volunteer named Debbie saw me lost and alone when I had nobody and nothing but the clothes on my back. She alone helped me find shelter, drove me there personally, and got me set up with several resources. I don't know how to contact her for thanks, but I owe her my life. Debbie, if you're out there reading this, thank you from the bottom of my heart for everything you did. You're a saint and I wish you the absolute best in life.	5	5 stars
Waffle House has a Storm Center with an entire operations team. So good they help FEMA..	5	5 stars
Just had my FEMA house inspection&&. &&and it was super easy. Asked some basic questions about the house and everyone living in it. Scanned the receipt from my chainsaw purchase, took pictures of the damage to the house and that was it. Super impressed with him coming out on a Sunday, he said they are out everyday. If you are waiting for FEMA to call you and you get a call from some random number thats probably it.	5	5 stars

ggplot(data = reddit_sentiment, aes(x = bert_label_numeric, y = sentiment_dict)) +
  geom_jitter(width = 0.1, height = 0) +
  geom_line(aes(y = 0), color = '#FFD700', lwd = 1, linetype='dashed') +
  dark_theme_grey()

## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

Figure 1 Number of threads by sentiment category.

reddit_sentiment %>%
  ggplot(aes(x = bert_label)) +
  geom_bar(fill = "white") +
  dark_theme_gray()

The deep learning model seemed to work much better.

Discuss intriguing insights derived from the sentiment analysis, supporting your observations with at least two plots.

Given that the deep learning model worked better to identify positive and negative threads, I used that model for the rest of the figures. Clearly, most of the time that people talk about FEMA, they are posting negative sentiments (Figure 1). Hurricane Helene and the hurricane in Florida both happened in October and seems to be the biggest reason that people were talking about FEMA (Figure 4). Those with positive emotions about FEMA talked about shelter, Tim Walz, how FEMA helped, etc. (Figure 3). Those with negative emotions about FEMA talked about NC and FL, money, “claims”, etc. (not sure whether insurance or fraudulent claims) (Figure 2).

# Word Cloud 
# Stop word removal and tokenization
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

reddit_sentiment_clean <- reddit_sentiment %>%
  mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
  # tokenize
  unnest_tokens(word, title_text, token = "words") %>%
  # remove stop words
  anti_join(stop_words, by = "word") %>%
  filter(str_detect(word, "[a-z]")) %>%
  filter(!word %in% c('FEMA'))

# negative text
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(1,2))
# positive text
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(4,5))

# Remove words that are commonly seen in both negative and positive threads
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
  anti_join(reddit_sentiment_clean_positive, by = 'word')
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
  anti_join(reddit_sentiment_clean_negative, by = 'word')

Figure 2. Words appearing in Negative threads.

# Wordcloud with a custom color palette
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

negative <- reddit_sentiment_clean_negative_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)

library(webshot)
library(htmlwidgets)
saveWidget(negative, '1.html', selfcontained = F)
webshot('1.html', '1.png', vwidth=500,vheight=300, delay = 5)

Figure 3. Words appearing in Positive threads.

# Wordcloud with a custom color palette
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

positive <- reddit_sentiment_clean_positive_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)

saveWidget(positive, '2.html', selfcontained = F)
webshot('2.html', '2.png', vwidth=500,vheight=300, delay = 5)

# Temporal Analysis
reddit_sentiment %<>%
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date)) %>%
  mutate(month = month(date))

# sentiment by year
reddit_sentiment %>%
  ggplot(aes(x = month, fill = sentiment_dict)) +
  geom_bar(position = 'stack') +
  scale_x_continuous(breaks = seq(min(reddit_sentiment$month),
                                  max(reddit_sentiment$month),
                                  by = 1)) +
  scale_fill_brewer(palette = 'PuRd', direction = -1) +
  dark_theme_grey()

## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Figure 4. Sentiment by month.