I will be looking at changes in sentiment towards FEMA in the past 12 months given the large impacts of flooding and hurricanes this year.
# Package names
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "sf", "tmap", "here", "ggdark", "syuzhet", "sentimentr", "lubridate")
# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
# Load packages
invisible(lapply(packages, library, character.only = TRUE))
# using keyword
threads_1 <- find_thread_urls(keywords = 'FEMA',
sort_by = 'relevance',
period = 'year') %>%
drop_na()
## parsing URLs on page 1...
## parsing URLs on page 2...
## parsing URLs on page 3...
rownames(threads_1) <- NULL
colnames(threads_1)
## [1] "date_utc" "timestamp" "title" "text" "subreddit" "comments"
## [7] "url"
head(threads_1, 3) %>% knitr::kable()
| date_utc | timestamp | title | text | subreddit | comments | url |
|---|---|---|---|---|---|---|
| 2024-09-19 | 1726786161 | FEMA accidentally declared a ‘Major Disaster’ for Texas over Saints’ thumping of Cowboys | nottheonion | 21 | https://www.reddit.com/r/nottheonion/comments/1fkxfl6/fema_accidentally_declared_a_major_disaster_for/ | |
| 2024-10-10 | 1728588573 | Harris campaign names Republicans who voted against FEMA funding | minnesota | 51 | https://www.reddit.com/r/minnesota/comments/1g0r0rp/harris_campaign_names_republicans_who_voted/ | |
| 2024-10-14 | 1728908894 | Analysts found that 33 posts containing claims debunked by FEMA, the White House and the US government had together generated more than 160 million views as of October 7 | skeptic | 58 | https://www.reddit.com/r/skeptic/comments/1g3eir8/analysts_found_that_33_posts_containing_claims/ |
write_csv(threads_1,"threads_1.csv")
# Word tokenization
words <- threads_1 %>%
tidytext::unnest_tokens(output = word, input = text, token = "words")
words %>%
count(word, sort = TRUE) %>%
top_n(20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
labs(x = "words",
y = "counts",
title = "Unique wordcounts")
## Selecting by n
# Stop words
# load list of stop words - from the tidytext package
data("stop_words")
# view random 50 words
print(stop_words$word[sample(1:nrow(stop_words), 100)])
## [1] "as" "you're" "one" "c's" "longer"
## [6] "nowhere" "yours" "they" "theres" "here's"
## [11] "a" "is" "getting" "show" "ever"
## [16] "came" "which" "say" "gotten" "quite"
## [21] "give" "places" "w" "cause" "nearly"
## [26] "perhaps" "somebody" "noone" "just" "and"
## [31] "so" "down" "known" "done" "backing"
## [36] "his" "sure" "que" "per" "aren't"
## [41] "onto" "soon" "why" "last" "highest"
## [46] "end" "wouldn't" "ever" "around" "different"
## [51] "you're" "each" "out" "or" "not"
## [56] "knows" "or" "again" "q" "under"
## [61] "nevertheless" "here" "turns" "been" "via"
## [66] "pointing" "you'd" "nothing" "some" "i"
## [71] "and" "now" "wherein" "k" "therefore"
## [76] "we" "it" "aside" "alone" "after"
## [81] "do" "says" "problem" "every" "area"
## [86] "couldn't" "below" "an" "already" "then"
## [91] "wherever" "rd" "youngest" "but" "we're"
## [96] "states" "even" "room" "few" "nowhere"
# Regex that matches URL-type string
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
words_clean <- threads_1 %>%
# drop URLs
mutate(text = str_replace_all(text, replace_reg, "")) %>%
# Tokenization (word tokens)
unnest_tokens(word, text, token = "words") %>%
# drop stop words
anti_join(stop_words, by = "word") %>%
# drop non-alphabet-only strings
filter(str_detect(word, "[a-z]"))
# Check the number of rows after removal of the stop words. There should be fewer words now
print(
glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 7027, After: 2695
words_clean %>%
count(word, sort = TRUE) %>%
top_n(20, n) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
labs(x = "words",
y = "counts",
title = "Unique wordcounts")
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark or bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))
words_clean <- words_clean %>%
filter(word != "fema")
word_cloud <- words_clean %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
word_cloud
# Extract tri-grams from your text data
words_ngram <- threads_1 %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
select(text) %>%
unnest_tokens(output = paired_words,
input = text,
token = "ngrams",
n = 3)
# Show ngrams with sorted values
words_ngram %>%
count(paired_words, sort = TRUE) %>%
head(20) %>%
knitr::kable()
| paired_words | n |
|---|---|
| NA | 172 |
| r tx rep | 9 |
| if you have | 6 |
| a lot of | 5 |
| i can t | 5 |
| a group of | 4 |
| helene rumor response | 4 |
| in the area | 4 |
| the federal government | 4 |
| the fema disaster | 4 |
| there is no | 4 |
| all domestic terrorists | 3 |
| any kind of | 3 |
| are all domestic | 3 |
| assistance you may | 3 |
| carolina reported threats | 3 |
| disaster fund to | 3 |
| domestic terrorists cpac | 3 |
| emergency management agency | 3 |
| even if you | 3 |
# Remove stop words
#separate the paired words into two columns
words_ngram_pair <- words_ngram %>%
separate(paired_words, c("word1", "word2", "word3"), sep = " ")
# filter rows where there are stop words under word 1 column and word 2 column
words_ngram_pair_filtered <- words_ngram_pair %>%
# drop stop words
filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>%
# drop non-alphabet-only strings
filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]") & str_detect(word3, "[a-z]"))
# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
words_ngram_pair_filtered %<>%
filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))
# Sort the new tri-gram counts:
words_counts <- words_ngram_pair_filtered %>%
count(word1, word2, word3) %>%
arrange(desc(n))
head(words_counts, 20) %>%
knitr::kable()
| word1 | word2 | word3 | n |
|---|---|---|---|
| helene | rumor | response | 4 |
| carolina | reported | threats | 3 |
| domestic | terrorists | cpac | 3 |
| emergency | management | agency | 3 |
| federal | emergency | management | 3 |
| fema | disaster | relief | 3 |
| helene | north | carolina | 3 |
| helene | relief | efforts | 3 |
| north | carolina | reported | 3 |
| fema | disaster | fund | 2 |
| fema | relief | money | 2 |
| local | fire | departments | 2 |
| militia | hunting | fema | 2 |
| relief | efforts | fema | 2 |
| 1027format | pjpgauto | webps | 1 |
| 2nd | district | representative | 1 |
| 3carter | county | armed | 1 |
| 5th | district | representative | 1 |
| 6th | district | senator | 1 |
| accuses | president | biden | 1 |
Review of tri-grams There are some very interested tri-grams here as people were clearly discussing the rumors about FEMA aid that were circulated by the Trump administration (“helene rumor response”; “militia hunting fema”, “accuses president biden”). There also seem to have just been discussions around finding resources (“fema disaster relief”, “fema disaster fund”, “fema relief money”) but these may have also had to do with the issues around rumors about aid. It looks like most of the specific focus was on hurricane Helene, even though there were other hurricanes and disasters throughout the year.
# Join thread title and text.
reddit_sentiment <- threads_1 %>%
mutate(title = replace_na(title, ""),
text = replace_na(text, ""),
title_text = str_c(title, text, sep = ". "))
# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)
sentimentr_example <- reddit_sentiment %>%
distinct(title_text, .keep_all = TRUE) %>%
mutate(sentimentr_abs = abs(sentiment_dict),
sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
TRUE ~ 'negative')) %>%
group_by(sentimentr_binary) %>%
arrange(desc(sentimentr_abs)) %>%
slice_head(n = 5) %>%
ungroup() %>%
arrange(sentiment_dict)
# Create a gt table
library(gt)
## Warning: package 'gt' was built under R version 4.3.3
# Filter only the required columns for display
sentimentr_example_filtered <- sentimentr_example %>%
select(title_text, sentiment_dict, sentimentr_binary)
# create table
sentimentr_example_filtered %>%
gt() %>%
tab_header(
title = "10 Reddit Posts by Sentiment") %>%
cols_label(
title_text = "Title",
sentiment_dict = "Sentiment Score",
sentimentr_binary = "Sentiment"
) %>%
tab_style(
style = cell_fill(color = "lightblue"),
locations = cells_column_labels(columns = c(sentiment_dict, sentimentr_binary))
) %>%
tab_style(
style = cell_text(weight = "bold", color = "black"),
locations = cells_body(columns = c(title_text))
) %>%
opt_table_font(font = "Arial") %>%
opt_table_lines()
| 10 Reddit Posts by Sentiment | ||
| Title | Sentiment Score | Sentiment |
|---|---|---|
| As Trump lies (and lies and lies) about FEMA, GOP gets called out on Hurricane Helene hypocrisy. | -1.1884246 | negative |
| Desperate man says hurricane-hit father-in-law refusing FEMA help after falling for Trump lie. | -0.9125000 | negative |
| Trump refuses to denounce threats to FEMA, doubles down on falsehoods. | -0.7236272 | negative |
| Tuberville criticizes FEMAs Hurricane Helene response; agency says claims false. | -0.6633250 | negative |
| I hate the federal government. Fuck FEMA.. | -0.6330619 | negative |
| Florida GOP Rep. Who Voted Against FEMA Funding Calls for Aid Before Milton. | 0.2773501 | positive |
| FEMA Reports They Spent All The Taxpayer Funding On A Sweet PowerPoint Presentation About Racial Equity. | 0.3750000 | positive |
| Why are Republicans against FEMA?. Why are Republicans against FEMA? Genuinely curious. | 0.4142906 | positive |
| Republicans don't forget to REFUSE those FEMA checks!. | 0.4419417 | positive |
| MAGA congressman says top priority is shutting down FEMA like the Department of Education. | 0.5612486 | positive |
Dictionary Method Review The dictionary method does not seem very credible for this analysis. The text that comes up as negative seems to actually be negative, however, the text identified as positive does not seem to be positive at all. Therefore, I tried using the deep learning model.
# 2. Deep learning model
# import the data
reddit_sentiment <- read_csv("sample_reddit_bert.csv")
## New names:
## Rows: 237 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): title, text, subreddit, url, bert_label dbl (4): ...1, timestamp,
## comments, bert_score date (1): date_utc
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# drop NAs
reddit_sentiment %<>% drop_na('bert_label')
reddit_sentiment %<>%
mutate(title = replace_na(title, ""),
text = replace_na(text, ""),
title_text = str_c(title, text, sep = ". "))
# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)
reddit_sentiment %<>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric())
cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict)
## [1] 0.1192125
bert_example <- reddit_sentiment %>%
filter(bert_label_numeric %in% c(1,5)) %>%
group_by(bert_label) %>%
arrange(desc(bert_score)) %>%
slice_head(n = 3) %>%
ungroup()
# Filter only the required columns for display
bert_example_filtered <- bert_example %>%
select(title_text, bert_label_numeric, bert_label)
# create table
bert_example_filtered %>%
gt() %>%
tab_header(
title = "10 Reddit Posts by Sentiment") %>%
cols_label(
title_text = "Title",
bert_label_numeric = "Sentiment Score",
bert_label = "Sentiment"
) %>%
tab_style(
style = cell_fill(color = "lightblue"),
locations = cells_column_labels(columns = c(bert_label_numeric, bert_label))
) %>%
tab_style(
style = cell_text(weight = "bold", color = "black"),
locations = cells_body(columns = c(title_text))
) %>%
opt_table_font(font = "Arial") %>%
opt_table_lines()
| 10 Reddit Posts by Sentiment | ||
| Title | Sentiment Score | Sentiment |
|---|---|---|
| 'The worst I have ever seen': Disinformation chaos hammers FEMA. | 1 | 1 star |
| FEMA just released us. And 250 other ambulances from north Florida. When we got there some folks said they were in camp for a week or more. Zero missions zero, back fill, and excess of poor communication. Boondoggle | 1 | 1 star |
| Frankly ridiculous: FEMA administrator slams Trump for boosting false Helene recovery claims. | 1 | 1 star |
| Thank you, FEMA. I wouldn't have shelter if not for you.. I can't recall her full name, but a FEMA volunteer named Debbie saw me lost and alone when I had nobody and nothing but the clothes on my back. She alone helped me find shelter, drove me there personally, and got me set up with several resources. I don't know how to contact her for thanks, but I owe her my life. Debbie, if you're out there reading this, thank you from the bottom of my heart for everything you did. You're a saint and I wish you the absolute best in life. | 5 | 5 stars |
| Waffle House has a Storm Center with an entire operations team. So good they help FEMA.. | 5 | 5 stars |
| Just had my FEMA house inspection&&. &&and it was super easy. Asked some basic questions about the house and everyone living in it. Scanned the receipt from my chainsaw purchase, took pictures of the damage to the house and that was it. Super impressed with him coming out on a Sunday, he said they are out everyday. If you are waiting for FEMA to call you and you get a call from some random number thats probably it. | 5 | 5 stars |
ggplot(data = reddit_sentiment, aes(x = bert_label_numeric, y = sentiment_dict)) +
geom_jitter(width = 0.1, height = 0) +
geom_line(aes(y = 0), color = '#FFD700', lwd = 1, linetype='dashed') +
dark_theme_grey()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
Figure 1 Number of threads by sentiment category.
reddit_sentiment %>%
ggplot(aes(x = bert_label)) +
geom_bar(fill = "white") +
dark_theme_gray()
The deep learning model seemed to work much better.
Discuss intriguing insights derived from the sentiment analysis, supporting your observations with at least two plots.
Given that the deep learning model worked better to identify positive and negative threads, I used that model for the rest of the figures. Clearly, most of the time that people talk about FEMA, they are posting negative sentiments (Figure 1). Hurricane Helene and the hurricane in Florida both happened in October and seems to be the biggest reason that people were talking about FEMA (Figure 4). Those with positive emotions about FEMA talked about shelter, Tim Walz, how FEMA helped, etc. (Figure 3). Those with negative emotions about FEMA talked about NC and FL, money, “claims”, etc. (not sure whether insurance or fraudulent claims) (Figure 2).
# Word Cloud
# Stop word removal and tokenization
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
reddit_sentiment_clean <- reddit_sentiment %>%
mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
# tokenize
unnest_tokens(word, title_text, token = "words") %>%
# remove stop words
anti_join(stop_words, by = "word") %>%
filter(str_detect(word, "[a-z]")) %>%
filter(!word %in% c('FEMA'))
# negative text
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
filter(bert_label_numeric %in% c(1,2))
# positive text
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
filter(bert_label_numeric %in% c(4,5))
# Remove words that are commonly seen in both negative and positive threads
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
anti_join(reddit_sentiment_clean_positive, by = 'word')
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
anti_join(reddit_sentiment_clean_negative, by = 'word')
Figure 2. Words appearing in Negative threads.
# Wordcloud with a custom color palette
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))
negative <- reddit_sentiment_clean_negative_unique %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = pi/6,
maxRotation = pi/6,
rotateRatio = 1)
library(webshot)
library(htmlwidgets)
saveWidget(negative, '1.html', selfcontained = F)
webshot('1.html', '1.png', vwidth=500,vheight=300, delay = 5)
Figure 3. Words appearing in Positive threads.
# Wordcloud with a custom color palette
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))
positive <- reddit_sentiment_clean_positive_unique %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = pi/6,
maxRotation = pi/6,
rotateRatio = 1)
saveWidget(positive, '2.html', selfcontained = F)
webshot('2.html', '2.png', vwidth=500,vheight=300, delay = 5)
# Temporal Analysis
reddit_sentiment %<>%
mutate(date = as.POSIXct(date_utc)) %>%
filter(!is.na(date)) %>%
mutate(month = month(date))
# sentiment by year
reddit_sentiment %>%
ggplot(aes(x = month, fill = sentiment_dict)) +
geom_bar(position = 'stack') +
scale_x_continuous(breaks = seq(min(reddit_sentiment$month),
max(reddit_sentiment$month),
by = 1)) +
scale_fill_brewer(palette = 'PuRd', direction = -1) +
dark_theme_grey()
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Figure 4. Sentiment by month.