The document examines changes in sentiment towards CDC over the last 12 months.
First, we would like to make sure to load all required packages for the analysis.
# Specify packages to load
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "fastmap", "wordcloud2", "textdata", "sf", "tmap", "here", "glue", "stringi", "syuzhet", "sentimentr", "gt", "webshot", "htmlwidgets")
# Install non-installed packages
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
# Load packages
invisible(lapply(packages, library, character.only = TRUE))
webshot::install_phantomjs()
Next, we will search Reddit threads on CDC over the last 12 months and save the original thread.
# using keyword
threads_1 <- find_thread_urls(keywords = 'CDC',
sort_by = 'relevance',
period = 'year') %>%
drop_na()
rownames(threads_1) <- NULL
colnames(threads_1)
head(threads_1, 3) %>% knitr::kable()
write_csv(threads_1, "E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/reddit.csv")
threads_original <- read.csv("E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/reddit.csv")
Next, we will clean and tokenize the data by words. We can also visualize the frequency of unique words by a bar graph. From the bar graph, we can see words most frequently observed from the Reddit threads, including covid, time, people, test, etc.
# Load Regex that matches URL-type string
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
# Clean and tokenize the dataset
threads_clean_token <- threads_original %>%
# drop URLs
mutate(text = str_replace_all(text, replace_reg, "")) %>%
# Tokenization (word tokens)
unnest_tokens(word, text, token = "words") %>%
# drop stop words
anti_join(stop_words, by = "word") %>%
# drop non-alphabet-only strings
filter(str_detect(word, "[a-z]"))
# Check the nrows of the cleaned & word-tokenized dataset
nrow(threads_clean_token)
# Check the frequency of words shown
threads_clean_token %>%
filter(word != "CDC") %>% # filter the word "CDC"
filter(word != "cdc") %>% # filter the word "cdc"
count(word, sort = TRUE) %>%
top_n(20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
labs(x = "words",
y = "counts",
title = "Unique wordcounts")
Moving forward, we can make a word cloud that shows the frequency of words except the keyword “CDC” and “cdc”. You can see words including “COVID”, “time”, and “people” shown in bigger sizes from the word cloud.
# Color Adjustment for the Word Cloud
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark or bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))
# Create Word Cloud
word_cloud_1 <- threads_clean_token %>%
filter(word != "CDC") %>% # filter the word "CDC"
filter(word != "cdc") %>% # filter the word "cdc"
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
saveWidget(word_cloud_1, 'word_cloud_1.html', selfcontained = F)
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/word_cloud_1.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/word_cloud_1.png', vwidth=1000,vheight=600, delay = 5)
Next, we extract trigrams from the text data and remove those containing stop words or non-alphabetic terms. The resulting table highlights interesting patterns in public sentiment toward mask mandates and broader health-related discourse. For example, phrases like “liberals-killed-masking,” “angeles-mask-ban,” “adams-support-mask,” and “daily-fiber-intake” suggest polarized views on masking policies. In contrast, phrases such as “average-life-expectancy,” “covid-cancer-increase,” “cancer-increase-link,” and “deadlier-puzzling-scientists” possibly reflect how people’s exposure to COVID and COVID vaccination affects their health outcomes. Additionally, the phrase “mcdonalds-shares-fall” may be linked to public concern over food safety or health violations following the CDC’s latest inspection of McDonald’s.
# get tri-grams
words_trigram <- threads_original %>%
select(text) %>%
unnest_tokens(output = paired_words, input = text, token = "ngrams", n = 3)
# Show tri-grams with sorted values
words_trigram %>%
count(paired_words, sort = TRUE) %>%
head(20) %>%
knitr::kable()
| paired_words | n |
|---|---|
| NA | 178 |
| a lot of | 10 |
| you have to | 8 |
| e coli outbreak | 7 |
| in the u.s | 7 |
| to be a | 7 |
| u sma pao | 7 |
| coli outbreak linked | 6 |
| on the road | 6 |
| one of the | 6 |
| outbreak linked to | 6 |
| you need to | 6 |
| going to be | 5 |
| he said he | 5 |
| i told him | 5 |
| life expectancy in | 5 |
| that the cdc | 5 |
| the cdc says | 5 |
| the fact that | 5 |
| the number of | 5 |
#Separate the paired words into three columns
words_trigram_pair <- words_trigram %>%
separate(paired_words, c("word1", "word2", "word3"), sep = " ")
# filter rows where there are stop words under word 1, word 2, and word 3 columns
words_trigram_pair_filtered <- words_trigram_pair %>%
# drop stop words
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
# Keep only alphabetic words in each trigram
filter(str_detect(word1, "^[a-z]+$"),
str_detect(word2, "^[a-z]+$"),
str_detect(word3, "^[a-z]+$")) %>%
# drop non-alphabet-only strings
filter(str_detect(word1, "[a-z]"),
str_detect(word2, "[a-z]"),
str_detect(word3, "[a-z]"))
# Filter out words that are not encoded in ASCII
words_trigram_pair_filtered %<>%
filter(stri_enc_isascii(word1),
stri_enc_isascii(word2),
stri_enc_isascii(word3))
# Sort the new tri-gram counts
trigram_words_counts <- words_trigram_pair_filtered %>%
count(word1, word2, word3) %>%
arrange(desc(n))
head(trigram_words_counts, 20) %>%
knitr::kable()
| word1 | word2 | word3 | n |
|---|---|---|---|
| coli | outbreak | linked | 6 |
| health | health | news | 3 |
| liberals | killed | masking | 3 |
| mcdonalds | shares | fall | 3 |
| adams | support | mask | 2 |
| angeles | mask | ban | 2 |
| army | war | college | 2 |
| attacks | post | covid | 2 |
| average | life | expectancy | 2 |
| avoid | mounting | kerb | 2 |
| ban | covid | https | 2 |
| ban | subway | crime | 2 |
| btt | amp | ftt | 2 |
| cancer | increase | link | 2 |
| checkout | true | https | 2 |
| covid | cancer | increase | 2 |
| current | affairs | magazine | 2 |
| daily | fiber | intake | 2 |
| deadlier | puzzling | scientists | 2 |
| disease | deadlier | puzzling | 2 |
Moving forward, we conducted a sentiment analysis on the text data using a dictionary method that accommodates negations.
# Join thread title and text.
reddit_sentiment <- threads_original %>%
mutate(title = replace_na(title, ""),
text = replace_na(text, ""),
title_text = str_c(title, text, sep = ". "))
# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)
# Define positive and negative sentiment
sentimentr_example <- reddit_sentiment %>%
mutate(sentimentr_abs = abs(sentiment_dict),
sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
TRUE ~ 'negative')) %>%
group_by(sentimentr_binary) %>%
arrange(desc(sentimentr_abs)) %>%
ungroup() %>%
arrange(sentiment_dict)
From the sentiment analysis, we can display 10 sample texts for further analysis, along with their sentiment scores, to evaluate the credibility of the sentiment analysis outcomes. The results predominantly show negative sentiment in the threads, which aligns with the broader context of public sentiment surrounding the CDC’s role in public health. The overall negative sentiment is likely driven by people’s concerns and disappointment over the CDC’s ineffective control of various diseases including COVID-19 and highlights the public’s critical view of health authorities.
# Select the columns, clean the data, and select 10 rows to make the table
sentimentr_example_filtered <- sentimentr_example %>%
select(title_text, sentiment_dict, sentimentr_binary) %>%
mutate(title_text = str_trim(str_replace_all(title_text, "[^[:print:][:space:]]", ""))) %>% # delete special characters
mutate(title_text = str_trim(str_replace_all(title_text, "[[:punct:]]+$", ""))) %>% # delete unnecessary punctuation marks
distinct(title_text, .keep_all = TRUE) %>%
head(10)
# Create and format the table
ten_sentiment_table <- sentimentr_example_filtered %>%
mutate(sentiment_dict = round(sentiment_dict, 3)) %>%
gt() %>%
tab_header(title = "10 Reddit Threads by Sentiment") %>%
cols_label(
title_text = "Title",
sentimentr_binary = "Sentiment (Positive/Negative)",
sentiment_dict = "Sentiment Score"
) %>%
tab_style(
style = cell_text(weight = "bold", color = "black", align = "left"),
locations = cells_body(columns = c(title_text))
) %>%
tab_style(
style = cell_text(align = "left"),
locations = cells_body(columns = c(sentiment_dict, sentimentr_binary))
) %>%
tab_options(
table.font.names = "Times New Roman",
table.font.size = 24,
row.striping.include_table_body = TRUE,
column_labels.font.weight = "bold",
data_row.padding = px(5)
) %>%
opt_table_lines()
# Display the table
gtsave(ten_sentiment_table, "ten_sentiment_table.html")
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/ten_sentiment_table.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/ten_sentiment_table.png')
Moving forward, we can discuss the sentiments observed in Reddit threads related to the CDC over the last 12 months. The first bar chart reveals that there are 160 negative threads (65% of the total) and 86 positive threads (35% of the total), which highlight the predominance of negative sentiment regarding the CDC during the 12-months period. The second histogram shows that the sentiment scores of the threads are generally skewed toward zero, with positive threads exhibiting a stronger negative skew. This suggests that the overall sentiment of threads about the CDC is somewhat negative. Lastly, word clouds over positive threads and negative threads were created respectively. Word cloud on positive threads shows action-related words (such as instructors, change, lessons) and family-related words (such as mom, sister, dogs) that highlight social connection and partnership. On the other hand, the word cloud for negative threads highlights words related to the chronology of epidemics and the authorities’ counteractions (outbreak, ban, isolation, masking) which also show concerns and frustrations surrounding public health measures, as well as technical terms that describe the origin and symptoms of the diseases (coli, tdap, fever, virus).
# Calculate counts and proportions
sentimentr_example_summary <- sentimentr_example %>%
group_by(sentimentr_binary) %>%
summarise(count = n(), total = nrow(sentimentr_example), proportion = count / total)
# Bar chart: No. of threads by binary sentiment
bar_plot <- ggplot(sentimentr_example, aes(x = sentimentr_binary, fill = sentimentr_binary)) +
geom_bar() +
geom_text(data = sentimentr_example_summary,
aes(x = sentimentr_binary,
y = count + 5,
label = paste0(count, " (", scales::percent(proportion), ")")),
size = 4,
color = "black",
fontface = "bold") +
labs(title = "Bar Chart: No. of Threads by Binary Sentiment",
x = "Binary Sentiment", y = "No. of threads") +
scale_fill_manual(values = c("red", "skyblue"), guide = "none") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.title = element_text(size = 10),
axis.text = element_text(size = 10),
legend.position = "none"
)
print(bar_plot)
# Histogram: No. of threads by sentiment score
histogram_plot <- ggplot(sentimentr_example, aes(x = sentiment_dict, fill = sentimentr_binary)) +
geom_histogram(binwidth = 0.03, alpha = 0.5, position = "identity", color = "black") +
labs(title = "Histogram: No. of Threads by Sentiment Score",
x = "Sentiment Score", y = "No. of Threads") +
scale_fill_manual(values = c("red", "skyblue"), guide = "none") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 12),
axis.title = element_text(size = 10),
axis.text = element_text(size = 10),
legend.position = "none")
print(histogram_plot)
# Data Tokenization: Text to Words
reddit_sentiment_clean <- sentimentr_example %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
# Create separate dataset for threads with positive and negative sentiment
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
filter(sentiment_dict >= 0)
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
filter(sentiment_dict < 0)
# Remove common words between positive and negative sentiment (to get unique words)
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
anti_join(reddit_sentiment_clean_negative, by = 'word')
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
anti_join(reddit_sentiment_clean_positive, by = 'word')
# Word cloud: positive threads
positive_word_cloud <- reddit_sentiment_clean_positive_unique %>%
filter(word != "CDC") %>% # filter the word "CDC"
filter(word != "cdc") %>% # filter the word "cdc"
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = pi/6,
maxRotation = pi/6,
rotateRatio = 1)
saveWidget(positive_word_cloud, 'positive.html', selfcontained = F)
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/positive.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/positive.png', vwidth=1000,vheight=600, delay = 5)
negative_word_cloud <- reddit_sentiment_clean_negative_unique %>%
filter(word != "CDC") %>% # filter the word "CDC"
filter(word != "cdc") %>% # filter the word "cdc"
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = pi/6,
maxRotation = pi/6,
rotateRatio = 1)
saveWidget(negative_word_cloud, 'negative.html', selfcontained = F)
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/negative.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/negative.png', vwidth=1000,vheight=600, delay = 5)