CP8883 Major_Assignment 3

Introduction

The document examines changes in sentiment towards CDC over the last 12 months.

1. Resource Preparation

First, we would like to make sure to load all required packages for the analysis.

# Specify packages to load
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "fastmap", "wordcloud2", "textdata", "sf", "tmap", "here", "glue", "stringi", "syuzhet", "sentimentr", "gt", "webshot", "htmlwidgets")

# Install non-installed packages
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}

# Load packages
invisible(lapply(packages, library, character.only = TRUE))

webshot::install_phantomjs()

2. Search Reddit Threads

Next, we will search Reddit threads on CDC over the last 12 months and save the original thread.

# using keyword
threads_1 <- find_thread_urls(keywords = 'CDC', 
                              sort_by = 'relevance', 
                              period = 'year') %>%
  drop_na()

rownames(threads_1) <- NULL

colnames(threads_1)

head(threads_1, 3) %>% knitr::kable()

write_csv(threads_1, "E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/reddit.csv")

threads_original <- read.csv("E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/reddit.csv")

3. Data Cleaning & Tokenization

Next, we will clean and tokenize the data by words. We can also visualize the frequency of unique words by a bar graph. From the bar graph, we can see words most frequently observed from the Reddit threads, including covid, time, people, test, etc.

# Load Regex that matches URL-type string
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

# Clean and tokenize the dataset
threads_clean_token <- threads_original %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # drop non-alphabet-only strings
  filter(str_detect(word, "[a-z]"))

# Check the nrows of the cleaned & word-tokenized dataset
nrow(threads_clean_token)

# Check the frequency of words shown
threads_clean_token %>%
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")

4. Word Cloud

Moving forward, we can make a word cloud that shows the frequency of words except the keyword “CDC” and “cdc”. You can see words including “COVID”, “time”, and “people” shown in bigger sizes from the word cloud.

# Color Adjustment for the Word Cloud
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark or bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

# Create Word Cloud
word_cloud_1 <- threads_clean_token %>% 
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

saveWidget(word_cloud_1, 'word_cloud_1.html', selfcontained = F)
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/word_cloud_1.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/word_cloud_1.png', vwidth=1000,vheight=600, delay = 5)

5. Tri-Gram Analysis

Next, we extract trigrams from the text data and remove those containing stop words or non-alphabetic terms. The resulting table highlights interesting patterns in public sentiment toward mask mandates and broader health-related discourse. For example, phrases like “liberals-killed-masking,” “angeles-mask-ban,” “adams-support-mask,” and “daily-fiber-intake” suggest polarized views on masking policies. In contrast, phrases such as “average-life-expectancy,” “covid-cancer-increase,” “cancer-increase-link,” and “deadlier-puzzling-scientists” possibly reflect how people’s exposure to COVID and COVID vaccination affects their health outcomes. Additionally, the phrase “mcdonalds-shares-fall” may be linked to public concern over food safety or health violations following the CDC’s latest inspection of McDonald’s.

# get tri-grams
words_trigram <- threads_original %>%
  select(text) %>%
  unnest_tokens(output = paired_words, input = text, token = "ngrams", n = 3)

# Show tri-grams with sorted values
words_trigram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()

paired_words	n
NA	178
a lot of	10
you have to	8
e coli outbreak	7
in the u.s	7
to be a	7
u sma pao	7
coli outbreak linked	6
on the road	6
one of the	6
outbreak linked to	6
you need to	6
going to be	5
he said he	5
i told him	5
life expectancy in	5
that the cdc	5
the cdc says	5
the fact that	5
the number of	5

#Separate the paired words into three columns
words_trigram_pair <- words_trigram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# filter rows where there are stop words under word 1, word 2, and word 3 columns
words_trigram_pair_filtered <- words_trigram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>% 
  # Keep only alphabetic words in each trigram
  filter(str_detect(word1, "^[a-z]+$"),
         str_detect(word2, "^[a-z]+$"),
         str_detect(word3, "^[a-z]+$")) %>%
  # drop non-alphabet-only strings
  filter(str_detect(word1, "[a-z]"),
         str_detect(word2, "[a-z]"),
         str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
words_trigram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1),
         stri_enc_isascii(word2),
         stri_enc_isascii(word3))

# Sort the new tri-gram counts
trigram_words_counts <- words_trigram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

head(trigram_words_counts, 20) %>% 
  knitr::kable()

word1	word2	word3	n
coli	outbreak	linked	6
health	health	news	3
liberals	killed	masking	3
mcdonalds	shares	fall	3
adams	support	mask	2
angeles	mask	ban	2
army	war	college	2
attacks	post	covid	2
average	life	expectancy	2
avoid	mounting	kerb	2
ban	covid	https	2
ban	subway	crime	2
btt	amp	ftt	2
cancer	increase	link	2
checkout	true	https	2
covid	cancer	increase	2
current	affairs	magazine	2
daily	fiber	intake	2
deadlier	puzzling	scientists	2
disease	deadlier	puzzling	2

6. Sentiment Analysis

Moving forward, we conducted a sentiment analysis on the text data using a dictionary method that accommodates negations.

# Join thread title and text.
reddit_sentiment <- threads_original %>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))

# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)

# Define positive and negative sentiment
sentimentr_example <- reddit_sentiment %>%
  mutate(sentimentr_abs = abs(sentiment_dict),
         sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>%
  group_by(sentimentr_binary) %>%
  arrange(desc(sentimentr_abs)) %>%
  ungroup() %>%
  arrange(sentiment_dict)

7. Display 10 sample texts

From the sentiment analysis, we can display 10 sample texts for further analysis, along with their sentiment scores, to evaluate the credibility of the sentiment analysis outcomes. The results predominantly show negative sentiment in the threads, which aligns with the broader context of public sentiment surrounding the CDC’s role in public health. The overall negative sentiment is likely driven by people’s concerns and disappointment over the CDC’s ineffective control of various diseases including COVID-19 and highlights the public’s critical view of health authorities.

# Select the columns, clean the data, and select 10 rows to make the table
sentimentr_example_filtered <- sentimentr_example %>%
  select(title_text, sentiment_dict, sentimentr_binary) %>% 
  mutate(title_text = str_trim(str_replace_all(title_text, "[^[:print:][:space:]]", ""))) %>% # delete special characters
  mutate(title_text = str_trim(str_replace_all(title_text, "[[:punct:]]+$", ""))) %>% # delete unnecessary punctuation marks
  distinct(title_text, .keep_all = TRUE) %>% 
  head(10)

# Create and format the table
ten_sentiment_table <- sentimentr_example_filtered %>% 
  mutate(sentiment_dict = round(sentiment_dict, 3)) %>% 
  gt() %>%
  tab_header(title = "10 Reddit Threads by Sentiment") %>%
  cols_label(
    title_text = "Title",
    sentimentr_binary = "Sentiment (Positive/Negative)",
    sentiment_dict = "Sentiment Score"
  ) %>%
  tab_style(
    style = cell_text(weight = "bold", color = "black", align = "left"),
    locations = cells_body(columns = c(title_text))
  ) %>%
  tab_style(
    style = cell_text(align = "left"),
    locations = cells_body(columns = c(sentiment_dict, sentimentr_binary))
  ) %>%
  tab_options(
    table.font.names = "Times New Roman",
    table.font.size = 24,
    row.striping.include_table_body = TRUE,
    column_labels.font.weight = "bold",
    data_row.padding = px(5)
  ) %>%
  opt_table_lines()

# Display the table
gtsave(ten_sentiment_table, "ten_sentiment_table.html")
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/ten_sentiment_table.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/ten_sentiment_table.png')

8. Discuss intriguing insights

Moving forward, we can discuss the sentiments observed in Reddit threads related to the CDC over the last 12 months. The first bar chart reveals that there are 160 negative threads (65% of the total) and 86 positive threads (35% of the total), which highlight the predominance of negative sentiment regarding the CDC during the 12-months period. The second histogram shows that the sentiment scores of the threads are generally skewed toward zero, with positive threads exhibiting a stronger negative skew. This suggests that the overall sentiment of threads about the CDC is somewhat negative. Lastly, word clouds over positive threads and negative threads were created respectively. Word cloud on positive threads shows action-related words (such as instructors, change, lessons) and family-related words (such as mom, sister, dogs) that highlight social connection and partnership. On the other hand, the word cloud for negative threads highlights words related to the chronology of epidemics and the authorities’ counteractions (outbreak, ban, isolation, masking) which also show concerns and frustrations surrounding public health measures, as well as technical terms that describe the origin and symptoms of the diseases (coli, tdap, fever, virus).

8.1. Bar Chart: No. of Threads by Binary Sentiment

# Calculate counts and proportions
sentimentr_example_summary <- sentimentr_example %>%
  group_by(sentimentr_binary) %>%
  summarise(count = n(), total = nrow(sentimentr_example), proportion = count / total)

# Bar chart: No. of threads by binary sentiment
bar_plot <- ggplot(sentimentr_example, aes(x = sentimentr_binary, fill = sentimentr_binary)) +
  geom_bar() +
  geom_text(data = sentimentr_example_summary,
            aes(x = sentimentr_binary, 
                y = count + 5,
                label = paste0(count, " (", scales::percent(proportion), ")")),
            size = 4, 
            color = "black",
            fontface = "bold") +
  labs(title = "Bar Chart: No. of Threads by Binary Sentiment",
       x = "Binary Sentiment", y = "No. of threads") +
  scale_fill_manual(values = c("red", "skyblue"), guide = "none") + 
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 10),
    legend.position = "none"
  )

print(bar_plot)

8.2. Histogram: No. of Threads by Sentiment Score

# Histogram: No. of threads by sentiment score
histogram_plot <- ggplot(sentimentr_example, aes(x = sentiment_dict, fill = sentimentr_binary)) +
  geom_histogram(binwidth = 0.03, alpha = 0.5, position = "identity", color = "black") +  
  labs(title = "Histogram: No. of Threads by Sentiment Score",
       x = "Sentiment Score", y = "No. of Threads") +
  scale_fill_manual(values = c("red", "skyblue"), guide = "none") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        axis.title = element_text(size = 10),
        axis.text = element_text(size = 10),
        legend.position = "none")

print(histogram_plot)

8.3. Word Cloud: Positive Threads

# Data Tokenization: Text to Words
reddit_sentiment_clean <- sentimentr_example %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

# Create separate dataset for threads with positive and negative sentiment
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
  filter(sentiment_dict >= 0)

reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
  filter(sentiment_dict < 0)

# Remove common words between positive and negative sentiment (to get unique words)
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
  anti_join(reddit_sentiment_clean_negative, by = 'word')

reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
  anti_join(reddit_sentiment_clean_positive, by = 'word')

# Word cloud: positive threads
positive_word_cloud <- reddit_sentiment_clean_positive_unique %>% 
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal,
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)

saveWidget(positive_word_cloud, 'positive.html', selfcontained = F)
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/positive.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/positive.png', vwidth=1000,vheight=600, delay = 5)

8.4. Word Cloud: Negative Threads

negative_word_cloud <- reddit_sentiment_clean_negative_unique %>% 
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal,
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)

saveWidget(negative_word_cloud, 'negative.html', selfcontained = F)
webshot('E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/negative.html', 'E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/negative.png', vwidth=1000,vheight=600, delay = 5)