CP8883 Major_Assignment 3

Introduction

This document wishes to examine the changes in sentiment towards CDC over the last 12 months. The methods and results of the analysis is as follows:

1. Resource Preparation

First, we would like to make sure to load all required packages for the analysis.

# Specify packages to load
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "sf", "tmap", "here", "glue", "stringi", "syuzhet", "sentimentr", "gt")

# Install non-installed packages
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}

# Load packages
invisible(lapply(packages, library, character.only = TRUE))

2. Search Reddit Threads

Next, let’s search reddit threads on CDC over the last 12 months, and save the original threads as a csv file.

# using keyword
threads_1 <- find_thread_urls(keywords = 'CDC', 
                              sort_by = 'relevance', 
                              period = 'year') %>%
  drop_na()

rownames(threads_1) <- NULL

colnames(threads_1)

head(threads_1, 3) %>% knitr::kable()

write_csv(threads_1, "E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/reddit.csv")

threads_original <- read.csv("E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment/major_assignment_3/reddit.csv")

3. Data Cleaning & Tokenization

Next, we will clean and tokenize the data by words. After this process, the dataset will have 5,908 rows. We can also visualize the frequency of unique words by a bar graph.

# Load Regex that matches URL-type string
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

# Clean and tokenize the dataset
threads_clean_token <- threads_original %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # drop non-alphabet-only strings
  filter(str_detect(word, "[a-z]"))

# Check the nrows of the cleaned & word-tokenized dataset
nrow(threads_clean_token)

# Check the frequency of words shown
threads_clean_token %>%
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")

4. Word Cloud

Moving forward, we can make a word cloud that shows the frequency of words except the keyword “CDC” and “cdc”. You can see frequently-used words including “COVID”, “time”, and “people” shown in the word cloud.

# Color Adjustment for the Word Cloud
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark or bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

# Create Word Cloud
threads_clean_token %>% 
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

5. Tri-Gram Analysis

Next, we extract trigrams from the text data and remove those containing stop words or non-alphabetic terms. The resulting table highlights interesting patterns in public sentiment toward mask mandates and broader health-related discourse. For example, phrases like “liberals-killed-masking,” “angeles-mask-ban,” “adams-support-mask,” and “daily-fiber-intake” suggest polarized views on masking policies. In contrast, phrases such as “average-life-expectancy,” “covid-cancer-increase,” “cancer-increase-link,” and “deadlier-puzzling-scientists” possibly reflect how people’s exposure to COVID and COVID vaccination affects their health outcomes. Additionally, the phrase “mcdonalds-shares-fall” may be linked to public concern over food safety or health violations following the CDC’s latest inspection of McDonald’s.

# get tri-grams
words_trigram <- threads_original %>%
  select(text) %>%
  unnest_tokens(output = paired_words, input = text, token = "ngrams", n = 3)

# Show tri-grams with sorted values
words_trigram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()

#Separate the paired words into three columns
words_trigram_pair <- words_trigram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# filter rows where there are stop words under word 1, word 2, and word 3 columns
words_trigram_pair_filtered <- words_trigram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>% 
  # Keep only alphabetic words in each trigram
  filter(str_detect(word1, "^[a-z]+$"),
         str_detect(word2, "^[a-z]+$"),
         str_detect(word3, "^[a-z]+$")) %>%
  # drop non-alphabet-only strings
  filter(str_detect(word1, "[a-z]"),
         str_detect(word2, "[a-z]"),
         str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
words_trigram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1),
         stri_enc_isascii(word2),
         stri_enc_isascii(word3))

# Sort the new tri-gram counts
trigram_words_counts <- words_trigram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

head(trigram_words_counts, 20) %>% 
  knitr::kable()

6. Sentiment Analysis

Moving forward, we conducted a sentiment analysis on the text data using a dictionary method that accommodates negations.

# Join thread title and text.
reddit_sentiment <- threads_original %>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))

# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)

# Define positive and negative sentiment
sentimentr_example <- reddit_sentiment %>%
  mutate(sentimentr_abs = abs(sentiment_dict),
         sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>%
  group_by(sentimentr_binary) %>%
  arrange(desc(sentimentr_abs)) %>%
  ungroup() %>%
  arrange(sentiment_dict)

7. Display 10 sample texts

From the sentiment analysis, we can display 10 sample texts for further analysis, along with their sentiment scores, to evaluate the credibility of the sentiment analysis outcomes. The results predominantly show negative sentiment in the threads, which aligns with the broader context of public sentiment surrounding the CDC’s role in public health. The overall negative sentiment is likely driven by people’s concerns and disappointment over the CDC’s ineffective control of various deseases including COVID-19 and highlights the public’s critical view of health authorities.

# Select the columns, clean the data and select 10 rows to make the table
sentimentr_example_filtered <- sentimentr_example %>%
  select(title_text, sentiment_dict, sentimentr_binary) %>%
  mutate(title_text = str_trim(str_replace_all(title_text, "[^[:print:][:space:]]", ""))) %>% # delete spacial characters
  mutate(title_text = str_trim(str_replace_all(title_text, "[[:punct:]]+$", ""))) %>% # delete unnecessary signs
  distinct(title_text, .keep_all = TRUE) %>%
  head(10)

# Create and format the table
sentimentr_example_filtered %>%
  mutate(sentiment_dict = round(sentiment_dict, 3)) %>%
  gt() %>%
  tab_header(title = "10 Reddit Threads by Sentiment") %>%
  cols_label(title_text = "Title",
             sentimentr_binary = "Sentiment (Positive/Negative)",
             sentiment_dict = "Sentiment Score") %>%
  tab_style(style = cell_text(weight = "bold", color = "black", align = "left"),
    locations = cells_body(columns = c(title_text))) %>%
  tab_style(style = cell_text(align = "left"), locations = cells_body(columns = c(sentiment_dict, sentimentr_binary))) %>%
  tab_options(
    table.font.names = "Times New Roman",
    table.font.size = 13,
    row.striping.include_table_body = TRUE,
    column_labels.font.weight = "bold",
    data_row.padding = px(5)) %>%
  opt_table_lines()

8. Discuss intriguing insights

Moving forward, we can discuss the sentiments observed in Reddit threads related to the CDC over the past 12 months. The first bar chart reveals that there are 160 negative threads (65% of the total) and 86 positive threads (35% of the total), highlighting the predominance of negative sentiment regarding the CDC during this period. Additionally, the second histogram shows that the sentiment scores of the threads are generally skewed toward zero, with positive threads exhibiting a stronger negative skew. This suggests that the overall sentiment of threads about the CDC is somewhat negative. Lastly, word clouds over positive threads and negative threads were created respectively. Word cloud on positive threads shows action-related words (change, lesson(s)) and family-related words (mom, sister, fiance, dog), which highlights ~. On the other hand, the word cloud for negative threads highlights words related to the chronology of epidemics and the authorities’ counteractions (“outbreak,” “ban,” “study,” “isolation,” “mask,” and “crisis”), which show concerns and frustrations surrounding public health measures, as well as technical terms that describe the origin and symptoms of diseases (“TDAPs,” “fever,” “antigen”).

8.1. Bar Chart: No. of Threads by Binary Sentiment

# Calculate counts and proportions
sentimentr_example_summary <- sentimentr_example %>%
  group_by(sentimentr_binary) %>%
  summarise(count = n(), total = nrow(sentimentr_example), proportion = count / total)

# Bar chart: No. of threads by binary sentiment
bar_plot <- ggplot(sentimentr_example, aes(x = sentimentr_binary, fill = sentimentr_binary)) +
  geom_bar() +
  geom_text(data = sentimentr_example_summary,
            aes(x = sentimentr_binary, 
                y = count + 5,
                label = paste0(count, " (", scales::percent(proportion), ")")),
            size = 4, 
            color = "black",
            fontface = "bold") +
  labs(title = "Bar Chart: No. of Threads by Binary Sentiment",
       x = "Binary Sentiment", y = "No. of threads") +
  scale_fill_manual(values = c("red", "skyblue"), guide = "none") + 
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 10),
    legend.position = "none"
  )

print(bar_plot)

8.2. Histogram: No. of Threads by Sentiment Score

# Histogram: No. of threads by sentiment score
histogram_plot <- ggplot(sentimentr_example, aes(x = sentiment_dict, fill = sentimentr_binary)) +
  geom_histogram(binwidth = 0.03, alpha = 0.5, position = "identity", color = "black") +  
  labs(title = "Histogram: No. of Threads by Sentiment Score",
       x = "Sentiment Score", y = "No. of Threads") +
  scale_fill_manual(values = c("red", "skyblue"), guide = "none") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        axis.title = element_text(size = 10),
        axis.text = element_text(size = 10),
        legend.position = "none")

print(histogram_plot)

8.3. Word Cloud: Positive Threads

# Data Tokenization: Text to Words
reddit_sentiment_clean <- sentimentr_example %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

## Joining with `by = join_by(word)`

# Create separate dataset for threads with positive and negative sentiment
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
  filter(sentiment_dict >= 0)

reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
  filter(sentiment_dict < 0)

# Remove common words between positive and negative sentiment (to get unique words)
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
  anti_join(reddit_sentiment_clean_negative, by = 'word')

reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
  anti_join(reddit_sentiment_clean_positive, by = 'word')

# Word cloud: positive threads
reddit_sentiment_clean_positive_unique %>% 
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

8.4. Word Cloud: Negative Threads

reddit_sentiment_clean_negative_unique %>% 
  filter(word != "CDC") %>% # filter the word "CDC"
  filter(word != "cdc") %>% # filter the word "cdc"  
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)