###1. Describe in one sentence what you aim to examine using user-generated text data and sentiment analysis.

I aim to examine the attitude of redditors in r/poppunkers toward the band Blink-182.

###2. Search Reddit threads using a keyword of your choice. Specifying a subreddit for your search is optional. It is okay to combine data obtained by searching the keyword across multiple subreddits. You can choose any period, but ensure you gather a sufficient amount of data so that you can get meaningful results.

packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "textdata", "ggdark", "syuzhet", "sentimentr", "lubridate", "sf", "tmap", "here", "devtools")
invisible(lapply(packages, library, character.only = TRUE))
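If any of these packages are missing, they can be installed first (a minimal sketch, assuming all of them are available on CRAN):

# install any listed packages that are not yet available
to_install <- setdiff(packages, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install)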
library(wordcloud2)
threads <- find_thread_urls(keywords = "blink", 
                            subreddit = "poppunkers",
                            sort_by = 'relevance', 
                            period = 'all')
## parsing URLs on page 1...
## parsing URLs on page 2...
## parsing URLs on page 3...
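As a quick sanity check on what find_thread_urls() returned (a sketch; the exact counts will vary with the retrieval date):

# inspect the structure, size, and date range of the retrieved threads
dplyr::glimpse(threads)
nrow(threads)
range(threads$date_utc, na.rm = TRUE)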

###3. Clean your text data and then tokenize it.

#Convert the date column and drop rows where it is NA
threads %<>% 
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date))

threads %>% 
  filter(year(date) == 2023) %>% 
  ggplot(aes(x = date)) +
  geom_histogram(color="black", position = 'stack', binwidth = 604800) + # binwidth of 604,800 seconds = 1 week
  scale_x_datetime(date_labels = "%b %y",
                   breaks = seq(min(threads$date, na.rm = T), 
                                max(threads$date, na.rm = T), 
                                by = "1 month")) +
  theme_minimal()

write.csv(threads, file="blink_threads.csv")
threads <- read_csv("blink_threads.csv")
## New names:
## • `` -> `...1`
## Rows: 233 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ...1, title, text, subreddit, url
## dbl  (2): timestamp, comments
## date (2): date_utc, date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
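The write.csv()/read_csv() round trip adds an unnamed row-index column (`...1`); writing without row names avoids it (a sketch):

# writing without row names keeps the re-imported columns identical to the original
write.csv(threads, file = "blink_threads.csv", row.names = FALSE)
threads <- read_csv("blink_threads.csv", show_col_types = FALSE)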

#Before Cleaning Stopwords

words <- threads %>% 
  unnest_tokens(output = word, input = text, token = "words")

words %>%
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")
## Selecting by n

#After Cleaning Stopwords

# Regex that matches URL-type string
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removal of the stop words. There should be fewer words now
print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 6424, After: 2148
words_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")

###4. Generate a word cloud that illustrates the frequency of words except your keyword.

n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.8, 1) # vivid
v <- runif(n, 0.7, 0.9) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("#aaaaaa", 1000))
words_clean %>% 
  filter(!word %in% "blink") %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2(
             color=pal, 
             backgroundColor="black",
             fontFamily = "Chiller",
             size = 1.6,
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 1)
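wordcloud2 renders an interactive HTML widget; if a static image is needed (e.g., for a PDF knit), the wordcloud package is one alternative (a sketch, and the styling will differ from wordcloud2):

# static base-graphics word cloud of the cleaned tokens, excluding the keyword
library(wordcloud)
word_freq <- words_clean %>% 
  filter(word != "blink") %>% 
  count(word, sort = TRUE)
wordcloud(words = word_freq$word, freq = word_freq$n,
          max.words = 100, random.order = FALSE,
          colors = RColorBrewer::brewer.pal(8, "Dark2"))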

###5. Conduct a tri-gram analysis. Extract tri-grams from your text data. Remove tri-grams containing stop words or non-alphabetic terms. Present the frequency of tri-grams in a table. Discuss any noteworthy tri-grams you come across. If no meaningful tri-grams are found, you may analyze bi-grams as well. However, you still need to show results of the tri-grams.

words_trigram <- threads %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)
words_trigram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()
|paired_words      |   n|
|:-----------------|---:|
|NA                | 135|
|a lot of          |  15|
|all the small     |   4|
|but i m           |   4|
|but i think       |   4|
|enema of the      |   4|
|fall out boy      |   4|
|from blink 182    |   4|
|i can t           |   4|
|of the state      |   4|
|one more time     |   4|
|the small things  |   4|
|would love to     |   4|
|a pop punk        |   3|
|and i m           |   3|
|and tim armstrong |   3|
|bah da di         |   3|
|blink 182 and     |   3|
|blink 182 i       |   3|
|blink 182 is      |   3|
#separate the paired words into three columns
words_trigram_pair <- words_trigram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# drop rows where any of the three words is a stop word
words_trigram_pair_filtered <- words_trigram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>% 
  # keep rows where at least one word contains a letter
  filter(str_detect(word1, "[a-z]") | str_detect(word2, "[a-z]") | str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
library(stringi)
words_trigram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))

# Sort the new tri-gram (n=3) counts:
words_counts <- words_trigram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

head(words_counts, 10) %>% 
  knitr::kable()
|word1            |word2   |word3    |  n|
|:----------------|:-------|:--------|--:|
|bah              |da      |di       |  3|
|box              |car     |racer    |  3|
|1ckrw6l          |uvqlist |plp      |  2|
|da               |bah     |da       |  2|
|da               |di      |dah      |  2|
|dah              |da      |bah      |  2|
|di               |dah     |da       |  2|
|dogs             |eating  |dogs     |  2|
|ezrrorq8aasindex |14pp    |iaqb8aub |  2|
|machine          |gun     |kelly    |  2|

Few meaningful tri-grams are found: the most frequent ones are lyric fragments ("bah da di"), a side project name ("box car racer"), or what look like leftover URL fragments, and the NA row comes from threads whose text is empty or shorter than three words. Since the tri-grams are not very informative, I will also run a bi-gram analysis.

words_bigram <- threads %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 2)
words_bigram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()
|paired_words |   n|
|:------------|---:|
|NA           | 133|
|blink 182    |  34|
|in the       |  23|
|but i        |  22|
|of the       |  22|
|a lot        |  18|
|i m          |  15|
|lot of       |  15|
|on the       |  15|
|and i        |  14|
|pop punk     |  14|
|for the      |  12|
|i know       |  11|
|i think      |  11|
|it s         |  10|
|that i       |  10|
|to the       |  10|
|of their     |   9|
|to be        |   9|
|and they     |   8|
#separate the paired words into two columns
words_bigram_pair <- words_bigram %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

# drop rows where either word is a stop word
words_bigram_pair_filtered <- words_bigram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word) %>% 
  # keep rows where at least one word contains a letter
  filter(str_detect(word1, "[a-z]") | str_detect(word2, "[a-z]"))

# Filter out words that are not encoded in ASCII
library(stringi)
words_bigram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2))

# Sort the new bi-gram counts:
words_counts <- words_bigram_pair_filtered %>%
  count(word1, word2) %>%
  arrange(desc(n))

head(words_counts, 10) %>% 
  knitr::kable()
|word1   |word2   |  n|
|:-------|:-------|--:|
|blink   |182     | 34|
|pop     |punk    | 14|
|sum     |41      |  5|
|dynamic |pricing |  4|
|hl      |en      |  4|
|view    |poll    |  4|
|bah     |da      |  3|
|box     |car     |  3|
|car     |racer   |  3|
|da      |di      |  3|
words_counts %>%
  filter(n >= 3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")

From the network we can see that the most connected node is the band itself (blink 182), along with pop punk (their genre), sum 41 (a similar band), travis barker (their drummer), and so on.

###6. Perform a sentiment analysis on your text data using a dictionary method that accommodates negations. You are welcome to apply a deep learning-based model to enrich your analysis, but employing the dictionary method is imperative.

Import the BERT sentiment results (labels and scores produced separately with a BERT-based classifier and saved to CSV).

# import the data
reddit_sentiment <- read_csv("blink_reddit_bert.csv")
## Rows: 233 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): date_utc, title, text, subreddit, url, date, bert_label
## dbl (3): timestamp, comments, bert_score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# drop NAs
reddit_sentiment %<>% drop_na('bert_label')
# Join thread title and text.
reddit_sentiment %<>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))

# dictionary method
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text)

reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)

# bert_label starts with the star rating (e.g., "5 stars"); extract it as a number
reddit_sentiment %<>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric())
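The sentimentr dictionary method accommodates negations through valence shifters, which is why it is used here instead of a plain lexicon lookup. A quick illustration (exact scores depend on the lexicon version):

# negation flips the polarity of the sentence-level score
sentiment_by("I love this album")        # positive average sentiment expected
sentiment_by("I do not love this album") # the negation pulls the score down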

###7. Display 10 sample texts alongside their sentiment scores and evaluate the credibility of the sentiment analysis outcomes.

bert_example <- reddit_sentiment %>%
  filter(bert_label_numeric %in% c(1,5)) %>% 
  group_by(bert_label) %>%
  arrange(desc(bert_score)) %>%
  slice_head(n = 5) %>%
  ungroup()

# 1 star
bert_example %>% filter(bert_label_numeric == 1) %>% pull(title_text) %>% print()
## [1] "I hate everything Tom Delonge posts that isn’t related to Blink-182. I know this is a super unpopular opinion. It’s like I’m following a 12 year old boy. His most recent post is a picture of half-naked women on a truck with a caption saying “me, Mark and Travis explored Brazil today”. Everything is overly-sexualized and offensive. I don’t like how he objectifies women by making them the punchline of his jokes. \r\n"
## [2] "The ticket prices for blink-182 are fucking ridiculous. Mark is claiming it is \"dynamic pricing\" to \"discourage scalpers\" and that the band \"has no control over it.\" Horseshit on all accounts. All dynamic pricing does is make Ticketmaster the scalpers. And the artist can opt out of dynamic pricing with Ticketmaster before the tickets go on sale. \r\n\r\nShame on you, blink-182, and fuck you Ticketmaster."                       
## [3] "Reckless Abandon by blink-182 should have been a single with a music video. "                                                                                                                                                                                                                                                                                                                                                                        
## [4] "Why do so many people think my own worst enemy is blink but not lit?. "                                                                                                                                                                                                                                                                                                                                                                              
## [5] "Blink -182’s website is down and they took all their posts down on Instagram. Is it really happening!?"

These threads generally express a negative opinion about blink-182; “I hate everything Tom DeLonge …” and “fucking ridiculous” are clear examples of negative sentiment.

bert_example %>% filter(bert_label_numeric == 5) %>% pull(title_text) %>% print()
## [1] "What's Your Favorite New Pop Punk Album 2023 - 2024 so far? Love that we've got new Fall Out Boy, Blink 182, Yellowcard, Green Day, Neck Deep this past year/this month.. "                                                                                                                                                                                                                                                    
## [2] "What is the best blink 182 song of all time and why?. I personally think \"Always\", I don't know why but it just resonates with me and is sonically appealing.\r\n\r\nWhat does everyone else think?"                                                                                                                                                                                                                         
## [3] "To those going to see blink-182 with KennyHoopla as the opener.. Be prepared for awesomeness. In my opinion, Kenny is the best in modern poppunk right now. Absolute energy."                                                                                                                                                                                                                                                  
## [4] "Favorite songs from Blink 182. What are your favorite songs from Blink 182?"                                                                                                                                                                                                                                                                                                                                                   
## [5] "ONE MORE TIME& is actually one of blink’s best albums, lyrically &amp; musically. Don’t get me wrong, I’ll always love EOTS and TOYPAJ, those’ll always be classics. But I’m honestly SO impressed by the new album &amp; can’t wait to see them play it live. I especially love Terrified, Turpentine &amp; See You.\r\n\r\nTravis’ drumming is on point, as always =L<û\r\n\r\nAnyone with me on this?!"

These BERT-labeled examples show clear appreciation for blink-182 and their songs; for example, “be prepared for awesomeness” in the third thread is correctly classified as positive.

Now let’s look at the most extreme examples scored by the dictionary method (sentimentr) as an alternative.

sentimentr_example <- reddit_sentiment %>%
  mutate(sentimentr_abs = abs(sentiment_dict),
         sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>%
  group_by(sentimentr_binary) %>%
  arrange(desc(sentimentr_abs)) %>%
  slice_head(n = 5) %>%
  ungroup() %>%
  arrange(sentiment_dict)

# negative
sentimentr_example %>% filter(sentimentr_binary == 'negative') %>% pull(title_text) %>% print()
## [1] "blink-182 - TERRIFIED. "                                                     
## [2] "Reckless Abandon by blink-182 should have been a single with a music video. "
## [3] "Blink-182 - I Miss You. "                                                    
## [4] "Why do people get so upset when I confuse green charlotte with blink 41?. "  
## [5] "Why do so many people think my own worst enemy is blink but not lit?. "
# positive
sentimentr_example %>% filter(sentimentr_binary == 'positive') %>% pull(title_text) %>% print()
## [1] "New Blink-182 music live for 182 minutes. "                                                                                                                                                  
## [2] "Heavily influenced by blink-182, especially the self titled album (personal all time favorite album) any and all feedback appreciated!. "                                                    
## [3] "Who else thinks New Found Glory should've been as big as blink back in their prime?. "                                                                                                       
## [4] "Blink-182 - A New Hope. "                                                                                                                                                                    
## [5] "State Champs, Four Year Strong, Drug Church, Microwave, Spanish Love Songs, Red City Radio + more covering Blink-182, Green Day, MCR, Sublime + more in a new compilation out August 29th!. "

The dictionary results are not as good as the BERT results, which is reasonable since BERT is a context-aware deep learning model. Several of the “negative” titles are just song names with no real sentiment (such as “Blink-182 - I Miss You”), but the dictionary method mistakenly treats words in the song title as the redditor’s own sentiment.
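One way to quantify how the two methods agree is a cross-tabulation of the BERT stars against the dictionary polarity (a sketch; the star cut-offs for “positive” and “negative” are my own assumption):

# cross-tabulate BERT star ratings against dictionary polarity
reddit_sentiment %>%
  mutate(dict_binary = if_else(sentiment_dict > 0, "positive", "negative"),
         bert_binary = case_when(bert_label_numeric >= 4 ~ "positive",   # assumed cut-off
                                 bert_label_numeric <= 2 ~ "negative",   # assumed cut-off
                                 TRUE ~ "neutral")) %>%
  count(bert_binary, dict_binary) %>%
  knitr::kable()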

###8. Discuss intriguing insights derived from the sentiment analysis, supporting your observations with at least two plots.

  1. Sentiment by year
reddit_sentiment %<>%
  mutate(sentimentr_abs = abs(sentiment_dict),
         sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>% 
  group_by(sentimentr_binary) %>%
  arrange(desc(sentimentr_abs)) %>%
  ungroup() %>% 
  arrange(sentiment_dict) %>%  
  mutate(date = as.POSIXct(date_utc, format = "%m/%d/%Y")) %>%
  filter(!is.na(date)) %>% 
  mutate(year = year(date),
         day_of_week = wday(date, label = TRUE),
         is_weekend = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday"),
         # extract the hour of day from the Unix timestamp
         time = timestamp %>% 
           anytime(tz = anytime:::getTZ()) %>% 
           str_split('-| |:') %>% 
           sapply(function(x) as.numeric(x[4])))
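The hour-of-day extraction above could be written more simply with lubridate::hour(), assuming timestamp is a Unix epoch in seconds (an equivalent sketch):

# compare the string-splitting approach with lubridate's hour()
reddit_sentiment %>%
  mutate(time_check = hour(anytime(timestamp))) %>%
  select(timestamp, time, time_check) %>%
  head()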

Displaying the BERT sentiment labels by year:

reddit_sentiment %>%
  ggplot(aes(x = year, fill = bert_label)) +
  geom_bar(position = 'stack') +
  scale_x_continuous(breaks = seq(min(reddit_sentiment$year),
                                  max(reddit_sentiment$year),
                                  by = 1)) +
  scale_fill_brewer(palette = 'PuRd', direction = -1) +
  dark_theme_grey()

We can also display the binary sentiment from the dictionary method by year.

reddit_sentiment %>%
  ggplot(aes(x = year, fill = sentimentr_binary)) +
  geom_bar(position = 'stack') +
  scale_x_continuous(breaks = seq(min(reddit_sentiment$year),
                                  max(reddit_sentiment$year),
                                  by = 1)) +
  scale_fill_brewer(palette = 'Blues', direction = -1) +
  dark_theme_grey()

Compared to the BERT labels, the binary dictionary method classifies more threads as negative. Overall, across the years, redditors in r/poppunkers show a more positive than negative attitude toward blink-182, which makes sense given that blink-182 is one of the best-known pop punk bands.
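A complementary view is the average dictionary score per year, rather than the binary counts (a sketch):

# mean sentimentr score per year
reddit_sentiment %>%
  group_by(year) %>%
  summarise(mean_sentiment = mean(sentiment_dict, na.rm = TRUE)) %>%
  ggplot(aes(x = year, y = mean_sentiment)) +
  geom_col() +
  dark_theme_grey()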

  2. Negative vs. positive word clouds
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

reddit_sentiment_clean <- reddit_sentiment %>%
  mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
  # tokenize
  unnest_tokens(word, title_text, token = "words") %>%
  # remove stop words
  anti_join(stop_words, by = "word") %>%
  filter(str_detect(word, "[a-z]")) %>%
  filter(!word %in% c('blink')) 
# negative text
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(1,2))
# positive text
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
  filter(bert_label_numeric %in% c(4,5))

# Remove words that are commonly seen in both negative and positive threads
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
  anti_join(reddit_sentiment_clean_positive, by = 'word')
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
  anti_join(reddit_sentiment_clean_negative, by = 'word')
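Before plotting the clouds, a quick tabular look at the most distinctive words in each group can help with interpretation (a sketch):

# top 10 words unique to each sentiment group
bind_rows(
  reddit_sentiment_clean_negative_unique %>% count(word, sort = TRUE) %>% 
    slice_head(n = 10) %>% mutate(sentiment = "negative"),
  reddit_sentiment_clean_positive_unique %>% count(word, sort = TRUE) %>% 
    slice_head(n = 10) %>% mutate(sentiment = "positive")
) %>% 
  knitr::kable()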

Negative wordcloud:

n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.8, 1) # vivid
v <- runif(n, 0.7, 0.9) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("#aaaaaa", 10000))
reddit_sentiment_clean_negative_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       backgroundColor = "black",
       minRotation = -pi/6,
       maxRotation = -pi/6,
       rotateRatio = 1)

Positive wordcloud:

reddit_sentiment_clean_positive_unique %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
             backgroundColor = "black",
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)
  3. Sentiment and number of comments
reddit_sentiment %>%
  ggplot(aes(x = as.factor(bert_label_numeric), y = comments)) +
  geom_boxplot() +
  dark_theme_grey()

There is no clear relationship between the BERT sentiment label and the number of comments a thread receives.
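To check this more formally, a rank correlation between the BERT star rating and the comment count could be computed (a sketch):

# Spearman rank correlation between sentiment stars and comment counts
cor.test(reddit_sentiment$bert_label_numeric, reddit_sentiment$comments,
         method = "spearman", exact = FALSE)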