Step 0. Packages

#Package names
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "here", "syuzhet", "sentimentr", "ggthemes")

#Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}

invisible(lapply(packages, library, character.only = TRUE))

#Explicitly attach the most-used packages (already loaded by the lapply() call above; kept for readability)
library(syuzhet)
library(sentimentr)
library(dplyr)
library(tidyverse)
library(magrittr)
library(ggthemes)
library(ggplot2)

Step 1. Focus of Study for Sentiment Analysis

The objective of this study is to examine how sentiment toward Labubu has changed over the past six months, based on discussion threads from Reddit.

Step 2. Search Reddit Threads using “Labubu” Keyword &
Step 3. Clean the Data and Tokenize It

Searching by Keyword and by Subreddit

threads <- find_thread_urls(keywords = 'Labubu', 
                            sort_by = 'relevance', 
                            period = 'all') %>% 
  drop_na()

rownames(threads) <- NULL

#Sanitize text
threads %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
      str_replace_all("\\|", "/") %>%   # replace vertical bars
      str_replace_all("\\n", " ") %>%   # replace newlines
      str_squish()                      # clean up extra spaces
  ))

#search for subreddits
subreddit_list <- RedditExtractoR::find_subreddits('Labubu')
subreddit_list %>% 
  arrange(desc(subscribers)) %>% 
  .[1:25,c('subreddit','title','subscribers')]

#check how many threads were found for that keyword within each subreddit
threads$subreddit %>% table() %>% sort(decreasing = T) %>% head(20)

threads <- find_thread_urls(subreddit = 'labubu', 
                            sort_by = 'top', 
                            period = 'year') %>% 
  drop_na()

rownames(threads) <- NULL

#Sanitize text
threads %<>% 
  mutate(across(
    where(is.character),
    ~ .x %>%
      str_replace_all("\\|", "/") %>% 
      str_replace_all("\\n", " ") %>%
      str_squish()
  ))

head(threads, 3) %>% knitr::kable()

Analyzing Posting Date and Time

#create new column: date
threads_1 <- threads
threads_1 %<>% 
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date))

#create new columns: day_of_week, is_weekend
threads_1 %<>%  
  mutate(day_of_week = wday(date, label = TRUE)) %>% 
  mutate(is_weekend = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday"))

#extract the hour of day from the timestamp column
print(threads_1$timestamp[1])
print(threads_1$timestamp[1] %>% anytime(tz = anytime::getTZ()))

threads_1 %<>%  
  mutate(time = timestamp %>% 
           anytime(tz = anytime::getTZ()) %>% 
           str_split('-| |:') %>% 
           sapply(function(x) as.numeric(x[4]))) #the 4th piece of the split datetime string is the hour

#save threads_1 as csv
write.csv(threads_1, "reddit_threads.csv", row.names = FALSE)
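
Since Step 1 frames the study as covering the past six months, the saved threads can optionally be restricted to that window before the later steps. This is a minimal sketch only; the rest of this write-up keeps the full year of threads returned above, and the 180-day cutoff is a rough stand-in for six months.

#Optional: keep only threads from roughly the last six months
threads_recent <- threads_1 %>%
  filter(as.Date(date) >= Sys.Date() - 180)

nrow(threads_recent) #how many threads fall inside the window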

Tokenization and Removal of Stop Words

#load list of stop words - from the tidytext package
data("stop_words")
#view 100 random stop words
print(stop_words$word[sample(1:nrow(stop_words), 100)])
##   [1] "weren't"       "what"          "said"          "indicate"     
##   [5] "everywhere"    "everybody"     "mostly"        "if"           
##   [9] "things"        "whereas"       "everything"    "really"       
##  [13] "shows"         "did"           "be"            "sup"          
##  [17] "see"           "kind"          "wants"         "latter"       
##  [21] "part"          "whenever"      "neither"       "where"        
##  [25] "does"          "toward"        "it's"          "furthered"    
##  [29] "aside"         "must"          "next"          "know"         
##  [33] "please"        "what's"        "isn't"         "h"            
##  [37] "anybody"       "goes"          "everything"    "in"           
##  [41] "out"           "ourselves"     "under"         "among"        
##  [45] "since"         "after"         "known"         "here's"       
##  [49] "pointing"      "s"             "during"        "out"          
##  [53] "with"          "on"            "whatever"      "such"         
##  [57] "appreciate"    "got"           "opened"        "two"          
##  [61] "whether"       "unfortunately" "least"         "do"           
##  [65] "themselves"    "up"            "thence"        "would"        
##  [69] "asks"          "we'll"         "following"     "looks"        
##  [73] "towards"       "theres"        "even"          "were"         
##  [77] "eg"            "you'll"        "haven't"       "nevertheless" 
##  [81] "doing"         "again"         "herself"       "go"           
##  [85] "comes"         "any"           "shall"         "corresponding"
##  [89] "much"          "wants"         "this"          "we've"        
##  [93] "can"           "does"          "important"     "interested"   
##  [97] "entirely"      "overall"       "us"            "fully"
#Regex that matches URL-type string
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

threads_1 <- read.csv("./reddit_threads.csv")

words_clean <- threads_1 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens containing at least one letter (drops pure numbers/symbols)
  filter(str_detect(word, "[a-z]")) %>%
  # drop the search keyword itself
  filter(!word %in% c("Labubu", "labubu", "labubus"))

Step 4. Generate a Word Cloud

n <- 20 # number of words with color
h <- runif(n, 0, 1) # any hue
s <- runif(n, 0.6, 1) # vivid saturation
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000)) # remaining (less frequent) words in grey

words_clean %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

Brief Discussion on Word Cloud Result

    The largest word in the word cloud is “love”, indicating the community’s overall affection for Labubu. Other prominent words, including “secret”, “boxes”, “cute”, “luck”, and “buy”, point to the popularity of purchasing Labubus and the blind-box craze surrounding them.
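
To back the word-cloud reading with exact counts, the most frequent tokens behind the cloud can be printed directly (a small sketch using the words_clean data frame built in Step 3):

#Top 10 most frequent tokens behind the word cloud
words_clean %>% 
  count(word, sort = TRUE) %>% 
  head(10) %>% 
  knitr::kable()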

Step 5. Conduct a Tri-gram Analysis

#Get ngrams
words_ngram <- threads_1 %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = trigram,
                input = text,
                token = "ngrams",
                n = 3) #tri-gram

#separate each trigram into three word columns
words_ngram_tri <- words_ngram %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")

#filter out rows where word1, word2, or word3 is a stop word
words_ngram_tri_filtered <- words_ngram_tri %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & 
           !word2 %in% stop_words$word & 
           !word3 %in% stop_words$word) %>% 
  # keep only tokens containing at least one letter in each position
  filter(str_detect(word1, "[a-z]") &
           str_detect(word2, "[a-z]") &
           str_detect(word3, "[a-z]")) %>%
  drop_na(word1, word2, word3) #drop na values

#filter out words that are not encoded in ASCII
library(stringi)
words_ngram_tri_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))

# Sort the new tri-gram (n=3) counts
words_counts <- words_ngram_tri_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

## Table of Frequency of Tri-grams
head(words_counts, 20) %>% 
  knitr::kable()
word1        word2       word3            n
tie          dye         pattern          2
2nd          pic         xd               1
3rd          blind       box              1
90s          slap        bracelets        1
added        actual      photos           1
apologized   swapped     insurance        1
artist       venessa     stockard         1
atrocious    hellscape   half             1
avoid        religious   subjects         1
biggest      belly       labubus          1
biggest      pokemon     card             1
black        sparkly     eyes             1
blind        box         set              1
brow         tag         appears          1
chase        figure      stay             1
cheetah      print       customization    1
chestnut     cocoa       mini             1
chestnut     cocoa       pulled           1
chip         bag         pillow           1
classic      hot         rods             1

Brief Discussion on Any Noteworthy Tri-grams

    The tri-gram “tie-dye-pattern” has the highest frequency, indicating the popularity of the tie-dye variation of the Labubu collection. Tri-grams such as “3rd-blind-box”, “blind-box-set”, and “chestnut-cocoa-pulled” reflect the blind-box and collectibles culture popularized by Labubu. Tri-grams such as “biggest-belly-labubus”, “black-sparkly-eyes”, and “cheetah-print-customization” may reflect demand for customization that makes individual Labubus stand out.
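
As a rough check on the blind-box theme noted above, the tri-gram counts can be filtered for any tri-gram that contains “blind” or “box” (a sketch using the words_counts table from this step):

#Tri-grams touching the blind-box theme
words_counts %>%
  filter(if_any(c(word1, word2, word3), ~ .x %in% c("blind", "box"))) %>%
  head(10)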

Visualize Tri-gram Words Occurring in Groups

trigram_edges <- words_counts %>%
  select(word1, word2, word3, n)

# reshape into two edges per trigram
edges <- trigram_edges %>%
  mutate(prev = word2) %>% 
  tidyr::pivot_longer(cols = c(word2, word3),
                      names_to = "pos",
                      values_to = "to") %>%
  mutate(from = ifelse(pos == "word2", word1, prev)) %>%
  select(from, to, n) %>%
  slice(1:10) #keep only the first 10 edges (the top 5 trigrams, two edges each)

graph <- graph_from_data_frame(edges)

ggraph(graph, layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Trigram Word Network",
       x = "", y = "")

Step 6. Perform Sentiment Analysis - AFINN Dictionary Model

allthreads <- read.csv("./reddit_threads.csv") %>%
  filter(!is.na(text)) %>%        # remove NA
  filter(str_trim(text) != "")    # remove empty strings

#extract text column from df
allthreads_text <- allthreads$text

#perform sentiment analysis using the AFINN method from the syuzhet package
allthreads$afinn <- get_sentiment(allthreads_text, method = "afinn")

Step 7. 10 Sample Texts and their Sentiment Scores

allthreads %>%
  select(text, afinn) %>%
  rename(Text = text,
         SentimentScore = afinn) %>%
  slice(1:10) %>%
  knitr::kable()

Brief Discussion on Outcome Credibility Evaluation of the Sentiment Analysis

    The first sentence describes a mom’s continuing purchases of Labubus just to get the secret version because her son loves Labubu. The sentiment analysis gives the sentence a score of 6, which is fairly accurate: the text does not express strong excitement, but it does convey affection. The second sentence simply states that the poster named their Labubu Belle; it carries no sentiment, so the score of 0 is credible. The third sentence is also scored reasonably at 13, capturing the excitement of completing two full sets of Labubus and the joy of feeling lucky. The fourth sentence shows mild annoyance at being banned from a subreddit over Labubu dupes and calls the mod a “weirdo”; the score of -2 is reasonable, though arguably it should be slightly more negative. One noticeable over-scoring occurs in the ninth sentence, which only mentions buying a Labubu for fun; the analysis assigns it a score of 4 even though the tone is fairly indifferent. This is likely because the sentence contains the word “fun”, which AFINN treats as a positive word.
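
Because AFINN scores a sentence by summing the scores of its individual words, an over-scored sentence can usually be traced back to one high-scoring word. A quick lexicon lookup illustrates this (a sketch using tidytext and textdata, both loaded in Step 0; syuzhet bundles its own copy of AFINN, so this is only an approximate cross-check, and the first call may prompt a one-time lexicon download):

#Look up word-level AFINN scores for a few words from the sample texts
afinn_lexicon <- tidytext::get_sentiments("afinn")
afinn_lexicon %>% filter(word %in% c("fun", "love", "luck"))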

Step 8. Intriguing Insights & Plots

8-1. Average Daily Sentiment Score over Time

sent_daily <- allthreads %>%
  mutate(date = as.Date(date)) %>%
  group_by(date) %>%
  summarize(mean_afinn = mean(afinn, na.rm = TRUE)) %>%
  arrange(date)

#find local maxima above mean sentiment score 17
local_max <- sent_daily %>%
  mutate(
    prev_val = lag(mean_afinn),
    next_val = lead(mean_afinn),
    is_local_max = mean_afinn > prev_val & mean_afinn > next_val
  ) %>%
  filter(is_local_max, mean_afinn > 17)

#find local minima below mean sentiment score -5
local_min <- sent_daily %>%
  mutate(
    prev_val = lag(mean_afinn),
    next_val = lead(mean_afinn),
    is_local_min = mean_afinn < prev_val & mean_afinn < next_val
  ) %>%
  filter(is_local_min, mean_afinn < -5)

#plot on ggplot
ggplot(sent_daily, aes(x = date, y = mean_afinn)) +
  geom_line(linewidth = 0.5, alpha = 0.9) +
  
  #local maxima
  geom_point( #highlight point of local maxima above mean sentiment score 17
    data = local_max,
    aes(x = date, y = mean_afinn),
    color = "#4a8548",
    size = 3
  ) +
  geom_text( #add labels of the date of the local maxima above mean sentiment score 17
    data = local_max,
    aes(
      label = format(date, "%Y-%m-%d"),
      x = date,
      y = mean_afinn
    ),
    vjust = -1,
    size = 4,
    color = "#4a8548",
    fontface = "bold"
  ) + 
  
  #local minima
  geom_point( #highlight point of local minima below mean sentiment score -5
    data = local_min,
    aes(x = date, y = mean_afinn),
    color = "#d15e56",
    size = 3
  ) +
  geom_text( #add labels of the date of the local minima below mean sentiment score -5
    data = local_min,
    aes(label = format(date, "%Y-%m-%d")),
    vjust = 1.9,
    size = 4,
    color = "#d15e56",
    fontface = "bold"
  ) +
  
  scale_x_date(
    date_breaks = "1 month",
    date_labels = "%Y-%m"
  ) +
  labs(
    title = "Average Daily Sentiment Score over Time",
    x = "Date",
    y = "Mean Sentiment Score"
  )

Brief Discussion on Intriguing Insights from Plots

    The line graph of daily average sentiment scores for Labubu shows no prominent upward or downward trend over time. One interesting observation is the larger swings between highly positive and highly negative sentiment during May 2025 and again across October to November 2025, suggesting that some event, such as a new Labubu collection release, may have prompted more extreme reactions. Another intriguing observation is that, even without a clear trend, the mean sentiment score is above 0 on most days, meaning sentiment toward Labubu is positive most of the time.
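
To quantify the fluctuation described above rather than judging it by eye, one option is to compute the standard deviation of the daily mean scores within each month (a sketch using the sent_daily table from this step; floor_date() is assumed to be available through lubridate, which ships with the tidyverse):

#Monthly volatility of the daily mean sentiment score
sent_daily %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarize(sd_afinn = sd(mean_afinn, na.rm = TRUE), n_days = n()) %>%
  arrange(desc(sd_afinn)) %>%
  head(5)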

8-2. Average Sentiment Score by Day of Week

weekdays <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

allthreads$day_of_week <- factor(allthreads$day_of_week, 
                                 levels = weekdays)

#Compute mean sentiment by weekday
sent_dow <- allthreads %>%
  group_by(day_of_week) %>%
  summarize(mean_afinn = mean(afinn, na.rm = TRUE))

#Plot
ggplot(sent_dow, aes(x = day_of_week, y = mean_afinn)) +
  geom_col(fill = "#80b1d3") +
  labs(
    title = "Average Sentiment Score by Day of Week",
    x = "Day of Week",
    y = "Mean Sentiment"
  ) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

Brief Discussion on Intriguing Insights from Plots

    The bar chart of mean sentiment score by day of week shows the highest scores on Friday and the lowest on Tuesday: the mean drops to its lowest point on Tuesday and gradually increases through the rest of the week, peaking on Friday. The mean sentiment scores for all days of the week are above 1, reflecting the popularity of, and overall positive sentiment toward, Labubu.
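
The ranking behind this reading can be printed directly from the sent_dow table computed above (a small sketch):

#Mean sentiment by day of week, highest first
sent_dow %>% 
  arrange(desc(mean_afinn)) %>% 
  knitr::kable()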

8-3. Proportion of Sentiment Categories by Day of Week

#Classify sentiment to positive (>0), neutral (=0), and negative (<0)
sent_prop <- allthreads %>%
  mutate(
    sentiment_type = case_when(
      afinn > 0 ~ "Positive",
      afinn < 0 ~ "Negative",
      TRUE      ~ "Neutral"
    )
  ) %>%
  group_by(day_of_week, sentiment_type) %>%
  summarize(n = n(), .groups = "drop") %>%
  group_by(day_of_week) %>%
  mutate(prop = n / sum(n))

#Plot
ggplot(sent_prop, aes(x = day_of_week, y = prop, fill = sentiment_type)) +
  geom_col(position = "fill") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2)) +
  scale_fill_manual(values = c(
    "Positive" = "#a9c48d",
    "Neutral"  = "#ede1c0",
    "Negative" = "#DCA1A1"
  )) +
  labs(
    title = "Proportion of Sentiment Categories by Day of Week",
    x = "Day of Week",
    y = "Proportion of Threads",
    fill = "Sentiment"
  ) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

Brief Discussion on Intriguing Insights from Plots

    From this proportional stacked bar chart, we can see that Friday again has the highest proportion of threads with positive sentiment (and the lowest proportion with negative sentiment), coinciding with the conclusion from the previous bar chart that Friday has the highest mean sentiment score of any day of the week. Monday and Sunday are the two days with the highest proportion of negative threads. One intriguing insight that expands on the previous chart concerns Tuesday: although Tuesday has the lowest mean sentiment score, this chart shows that the cause is not a larger share of threads with negative sentiment. Instead, the low mean on Tuesday is driven by a large proportion of neutral threads (score 0), which drags down the average.
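
One way to check this “neutral drag” explanation is to compare, for each weekday, the share of score-0 threads against the mean score computed only over non-neutral threads (a sketch using allthreads and the afinn column from Step 6):

#Share of neutral threads vs. mean sentiment among non-neutral threads, by weekday
allthreads %>%
  group_by(day_of_week) %>%
  summarize(
    prop_neutral = mean(afinn == 0),
    mean_nonneutral = mean(afinn[afinn != 0], na.rm = TRUE)
  ) %>%
  arrange(desc(prop_neutral))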