#Package names
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "here", "syuzhet", "sentimentr", "ggthemes")
#Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
invisible(lapply(packages, library, character.only = TRUE))
library(syuzhet)
library(sentimentr)
library(dplyr)
library(tidyverse)
library(magrittr)
library(ggthemes)
library(ggplot2)
The objective of this study is to examine how sentiment towards Labubus has changed over the past six months.
threads <- find_thread_urls(keywords = 'Labubu',
sort_by = 'relevance',
period = 'all') %>%
drop_na()
rownames(threads) <- NULL
#Sanitize text
threads %<>%
mutate(across(
where(is.character),
~ .x %>%
str_replace_all("\\|", "/") %>% # replace vertical bars
str_replace_all("\\n", " ") %>% # replace newlines
str_squish() # clean up extra spaces
))
#search for subreddits
subreddit_list <- RedditExtractoR::find_subreddits('Labubu')
subreddit_list %>%
arrange(desc(subscribers)) %>%
.[1:25,c('subreddit','title','subscribers')]
#check how many threads were found for that keyword within each subreddit
threads$subreddit %>% table() %>% sort(decreasing = T) %>% head(20)
threads <- find_thread_urls(subreddit = 'labubu',
sort_by = 'top',
period = 'year') %>%
drop_na()
rownames(threads) <- NULL
#Sanitize text
threads %<>%
mutate(across(
where(is.character),
~ .x %>%
str_replace_all("\\|", "/") %>%
str_replace_all("\\n", " ") %>%
str_squish()
))
head(threads, 3) %>% knitr::kable()
#create new column: date
threads_1 <- threads
threads_1 %<>%
mutate(date = as.POSIXct(date_utc)) %>%
filter(!is.na(date))
#create new columns: day_of_week, is_weekend
threads_1 %<>%
mutate(day_of_week = wday(date, label = TRUE)) %>%
mutate(is_weekend = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday"))
#extract the time of day from the timestamp column
print(threads_1$timestamp[1])
print(threads_1$timestamp[1] %>% anytime(tz = anytime:::getTZ()))
threads_1 %<>%
mutate(time = timestamp %>%
anytime(tz = anytime:::getTZ()) %>%
str_split('-| |:') %>%
sapply(function(x) as.numeric(x[4])))
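As a quick sanity check (a sketch, not part of the original pipeline), the same hour can be recovered with lubridate's hour(), which is attached with recent versions of the tidyverse; time_alt is just an illustrative column name.
#sketch: cross-check the string-split hour against lubridate::hour()
threads_1 %>%
  mutate(time_alt = hour(anytime(timestamp, tz = anytime:::getTZ()))) %>%
  select(timestamp, time, time_alt) %>%
  head(3)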
#save threads_1 as csv
write.csv(threads_1, "reddit_threads.csv", row.names = FALSE)
#load list of stop words - from the tidytext package
data("stop_words")
#view a random sample of 100 stop words
print(stop_words$word[sample(1:nrow(stop_words), 100)])
## [1] "weren't" "what" "said" "indicate"
## [5] "everywhere" "everybody" "mostly" "if"
## [9] "things" "whereas" "everything" "really"
## [13] "shows" "did" "be" "sup"
## [17] "see" "kind" "wants" "latter"
## [21] "part" "whenever" "neither" "where"
## [25] "does" "toward" "it's" "furthered"
## [29] "aside" "must" "next" "know"
## [33] "please" "what's" "isn't" "h"
## [37] "anybody" "goes" "everything" "in"
## [41] "out" "ourselves" "under" "among"
## [45] "since" "after" "known" "here's"
## [49] "pointing" "s" "during" "out"
## [53] "with" "on" "whatever" "such"
## [57] "appreciate" "got" "opened" "two"
## [61] "whether" "unfortunately" "least" "do"
## [65] "themselves" "up" "thence" "would"
## [69] "asks" "we'll" "following" "looks"
## [73] "towards" "theres" "even" "were"
## [77] "eg" "you'll" "haven't" "nevertheless"
## [81] "doing" "again" "herself" "go"
## [85] "comes" "any" "shall" "corresponding"
## [89] "much" "wants" "this" "we've"
## [93] "can" "does" "important" "interested"
## [97] "entirely" "overall" "us" "fully"
#Regex that matches URLs and leftover HTML entities (&, <, >)
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
threads_1 <- read.csv("./reddit_threads.csv")
words_clean <- threads_1 %>%
# drop URLs
mutate(text = str_replace_all(text, replace_reg, "")) %>%
# Tokenization (word tokens)
unnest_tokens(word, text, token = "words") %>%
# drop stop words
anti_join(stop_words, by = "word") %>%
# drop tokens that contain no letters (e.g., numbers, punctuation)
filter(str_detect(word, "[a-z]")) %>%
filter(!word %in% c("Labubu", "labubu", "labubus"))
n <- 20 # number of words with color
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))
words_clean %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
The largest word in the word cloud is “love”, suggesting that affection for Labubu dominates the discussion. Other prominent words, such as secret, boxes, cute, luck, and buy, point to the popularity of purchasing Labubus and the blind-box craze.
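As a quick numeric check on what drives the cloud (a small sketch using the same words_clean table, nothing new), the top counts can also be printed directly:
#top terms behind the word cloud, as plain counts
words_clean %>%
  count(word, sort = TRUE) %>%
  head(10)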
#Get ngrams
words_ngram <- threads_1 %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
select(text) %>%
unnest_tokens(output = trigram,
input = text,
token = "ngrams",
n = 3) #tri-gram
#separate the trigram into three word columns
words_ngram_tri <- words_ngram %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ")
#drop rows where any of word1, word2, or word3 is a stop word
words_ngram_tri_filtered <- words_ngram_tri %>%
# drop stop words
filter(!word1 %in% stop_words$word &
!word2 %in% stop_words$word &
!word3 %in% stop_words$word) %>%
# drop tokens that contain no letters
filter(str_detect(word1, "[a-z]") &
str_detect(word2, "[a-z]") &
str_detect(word3, "[a-z]")) %>%
drop_na(word1, word2, word3) #drop na values
#filter out words that are not encoded in ASCII
library(stringi)
words_ngram_tri_filtered %<>%
filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))
# Sort the new tri-gram (n=3) counts
words_counts <- words_ngram_tri_filtered %>%
count(word1, word2, word3) %>%
arrange(desc(n))
## Table of Frequency of Tri-grams
head(words_counts, 20) %>%
knitr::kable()
| word1 | word2 | word3 | n |
|---|---|---|---|
| tie | dye | pattern | 2 |
| 2nd | pic | xd | 1 |
| 3rd | blind | box | 1 |
| 90s | slap | bracelets | 1 |
| added | actual | photos | 1 |
| apologized | swapped | insurance | 1 |
| artist | venessa | stockard | 1 |
| atrocious | hellscape | half | 1 |
| avoid | religious | subjects | 1 |
| biggest | belly | labubus | 1 |
| biggest | pokemon | card | 1 |
| black | sparkly | eyes | 1 |
| blind | box | set | 1 |
| brow | tag | appears | 1 |
| chase | figure | stay | 1 |
| cheetah | print | customization | 1 |
| chestnut | cocoa | mini | 1 |
| chestnut | cocoa | pulled | 1 |
| chip | bag | pillow | 1 |
| classic | hot | rods | 1 |
The tri-gram “tie-dye-pattern” has the highest frequency, indicating the popularity of the tie-dye variant of the Labubu collection. Tri-grams such as “3rd-blind-box”, “blind-box-set”, and “chestnut-cocoa-pulled” reflect the blind-box and collectibles culture popularized by Labubu, while tri-grams such as “biggest-belly-labubus”, “black-sparkly-eyes”, and “cheetah-print-customization” may reflect demand for customizing Labubus to make them stand out.
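As a quick check on the blind-box reading (a sketch over the words_counts table built above), all tri-grams containing the word "blind" can be pulled out directly:
#sketch: tri-grams that mention "blind", to back up the blind-box observation
words_counts %>%
  filter(word1 == "blind" | word2 == "blind" | word3 == "blind")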
trigram_edges <- words_counts %>%
select(word1, word2, word3, n)
# reshape into two edges per trigram
edges <- trigram_edges %>%
mutate(prev = word2) %>%
tidyr::pivot_longer(cols = c(word2, word3),
names_to = "pos",
values_to = "to") %>%
mutate(from = ifelse(pos == "word2", word1, prev)) %>%
select(from, to, n) %>%
slice(1:10) #just show top 10 rows
graph <- graph_from_data_frame(edges)
ggraph(graph, layout = "fr") +
geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
geom_node_point(color = "darkslategray4", size = 3) +
geom_node_text(aes(label = name), vjust = 1.8) +
labs(title = "Trigram Word Network",
x = "", y = "")
allthreads <- read.csv("./reddit_threads.csv") %>%
filter(!is.na(text)) %>% # remove NA
filter(str_trim(text) != "") # remove empty strings
#extract text column from df
allthreads_text <- allthreads$text
#perform sentiment analysis using the AFINN method from the syuzhet package
allthreads$afinn <- get_sentiment(allthreads_text, method = "afinn")
allthreads %>%
select(text, afinn) %>%
rename(Text = text,
SentimentScore = afinn) %>%
slice(1:10) %>%
knitr::kable()
The first sentence describes a mom who keeps buying Labubus just to get the secret version because her son loves them. The sentiment analysis gives it a score of 6, which is fairly accurate: the text does not express strong excitement about Labubus, but it does convey love for them. The second sentence simply states that the person named their Labubu Belle; it carries no sentiment, so the score of 0 is credible. The third sentence is also scored well: the score of 13 captures the excitement of pulling two complete sets of Labubus and the joy of feeling lucky. The fourth sentence shows mild annoyance at being banned from the Labubu-dupes subreddit and calls the mod a “weirdo”; the score of -2 is reasonable, though it could arguably be slightly more negative. One noticeable over-scoring occurs in the ninth sentence, which only mentions buying a Labubu for fun; the analysis gives it a score of 4 even though its tone is fairly indifferent. This is likely because the sentence contains the word “fun”, which carries a positive AFINN value.
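To check the suspicion that the word “fun” is what lifts that ninth score, a minimal sketch (with made-up example sentences) runs the same scorer on toy input; get_sentiment() with method = "afinn" sums the AFINN lexicon values of the individual words, so one positive word can push an otherwise neutral sentence above zero.
#sketch: AFINN sums per-word scores, so "fun" alone should make the first toy sentence positive
get_sentiment(c("bought one just for fun",
                "bought one just because"),
              method = "afinn")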
sent_daily <- allthreads %>%
mutate(date = as.Date(date)) %>%
group_by(date) %>%
summarize(mean_afinn = mean(afinn, na.rm = TRUE)) %>%
arrange(date)
#find local maxima above mean sentiment score 17
local_max <- sent_daily %>%
mutate(
prev_val = lag(mean_afinn),
next_val = lead(mean_afinn),
is_local_max = mean_afinn > prev_val & mean_afinn > next_val
) %>%
filter(is_local_max, mean_afinn > 17)
#find local minima below mean sentiment score -5
local_min <- sent_daily %>%
mutate(
prev_val = lag(mean_afinn),
next_val = lead(mean_afinn),
is_local_min = mean_afinn < prev_val & mean_afinn < next_val
) %>%
filter(is_local_min, mean_afinn < -5)
#plot on ggplot
ggplot(sent_daily, aes(x = date, y = mean_afinn)) +
geom_line(linewidth = 0.5, alpha = 0.9) +
#local maxima
geom_point( #highlight point of local maxima above mean sentiment score 17
data = local_max,
aes(x = date, y = mean_afinn),
color = "#4a8548",
size = 3
) +
geom_text( #add labels of the date of the local maxima above mean sentiment score 17
data = local_max,
aes(
label = format(date, "%Y-%m-%d"),
x = date,
y = mean_afinn
),
vjust = -1,
size = 4,
color = "#4a8548",
fontface = "bold"
) +
#local minima
geom_point( #highlight point of local minima below mean sentiment score -5
data = local_min,
aes(x = date, y = mean_afinn),
color = "#d15e56",
size = 3
) +
geom_text( #add labels of the date of the local minima below mean sentiment score -5
data = local_min,
aes(label = format(date, "%Y-%m-%d")),
vjust = 1.9,
size = 4,
color = "#d15e56",
fontface = "bold"
) +
scale_x_date(
date_breaks = "1 month",
date_labels = "%Y-%m"
) +
labs(
title = "Average Daily Sentiment Score over Time",
x = "Date",
y = "Mean Sentiment Score"
)
The line graph of daily average sentiment scores for Labubus shows no prominent upward or downward trend over time. One interesting observation, however, is the larger swings between highly positive and highly negative sentiment during May 2025 and again from October to November 2025, suggesting that some event, such as a new Labubu collection release, may have prompted more extreme reactions. Another intriguing observation is that, despite the lack of a clear trend, the mean sentiment score is above 0 on most days, meaning people express positive sentiment towards Labubus most of the time.
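To put a rough number on those swings (a sketch beyond the original pipeline), the standard deviation of the daily means can be computed within each month; the more volatile months noted above should rise to the top.
#sketch: monthly volatility of daily mean sentiment
sent_daily %>%
  mutate(month = format(date, "%Y-%m")) %>%
  group_by(month) %>%
  summarize(sd_afinn = sd(mean_afinn, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(sd_afinn)) %>%
  head(5)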
weekdays <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
allthreads$day_of_week <- factor(allthreads$day_of_week,
levels = weekdays)
#Compute mean sentiment by weekday
sent_dow <- allthreads %>%
group_by(day_of_week) %>%
summarize(mean_afinn = mean(afinn, na.rm = TRUE))
#Plot
ggplot(sent_dow, aes(x = day_of_week, y = mean_afinn)) +
geom_col(fill = "#80b1d3") +
labs(
title = "Average Sentiment Score by Day of Week",
x = "Day of Week",
y = "Mean Sentiment"
) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
The bar chart of mean sentiment score by day of week shows the highest scores on Friday and the lowest on Tuesday: the mean sentiment score bottoms out on Tuesday and gradually climbs through the week, peaking on Friday. Mean scores for all days of the week are above 1, reflecting the popularity of and overall positive sentiment towards Labubu.
#Classify sentiment to positive (>0), neutral (=0), and negative (<0)
sent_prop <- allthreads %>%
mutate(
sentiment_type = case_when(
afinn > 0 ~ "Positive",
afinn < 0 ~ "Negative",
TRUE ~ "Neutral"
)
) %>%
group_by(day_of_week, sentiment_type) %>%
summarize(n = n(), .groups = "drop") %>%
group_by(day_of_week) %>%
mutate(prop = n / sum(n))
#Plot
ggplot(sent_prop, aes(x = day_of_week, y = prop, fill = sentiment_type)) +
geom_col(position = "fill") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2)) +
scale_fill_manual(values = c(
"Positive" = "#a9c48d",
"Neutral" = "#ede1c0",
"Negative" = "#DCA1A1"
)) +
labs(
title = "Proportion of Sentiment Categories by Day of Week",
x = "Day of Week",
y = "Proportion of Threads",
fill = "Sentiment"
) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
The proportional stacked bar chart shows that Friday has the highest proportion of threads with positive sentiment and the lowest proportion with negative sentiment, consistent with the previous bar chart's finding that Friday has the highest mean sentiment score of the week, while Monday and Sunday have the two highest proportions of negative threads. One intriguing insight that builds on the previous chart: although Tuesday has the lowest mean sentiment of the week, the stacked chart shows this is not driven by a larger share of negative threads. Instead, Tuesday's low average is caused by a large proportion of neutral threads (score 0) dragging the mean down.
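That reading can be checked directly with a small sketch: the share of exactly-zero AFINN scores per weekday should be largest on Tuesday if neutral threads are indeed what pulls its mean down.
#sketch: share of neutral (score 0) threads by day of week
allthreads %>%
  group_by(day_of_week) %>%
  summarize(prop_neutral = mean(afinn == 0, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(prop_neutral))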