In this report, we analyze the sentiment of Reddit threads about rail-to-trail projects in Atlanta, Chicago, New York, and Detroit using BERT star-rating labels. We begin by loading the packages used throughout.
# Package names
packages <- c("RedditExtractoR", "tidytext", "tidyverse", "textdata", "anytime", "magrittr", "wordcloud2",
"ggdark","gofastr", "syuzhet", "sentimentr", "lubridate", "here")
# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
# Load packages
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
##
## Attaching package: 'sentimentr'
##
##
## The following object is masked from 'package:syuzhet':
##
## get_sentences
##
##
## here() starts at /Users/xy/Downloads/tutorial
# Read the RDS data and write it out as a .csv file
data <- readRDS("reddit.rds")
# Loop through each row of the data frame
results <- character(nrow(data))
for (i in seq_len(nrow(data))) {
  # Extract the string that follows '/r/' and precedes the next '/'
  results[i] <- str_extract(data$permalink[i], "(?<=/r/)[^/]+")
}
# Add the results back to the data frame (str_extract is vectorized,
# so the loop could also be replaced by a single call)
data$subreddit <- results
data$extracted_bs <- NULL
write.csv(data, "sample_reddit1.csv", row.names = FALSE)
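The labeled file read in below, sample_reddit_bert2.csv, was produced by running the thread texts through a BERT sentiment classifier that assigns each thread a 1-5 star rating; that step is not shown in this report. As a rough illustration only, the labeling could be done from R via reticulate and the Hugging Face transformers library. The model name below is an assumption inferred from the star-style labels in bert_label, and the bert_score column is hypothetical:
# Illustrative sketch only, not the exact pipeline used for this report.
# Assumes a Python environment with 'transformers' and 'torch' installed;
# the model name is inferred from the 1-5 star labels in 'bert_label'.
library(reticulate)
transformers <- import("transformers")
classifier <- transformers$pipeline(
  "sentiment-analysis",
  model = "nlptown/bert-base-multilingual-uncased-sentiment"
)
preds <- classifier(as.list(data$title))
data$bert_label <- vapply(preds, function(p) p$label, character(1)) # e.g. "4 stars"
data$bert_score <- vapply(preds, function(p) p$score, numeric(1))   # model confidence (hypothetical column)
# write.csv(data, "sample_reddit_bert2.csv", row.names = FALSE)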
reddit_sentiment <- read_csv('sample_reddit_bert2.csv') %>% drop_na('bert_label')
## Rows: 1574 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): tag, title, author, comments, id, permalink, url, comments_feed, ...
## dbl (9): ...1, num_comments, score, upvote_ratio, created_utc, time, bert_...
## dttm (1): date
## date (1): year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
reddit_sentiment <- reddit_sentiment %>%
mutate(group = case_when(
subreddit == "Atlanta" | tag == "beltline" ~ "Atlanta",
subreddit == "chicago" | subreddit == "chibike" | tag == "606" | tag == "bloomingdale trail" ~ "Chicago",
subreddit == "nyc" | subreddit == "NYCbike" | subreddit == "MicromobilityNYC" | tag == "highline" ~ "NYC",
subreddit == "Detroit" | subreddit == "BikeDetroit" | tag == "Joe Louis Greenway" | tag == "dequindre cut" ~ "Detroit",
TRUE ~ "Other" # Assign "Other" to rows that do not match any of the above conditions
))
reddit_sentiment %<>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric())
We extracted thread texts related to rail-to-trail projects from Reddit and processed them through a BERT sentiment model, which assigns each thread a rating from 1 star (most negative) to 5 stars (most positive). Below are visualizations of the findings.
reddit_sentiment %>%
ggplot(aes(x = bert_label)) +
geom_bar(fill = "gray") +
geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5, size = 2.5, color = "black") + # Adjust the 'size' for smaller text
labs(title = "Distribution of BERT Labels", x = "BERT Label", y = "Count") + # Add plot title and axis labels
theme_light() +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5), # Adjust the plot title size
axis.title.x = element_text(size = 12, face = "bold"), # Adjust the x axis label size
axis.title.y = element_text(size = 12, face = "bold"), # Adjust the y axis label size
axis.text.x = element_text(size = 10), # Adjust the x axis text size
axis.text.y = element_text(size = 10) # Adjust the y axis text size
)
# Define colors for each group
group_colors <- c("Other" = "gray",
"Atlanta" = "pink",
"NYC" = "plum",
"Detroit" = "lightblue",
"Chicago" = "lightgreen")
# Make 'group' a factor whose levels exactly match the names in group_colors
reddit_sentiment$group <- factor(reddit_sentiment$group, levels = names(group_colors))
# Calculate each group's percentage within each bert_label
reddit_sentiment1 <- reddit_sentiment %>%
  group_by(bert_label, group) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(bert_label) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()
# Create the plot
gg <- reddit_sentiment1 %>%
ggplot(aes(x = bert_label, y = percentage, fill = group)) +
geom_bar(position = "dodge", stat = "identity", color = "black") +
scale_fill_manual(values = group_colors) + # Manually set colors
theme_light() + # Apply a light theme
labs(title = "Sentiment Analysis by BERT Label and Cities",
y = "Percentage", fill = "Group") # Update y-axis label and legend label
# Print the plot
print(gg)
In this section, we explore whether sentiment level is related to other variables, such as a thread's word count and its number of comments.
reddit_sentiment %>%
ggplot(aes(x = bert_label, y = word_count)) +
geom_jitter(height = 0, width = 0.05, color = "gray") + # Set dots to gray
stat_summary(fun = mean, geom = "crossbar", width = 0.4, color = "red") +
theme_light() +
labs(title = "Word Count by BERT Label") # Add a title to the plot
# Word counts by sentiment, split by city group:
library(ggplot2)
library(dplyr)
# Filter out the 'Other' group
reddit_sentiment_filtered <- reddit_sentiment %>%
filter(group != "Other")
# Create the plot
gg <- reddit_sentiment_filtered %>%
ggplot(aes(x = bert_label, y = word_count, color = group)) +
geom_jitter(height = 0, width = 0.05, size = 1) + # Smaller points
stat_summary(fun = mean, geom = "crossbar", width = 0.4, color = "black") +
theme_light() # Light theme
# Print the plot
print(gg)
* Findings: Word counts in the negative categories (1 and 2 stars) show a wider spread than in the positive categories (4 and 5 stars), suggesting that users are more verbose when expressing dissatisfaction. Detroit and Chicago have a notable presence in the higher word-count range for 1-star ratings, which might imply that users from these cities provide more elaborate negative feedback. Atlanta shows a consistent pattern across all sentiment categories, with most word counts clustering around the median, indicating that feedback length is fairly uniform regardless of sentiment.
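One way to sanity-check these impressions (a quick sketch, not part of the original analysis) is to tabulate the spread of word counts by city group and label:
# Median and interquartile range of word_count per city group and label
reddit_sentiment_filtered %>%
  group_by(group, bert_label) %>%
  summarise(median_wc = median(word_count, na.rm = TRUE),
            iqr_wc = IQR(word_count, na.rm = TRUE),
            n = n(),
            .groups = 'drop')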
reddit_sentiment %>%
ggplot(aes(x = bert_label, y = num_comments)) +
geom_jitter(height = 0, width = 0.05, size=0.6, color = "darkgray") + # Set dots to gray
stat_summary(fun = mean, geom = "crossbar", width = 0.4,linewidth=0.2, color = "red") +
theme_light() +
labs(title = "Numbers of comments by BERT Label") # Add a title to the plot
cor.test(reddit_sentiment$num_comments, reddit_sentiment$bert_label_numeric, method = "spearman")
## Warning in cor.test.default(reddit_sentiment$num_comments,
## reddit_sentiment$bert_label_numeric, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: reddit_sentiment$num_comments and reddit_sentiment$bert_label_numeric
## S = 687653045, p-value = 0.02127
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.05805102
cor.test(reddit_sentiment$word_count, reddit_sentiment$bert_label_numeric, method = "spearman")
## Warning in cor.test.default(reddit_sentiment$word_count,
## reddit_sentiment$bert_label_numeric, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: reddit_sentiment$word_count and reddit_sentiment$bert_label_numeric
## S = 684535791, p-value = 0.03463
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.05325469
Using word clouds, we can visualize words that are frequently seen in either positive or negative threads. Using the same code from the previous Rmd file, we tokenize the texts and remove stop words.
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"
reddit_sentiment_clean <- reddit_sentiment %>%
mutate(title = str_replace_all(title, replace_reg, "")) %>%
unnest_tokens(word, title, token = "words") %>%
anti_join(stop_words, by = "word") %>%
filter(str_detect(word, "[a-z]")) %>%
filter(!word %in% c('greenway'))
We are not interested in words that are commonly seen in both
positive and negative threads. We can identify words that are uniquely
seen in either positive or negative threads using
anti_join.
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
filter(bert_label_numeric %in% c(1,2))
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
filter(bert_label_numeric %in% c(4,5))
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
anti_join(reddit_sentiment_clean_positive, by = 'word')
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
anti_join(reddit_sentiment_clean_negative, by = 'word')
# Build a palette: 20 random vivid colors for the most frequent words,
# with grey as the fallback for everything else
n <- 20
h <- runif(n, 0, 1)     # any hue
s <- runif(n, 0.6, 1)   # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))
reddit_sentiment_clean_negative_unique %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
reddit_sentiment_clean_positive_unique %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
reddit_sentiment %>%
ggplot(aes(x = date, y = bert_label_numeric)) +
geom_jitter(width = 0, height = 0.05, color = "darkgray") + # Set the dots to gray
scale_x_datetime(date_labels = "%y",
breaks = seq(min(reddit_sentiment$date, na.rm = TRUE),
max(reddit_sentiment$date, na.rm = TRUE),
by = "1 year")) +
theme_light() + # Use the light theme
labs(title = "BERT Label (stars) From 2008-2023",
x = "Year",
y = "BERT Label Numeric") + # Add a title and axis labels
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5), # Make the title bigger and bold
axis.title.x = element_text(size = 12), # Make the x-axis label bigger
axis.title.y = element_text(size = 12) # Make the y-axis label bigger
)
* Findings: The stacked bar chart below illustrates sentiment trends from 2008 to 2023, with each color representing a sentiment rating from one to five stars. Sentiment expression increases over the years, with a particular upsurge in 4-star and 5-star ratings in recent years, suggesting a trend toward more positive feedback. The number of 1-star ratings also grows, though less sharply than the positive ones, possibly indicating a general drift toward more extreme expressions of sentiment in either direction. The relatively stable but lower counts of 2-star and 3-star ratings over time could suggest that users are less inclined to express nuanced or moderate sentiments.
# Assuming the 'year' column is in a four-digit year format
reddit_sentiment <- reddit_sentiment %>%
mutate(two_digit_year = substr(as.character(year), 3, 4))
# Now plot with the shortened year labels
reddit_sentiment %>%
ggplot(aes(x = two_digit_year, fill = bert_label)) +
geom_bar(position = 'stack') +
scale_x_discrete(breaks = unique(reddit_sentiment$two_digit_year)) +
scale_fill_brewer(palette = 'PuRd', direction = -1) +
dark_theme_grey() # Dark theme from the ggdark package
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
library(ggplot2)
library(dplyr)
# Filter out the 'Other' group ('two_digit_year' was already created above and carries over)
reddit_sentiment_filtered <- reddit_sentiment %>%
  filter(group != "Other")
# Define custom colors
custom_colors <- c( "#DE2D26", "#FC9272", "yellow","#9ECAE1","#6BAED6")
# Plot with the filtered data
gg <- reddit_sentiment_filtered %>%
ggplot(aes(x = two_digit_year, fill = bert_label)) +
geom_bar(position = 'stack') +
scale_x_discrete(breaks = unique(reddit_sentiment_filtered$two_digit_year)) +
scale_fill_manual(values = custom_colors) + # Use custom colors
facet_wrap(~group) + # Facet by 'group'
theme_light() + # Apply a light theme
labs(title = "Sentiment by Year", x = "Year", y="%",fill = "BERT Label")
# Print the plot
print(gg)
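Because raw counts partly reflect overall posting volume, a proportion view can make the within-year sentiment mix easier to compare across cities. The sketch below is a small variation on the chart above, not part of the original analysis:
# Same chart with each bar scaled to sum to 1
gg_fill <- reddit_sentiment_filtered %>%
  ggplot(aes(x = two_digit_year, fill = bert_label)) +
  geom_bar(position = 'fill') +
  scale_fill_manual(values = custom_colors) +
  facet_wrap(~group) +
  theme_light() +
  labs(title = "Sentiment Share by Year", x = "Year", y = "Share of threads", fill = "BERT Label")
print(gg_fill)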
# sentiment by day
reddit_sentiment %>%
ggplot(aes(x = day_of_week, fill = bert_label)) +
geom_bar(position = 'fill') +
scale_fill_brewer(palette = 'PuRd', direction = -1) +
dark_theme_grey()
* Findings: Weekends see a lower incidence of negative sentiment and a higher occurrence of positive sentiment, while weekdays show the opposite pattern. This suggests that the mood of Reddit threads shifts in an interesting way, skewing positive during leisure time and more negative during the typical workweek.
reddit_sentiment %>%
ggplot(aes(x = time, fill = bert_label)) +
geom_histogram(bins = 24, position = 'stack', color = 'lightblue', lwd = 0.2) +
scale_x_continuous(breaks = seq(0, 24, by=1)) +
scale_fill_manual(values = c('#bc5090', '#bc5090', 'thistle', 'lavender', 'lavender')) +
dark_theme_grey()
* Findings: The chart displays the distribution of sentiment ratings over the 24-hour day. Sentiment expression peaks during certain hours, likely corresponding to times of high user activity. The 9 am to noon window shows a marked increase, particularly for positive sentiments (4 and 5 stars), which could suggest that users are more likely to share positive experiences during leisure hours close to midday. 1-star ratings, which indicate negative sentiment, are less frequent overall but still present during the peak hours, possibly reflecting the times when negative experiences are shared or discussed.
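The hourly pattern can be checked the same way. This sketch assumes time is the hour of day (0-23), which is consistent with the 24-bin histogram above:
# Share of positive (4-5 star) threads per posting hour
reddit_sentiment %>%
  group_by(hour = floor(time)) %>%
  summarise(share_positive = mean(bert_label_numeric >= 4, na.rm = TRUE),
            n = n()) %>%
  arrange(hour)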