R Markdown
- Describe in one sentence what you aim to examine using
user-generated text data and sentiment analysis.
-
How opinions about drugs have changed over time
- Search Reddit threads using a keyword of your choice.
- Specifying a subreddit for your search is optional.
- It is okay to combine data obtained by searching the keyword across
multiple subreddits.
- You can choose any period, but ensure you gather a sufficient amount
of data so that you can get meaningful results.
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "dplyr", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "sf", "tmap", "here")
# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
library("RedditExtractoR")
library("anytime")
library("magrittr")
library("httr")
library("tidytext")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("igraph")
##
## Attaching package: 'igraph'
##
## The following objects are masked from 'package:lubridate':
##
## %--%, union
##
## The following objects are masked from 'package:purrr':
##
## compose, simplify
##
## The following object is masked from 'package:tidyr':
##
## crossing
##
## The following object is masked from 'package:tibble':
##
## as_data_frame
##
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
library("ggraph")
library("wordcloud2")
library("textdata")
##
## Attaching package: 'textdata'
##
## The following object is masked from 'package:httr':
##
## cache_info
library("sf")
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
library("tmap")
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library("here")
## here() starts at /home/rstudio
# Load packages
invisible(lapply(packages, library, character.only = TRUE))
threads_2 <- find_thread_urls(keywords = 'drugs',
sort_by = 'relevance',
period = 'all') %>% drop_na()
subreddit_counts <- threads_2 %>%
drop_na(text)%>%
group_by(subreddit) %>%
summarise(post_count = n()) %>%
arrange(desc(post_count))
iama <- find_thread_urls(keywords= 'drugs', subreddit = 'NoStupidQuestions', sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(politics) <- NULL
ami <- find_thread_urls(keywords= 'drugs', subreddit ='AMA',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(ami) <- NULL
made <- find_thread_urls(keywords= 'drugs', subreddit ='therewasanattempt',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(made) <- NULL
interesting <- find_thread_urls(keywords= 'drugs', subreddit ='interestingasfuck',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(interesting) <- NULL
interesting <- interesting %>% mutate(source = 'interestingasfuck')
iama <- iama %>% mutate(source = 'iamatotalpieceofshit')
ami <- ami %>% mutate(source = 'AmItheAsshole')
made <- made %>% mutate(source = 'MadeMeSmile')
# 데이터프레임 합치기
drugs <- bind_rows(news, iama, ami, made)
write.csv(drugs,'drugs.csv')
- Clean your text data and then tokenize it.
library(dplyr)
drugs <- read.csv('drugs.csv')
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
data("stop_words")
drugs_clean <- drugs %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
unnest_tokens(word, text, token = "words") %>%
anti_join(stop_words, by = "word") %>%
filter(str_detect(word, "[a-z]"))
- Generate a word cloud that illustrates the frequency of words except
your keyword.
except_keywords <- c("drugs")
drugs_clean <- drugs_clean %>%
filter(!str_detect(drugs_clean$word, paste(except_keywords, collapse = "|")))
drugs_clean %>%
count(word, sort = TRUE) %>%
wordcloud2()
- Conduct a tri-gram analysis.
- Extract tri-grams from your text data.
- Remove tri-grams containing stop words or non-alphabetic terms.
- Present the frequency of tri-grams in a table.
- Discuss any noteworthy tri-grams you come across.
- If no meaningful tri-grams are found, you may analyze bi-grams as
well. However, you still need to show results of the tri-grams.
drugs_ngram <- drugs %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
select(text) %>%
unnest_tokens(output = paired_words,
input = text,
token = "ngrams",
n = 3)
drugs_ngram %>%
count(paired_words, sort = TRUE) %>%
head(20) %>%
knitr::kable()
| NA |
565 |
| a lot of |
27 |
| i don t |
27 |
| ask me anything |
24 |
| when i was |
22 |
| i have a |
21 |
| i was a |
12 |
| i ended up |
11 |
| i was in |
11 |
| a drug addict |
10 |
| and i was |
10 |
| at the time |
10 |
| i ve been |
10 |
| i found out |
9 |
| i had a |
9 |
| i m a |
9 |
| i m just |
9 |
| i m not |
9 |
| i went to |
9 |
| in front of |
9 |
drugs_ngram_trio <- drugs_ngram %>%
separate(paired_words, c("word1", "word2", "word3"), sep = " ")
# filter rows where there are stop words under word 1 column and word 2 column
drugs_ngram_filter <- drugs_ngram_trio %>%
# drop stop words
filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>%
# drop non-alphabet-only strings
filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]")& str_detect(word3, "[a-z]"))
# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
drugs_ngram_filter %<>%
filter(stri_enc_isascii(word1) & stri_enc_isascii(word2)&stri_enc_isascii(word3))
# Sort the new bi-gram (n=2) counts:
drugs_counts <- drugs_ngram_filter %>%
count(word1, word2, word3) %>%
arrange(desc(n))
head(drugs_counts, 20) %>% knitr::kable()
| happened |
feel |
free |
2 |
| initial |
phone |
call |
2 |
| shareutm_medium |
ios_apputm_name |
iossmf |
2 |
| utm_source |
shareutm_medium |
ios_apputm_name |
2 |
| 0800a.m |
eastern |
time |
1 |
| 11am |
1pm |
est |
1 |
| 24h |
mcdonalds |
im |
1 |
| 26th |
birthday |
drinking |
1 |
| 2cb |
dmt |
psilocybin |
1 |
| 2h30 |
trip |
sitting |
1 |
| _by_showing_my_girlfriend_my_actual_strength |
utm_source |
shareutm_medium |
1 |
| _girl_is_cringe_the_guy_is_chad |
utm_source |
shareutm_medium |
1 |
| abortion |
centre |
laws |
1 |
| absolutely |
dysfunctional |
hysterical |
1 |
| absolutely |
popped |
positive |
1 |
| abuse |
ama |
edit |
1 |
| abusive |
alcoholic |
father |
1 |
| accelerated |
spiritual |
growth |
1 |
| accidental |
fentanyl |
od |
1 |
| accidentally |
mother |
missed |
1 |
drugs_ngram_bi <- drugs_ngram %>%
separate(paired_words, c("word1", "word2"), sep = " ")
## Warning: Expected 2 pieces. Additional pieces discarded in 35101 rows [219, 220, 221,
## 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
## 238, ...].
drugs_ngram_bi_filter <- drugs_ngram_bi %>%
filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word) %>%
filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]"))
drugs_ngram_bi_filter %<>%
filter(stri_enc_isascii(word1) & stri_enc_isascii(word2))
# Sort the new bi-gram (n=2) counts:
drugs_bi_counts <- drugs_ngram_bi_filter %>%
count(word1, word2) %>%
arrange(desc(n))
head(drugs_bi_counts, 20) %>% knitr::kable()
| drug |
addict |
11 |
| drug |
dealers |
7 |
| feel |
free |
6 |
| hard |
drugs |
6 |
| mental |
health |
6 |
| drug |
dealer |
5 |
| multiple |
times |
5 |
| days |
ago |
4 |
| drug |
tests |
4 |
| illegal |
drugs |
4 |
| nights |
week |
4 |
| parking |
lot |
4 |
| phone |
call |
4 |
| prescription |
drugs |
4 |
| recreational |
drugs |
4 |
| sell |
drugs |
4 |
| smoke |
weed |
4 |
| afab |
people |
3 |
| ama |
edit |
3 |
| ama |
update |
3 |
Bi-gram Analysis
Since it was challenging to extract meaningful insights from
tri-grams, I performed a bi-gram analysis instead, which provided more
interpretable results. Unsurprisingly, familiar terms associated with
drugs appeared frequently, such as “drug addict,” “drug dealer,” “the
feeling when using drugs,” and “hard drugs.”
- Perform a sentiment analysis on your text data using a dictionary
method that accommodates negations.
- You are welcome to apply a deep learning-based model to enrich your
analysis, but employing the dictionary method is imperative.
library(sentimentr)
sentiment_drugs <- sentiment(drugs$text) %>%
arrange(desc(sentiment))
head(sentiment_drugs, 10) %>%
knitr::kable()
| 588 |
50 |
85 |
2.1389343 |
| 424 |
52 |
39 |
1.2213775 |
| 654 |
5 |
15 |
1.1618950 |
| 445 |
47 |
7 |
1.0545209 |
| 628 |
5 |
20 |
1.0285913 |
| 571 |
13 |
5 |
1.0230011 |
| 445 |
54 |
11 |
0.9768968 |
| 546 |
10 |
2 |
0.9545942 |
| 628 |
9 |
2 |
0.9545942 |
| 475 |
13 |
34 |
0.9175174 |
- Display 10 sample texts alongside their sentiment scores and
evaluate the credibility of the sentiment analysis outcomes.
set.seed(123)
drugs_sentiment_10 <- drugs %>%
filter(nzchar(text) & !grepl("http[s]?://", text)) %>%
mutate(sentiment_score = sapply(text, function(text) {
sentiment(text)$sentiment[1]
}))%>%
mutate(sentiment_pn = case_when(sentiment_score > 0 ~ 'positive',
sentiment_score == 0 ~ 'neutral',
TRUE ~ 'negative'))
drugs_sentiment_sample <- drugs_sentiment_10 %>%
sample_n(10) %>%
mutate(title = strtrim(title, 50),
text = strtrim(text, 50)) %>%
arrange(desc(sentiment_score))
drugs_sentiment_sample %>%
select(title, sentiment_score, sentiment_pn) %>%
knitr::kable()
| My bf is diagnosed ASPD (diagnosed as being an act |
0.2309401 |
positive |
| I was a high level drug supplier for 5 years. Neve |
0.1788854 |
positive |
| My mom was a sociopath and manipulative drug addic |
0.0000000 |
neutral |
| Ask a old man anything. |
0.0000000 |
neutral |
| what is up with Ozempic?? |
-0.0416025 |
negative |
| I went through a major traumatic event that change |
-0.0589256 |
negative |
| To buy drugs from the wrong phone number |
-0.2041241 |
negative |
| Why is it that conservatives want less regulation |
-0.2453739 |
negative |
| I was paralyzed for 2 and a half months and remain |
-0.4244373 |
negative |
| Do the drug addicts realize the hell they are livi |
-0.9009344 |
negative |
- Discuss intriguing insights derived from the sentiment analysis,
supporting your observations with at least two plots.
drugs_sentiment_plot <- drugs_sentiment_10 %>% select(date_utc, title, text,subreddit, sentiment_score)
ggplot(drugs_sentiment_plot, aes(x = sentiment_score)) +
geom_density(fill = "lightblue", alpha = 0.9) +
labs(title = "Distribution of Sentiment", x = "Sentiment_score", y = "Density") +
theme_dark()

drugs_sentiment_plot <- drugs_sentiment_10 %>% select(date_utc, title, text,subreddit, sentiment_score, sentiment_pn)
drugs_sentiment_year <- drugs_sentiment_10 %>%
mutate(date = as.POSIXct(date_utc)) %>%
filter(!is.na(date)) %>%
mutate(year = year(date),
time = timestamp %>%
anytime(tz = anytime:::getTZ()) %>%
str_split('-| |:') %>%
sapply(function(x) as.numeric(x[4])))
drugs_sentiment_year %>%
ggplot(aes(x = year, fill = sentiment_pn)) +
geom_bar(position = 'stack') +
scale_x_continuous(breaks = seq(min(drugs_sentiment_year$year),
max(drugs_sentiment_year$year),
by = 1)) +
scale_fill_brewer(palette = 'PuRd', direction = -1)

pn_ratio<- drugs_sentiment_year %>%
group_by(year) %>%
summarise(total_count = n(),
negative_count = sum(sentiment_pn == "negative", na.rm = TRUE),
positive_count = sum(sentiment_pn == "positive", na.rm = TRUE),
neutral_count = sum(sentiment_pn == "neutral", na.rm = TRUE)) %>%
mutate(negative_ratio = negative_count / total_count,
positive_ratio = positive_count / total_count,
neutral_ratio = neutral_count / total_count)
pn_ratio_plot <- pn_ratio %>%
select(year, negative_ratio, positive_ratio, neutral_ratio) %>%
pivot_longer(cols = c(negative_ratio, positive_ratio, neutral_ratio),
names_to = "sentiment",
values_to = "ratio")
ggplot(pn_ratio_plot, aes(x = year, y = ratio, color = sentiment, alpha = 0.5)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_y_continuous(labels = scales::percent_format()) +
scale_x_continuous(breaks = seq(min(pn_ratio_plot$year), max(pn_ratio_plot$year), by = 1)) +
labs(title = "Yearly Negative, Neutral and Positive Sentiment Ratios",
x = "Year",
y = "Sentiment Ratio",
color = "Sentiment") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

I analyzed how opinions about drugs have changed over time.
Regarding the overall distribution of sentiment scores, a score of 0
(neutral) is notably high, with the proportion of neutral opinions
reaching 25% in 2024. To further explore this trend, I visualized the
distribution over time and observed a significant increase in posts
related to drugs in recent years, especially in 2024.
However, when examining the overall proportion of each sentiment
type in 2024—amid a drastic increase in the number of opinions—it is
clear that the total number of comments has risen across all sentiment
categories (negative, positive, and neutral).