R Markdown

  1. Describe in one sentence what you aim to examine using user-generated text data and sentiment analysis.
    How opinions about drugs have changed over time
  2. Search Reddit threads using a keyword of your choice.
    • Specifying a subreddit for your search is optional.
    • It is okay to combine data obtained by searching the keyword across multiple subreddits.
    • You can choose any period, but ensure you gather a sufficient amount of data so that you can get meaningful results.
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "dplyr", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "sf", "tmap", "here")

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}
library("RedditExtractoR")
library("anytime")
library("magrittr")
library("httr")
library("tidytext")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("igraph")
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
library("ggraph")
library("wordcloud2")
library("textdata")
## 
## Attaching package: 'textdata'
## 
## The following object is masked from 'package:httr':
## 
##     cache_info
library("sf")
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
library("tmap")
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library("here")
## here() starts at /home/rstudio
# Load packages
invisible(lapply(packages, library, character.only = TRUE))

threads_2 <- find_thread_urls(keywords = 'drugs', 
                              sort_by = 'relevance', 
                              period = 'all') %>% drop_na()


subreddit_counts <- threads_2 %>%
  drop_na(text)%>%
  group_by(subreddit) %>%
  summarise(post_count = n()) %>%
  arrange(desc(post_count))


iama <- find_thread_urls(keywords= 'drugs', subreddit = 'NoStupidQuestions',  sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(politics) <- NULL


ami <- find_thread_urls(keywords= 'drugs', subreddit ='AMA',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(ami) <- NULL

made <- find_thread_urls(keywords= 'drugs', subreddit ='therewasanattempt',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(made) <- NULL

interesting <- find_thread_urls(keywords= 'drugs', subreddit ='interestingasfuck',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(interesting) <- NULL


interesting <- interesting %>% mutate(source = 'interestingasfuck')
iama <- iama %>% mutate(source = 'iamatotalpieceofshit')
ami <- ami %>% mutate(source = 'AmItheAsshole')
made <- made %>% mutate(source = 'MadeMeSmile')

# 데이터프레임 합치기
drugs <- bind_rows(news, iama, ami, made)

write.csv(drugs,'drugs.csv')
  1. Clean your text data and then tokenize it.
library(dplyr)
drugs <- read.csv('drugs.csv')

replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"
data("stop_words")

drugs_clean <- drugs %>% 
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "words") %>% 
  anti_join(stop_words, by = "word") %>% 
  filter(str_detect(word, "[a-z]"))
  1. Generate a word cloud that illustrates the frequency of words except your keyword.
except_keywords <- c("drugs")
drugs_clean <- drugs_clean %>%
  filter(!str_detect(drugs_clean$word, paste(except_keywords, collapse = "|")))

drugs_clean %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()
  1. Conduct a tri-gram analysis.
    • Extract tri-grams from your text data.
    • Remove tri-grams containing stop words or non-alphabetic terms.
    • Present the frequency of tri-grams in a table.
    • Discuss any noteworthy tri-grams you come across.
    • If no meaningful tri-grams are found, you may analyze bi-grams as well. However, you still need to show results of the tri-grams.
drugs_ngram <- drugs %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)

drugs_ngram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()
paired_words n
NA 565
a lot of 27
i don t 27
ask me anything 24
when i was 22
i have a 21
i was a 12
i ended up 11
i was in 11
a drug addict 10
and i was 10
at the time 10
i ve been 10
i found out 9
i had a 9
i m a 9
i m just 9
i m not 9
i went to 9
in front of 9
drugs_ngram_trio <- drugs_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# filter rows where there are stop words under word 1 column and word 2 column
drugs_ngram_filter <- drugs_ngram_trio %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>% 
  # drop non-alphabet-only strings
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]")& str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)

drugs_ngram_filter %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2)&stri_enc_isascii(word3))

# Sort the new bi-gram (n=2) counts:
drugs_counts <- drugs_ngram_filter %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

head(drugs_counts, 20) %>% knitr::kable()
word1 word2 word3 n
happened feel free 2
initial phone call 2
shareutm_medium ios_apputm_name iossmf 2
utm_source shareutm_medium ios_apputm_name 2
0800a.m eastern time 1
11am 1pm est 1
24h mcdonalds im 1
26th birthday drinking 1
2cb dmt psilocybin 1
2h30 trip sitting 1
_by_showing_my_girlfriend_my_actual_strength utm_source shareutm_medium 1
_girl_is_cringe_the_guy_is_chad utm_source shareutm_medium 1
abortion centre laws 1
absolutely dysfunctional hysterical 1
absolutely popped positive 1
abuse ama edit 1
abusive alcoholic father 1
accelerated spiritual growth 1
accidental fentanyl od 1
accidentally mother missed 1
drugs_ngram_bi <- drugs_ngram %>%
  separate(paired_words, c("word1", "word2"), sep = " ")
## Warning: Expected 2 pieces. Additional pieces discarded in 35101 rows [219, 220, 221,
## 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
## 238, ...].
drugs_ngram_bi_filter <- drugs_ngram_bi %>%
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word) %>% 
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]"))


drugs_ngram_bi_filter %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2))

# Sort the new bi-gram (n=2) counts:
drugs_bi_counts <- drugs_ngram_bi_filter %>%
  count(word1, word2) %>%
  arrange(desc(n))

head(drugs_bi_counts, 20) %>% knitr::kable()
word1 word2 n
drug addict 11
drug dealers 7
feel free 6
hard drugs 6
mental health 6
drug dealer 5
multiple times 5
days ago 4
drug tests 4
illegal drugs 4
nights week 4
parking lot 4
phone call 4
prescription drugs 4
recreational drugs 4
sell drugs 4
smoke weed 4
afab people 3
ama edit 3
ama update 3

Bi-gram Analysis

Since it was challenging to extract meaningful insights from tri-grams, I performed a bi-gram analysis instead, which provided more interpretable results. Unsurprisingly, familiar terms associated with drugs appeared frequently, such as “drug addict,” “drug dealer,” “the feeling when using drugs,” and “hard drugs.”

  1. Perform a sentiment analysis on your text data using a dictionary method that accommodates negations.
    • You are welcome to apply a deep learning-based model to enrich your analysis, but employing the dictionary method is imperative.
library(sentimentr)
sentiment_drugs <- sentiment(drugs$text) %>%
  arrange(desc(sentiment)) 
                    
head(sentiment_drugs, 10) %>% 
  knitr::kable()
element_id sentence_id word_count sentiment
588 50 85 2.1389343
424 52 39 1.2213775
654 5 15 1.1618950
445 47 7 1.0545209
628 5 20 1.0285913
571 13 5 1.0230011
445 54 11 0.9768968
546 10 2 0.9545942
628 9 2 0.9545942
475 13 34 0.9175174
  1. Display 10 sample texts alongside their sentiment scores and evaluate the credibility of the sentiment analysis outcomes.
set.seed(123) 

drugs_sentiment_10 <- drugs %>%
  filter(nzchar(text) & !grepl("http[s]?://", text)) %>%
  mutate(sentiment_score = sapply(text, function(text) {
    sentiment(text)$sentiment[1]
  }))%>%
  mutate(sentiment_pn = case_when(sentiment_score > 0 ~ 'positive',
                                  sentiment_score == 0 ~ 'neutral',
                                  TRUE ~ 'negative'))

drugs_sentiment_sample <- drugs_sentiment_10 %>%
  sample_n(10) %>%
  mutate(title = strtrim(title, 50),
         text = strtrim(text, 50)) %>%
  arrange(desc(sentiment_score))
  

drugs_sentiment_sample %>%
  select(title, sentiment_score, sentiment_pn) %>%
  knitr::kable()
title sentiment_score sentiment_pn
My bf is diagnosed ASPD (diagnosed as being an act 0.2309401 positive
I was a high level drug supplier for 5 years. Neve 0.1788854 positive
My mom was a sociopath and manipulative drug addic 0.0000000 neutral
Ask a old man anything. 0.0000000 neutral
what is up with Ozempic?? -0.0416025 negative
I went through a major traumatic event that change -0.0589256 negative
To buy drugs from the wrong phone number -0.2041241 negative
Why is it that conservatives want less regulation -0.2453739 negative
I was paralyzed for 2 and a half months and remain -0.4244373 negative
Do the drug addicts realize the hell they are livi -0.9009344 negative
  1. Discuss intriguing insights derived from the sentiment analysis, supporting your observations with at least two plots.
drugs_sentiment_plot <- drugs_sentiment_10 %>% select(date_utc, title, text,subreddit, sentiment_score)

ggplot(drugs_sentiment_plot, aes(x = sentiment_score)) +
  geom_density(fill = "lightblue", alpha = 0.9) +
  labs(title = "Distribution of Sentiment", x = "Sentiment_score", y = "Density") +
  theme_dark()

drugs_sentiment_plot <- drugs_sentiment_10 %>% select(date_utc, title, text,subreddit, sentiment_score, sentiment_pn)

drugs_sentiment_year <- drugs_sentiment_10 %>%
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date)) %>%
  mutate(year = year(date),
         time = timestamp %>%
           anytime(tz = anytime:::getTZ()) %>%
           str_split('-| |:') %>%
           sapply(function(x) as.numeric(x[4])))

drugs_sentiment_year %>%
  ggplot(aes(x = year, fill = sentiment_pn)) +
  geom_bar(position = 'stack') +
  scale_x_continuous(breaks = seq(min(drugs_sentiment_year$year),
                                  max(drugs_sentiment_year$year),
                                  by = 1)) +
  scale_fill_brewer(palette = 'PuRd', direction = -1)

pn_ratio<- drugs_sentiment_year %>%
  group_by(year) %>%
  summarise(total_count = n(),
            negative_count = sum(sentiment_pn == "negative", na.rm = TRUE),
            positive_count = sum(sentiment_pn == "positive", na.rm = TRUE),
            neutral_count = sum(sentiment_pn == "neutral", na.rm = TRUE)) %>%
  mutate(negative_ratio = negative_count / total_count,
         positive_ratio = positive_count / total_count,
         neutral_ratio = neutral_count / total_count)

pn_ratio_plot <- pn_ratio %>%
  select(year, negative_ratio, positive_ratio, neutral_ratio) %>%
  pivot_longer(cols = c(negative_ratio, positive_ratio, neutral_ratio),
               names_to = "sentiment",
               values_to = "ratio")

ggplot(pn_ratio_plot, aes(x = year, y = ratio, color = sentiment, alpha = 0.5)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_x_continuous(breaks = seq(min(pn_ratio_plot$year), max(pn_ratio_plot$year), by = 1)) +
  labs(title = "Yearly Negative, Neutral and Positive Sentiment Ratios",
       x = "Year",
       y = "Sentiment Ratio",
       color = "Sentiment") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

However, when examining the overall proportion of each sentiment type in 2024—amid a drastic increase in the number of opinions—it is clear that the total number of comments has risen across all sentiment categories (negative, positive, and neutral).