Major3

R Markdown

Describe in one sentence what you aim to examine using user-generated text data and sentiment analysis.

How opinions about drugs have changed over time
Search Reddit threads using a keyword of your choice.
- Specifying a subreddit for your search is optional.
- It is okay to combine data obtained by searching the keyword across multiple subreddits.
- You can choose any period, but ensure you gather a sufficient amount of data so that you can get meaningful results.

packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "dplyr", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "sf", "tmap", "here")

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}
library("RedditExtractoR")
library("anytime")
library("magrittr")
library("httr")
library("tidytext")
library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("tidyverse")

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library("igraph")

## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union

library("ggraph")
library("wordcloud2")
library("textdata")

## 
## Attaching package: 'textdata'
## 
## The following object is masked from 'package:httr':
## 
##     cache_info

library("sf")

## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE

library("tmap")

## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')

library("here")

## here() starts at /home/rstudio

# Load packages
invisible(lapply(packages, library, character.only = TRUE))

threads_2 <- find_thread_urls(keywords = 'drugs', 
                              sort_by = 'relevance', 
                              period = 'all') %>% drop_na()


subreddit_counts <- threads_2 %>%
  drop_na(text)%>%
  group_by(subreddit) %>%
  summarise(post_count = n()) %>%
  arrange(desc(post_count))


iama <- find_thread_urls(keywords= 'drugs', subreddit = 'NoStupidQuestions',  sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(politics) <- NULL


ami <- find_thread_urls(keywords= 'drugs', subreddit ='AMA',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(ami) <- NULL

made <- find_thread_urls(keywords= 'drugs', subreddit ='therewasanattempt',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(made) <- NULL

interesting <- find_thread_urls(keywords= 'drugs', subreddit ='interestingasfuck',sort_by = 'relevance', period = 'all') %>% drop_na()
rownames(interesting) <- NULL


interesting <- interesting %>% mutate(source = 'interestingasfuck')
iama <- iama %>% mutate(source = 'iamatotalpieceofshit')
ami <- ami %>% mutate(source = 'AmItheAsshole')
made <- made %>% mutate(source = 'MadeMeSmile')

# 데이터프레임 합치기
drugs <- bind_rows(news, iama, ami, made)

write.csv(drugs,'drugs.csv')

Clean your text data and then tokenize it.

library(dplyr)
drugs <- read.csv('drugs.csv')

replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"
data("stop_words")

drugs_clean <- drugs %>% 
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "words") %>% 
  anti_join(stop_words, by = "word") %>% 
  filter(str_detect(word, "[a-z]"))

Generate a word cloud that illustrates the frequency of words except your keyword.

except_keywords <- c("drugs")
drugs_clean <- drugs_clean %>%
  filter(!str_detect(drugs_clean$word, paste(except_keywords, collapse = "|")))

drugs_clean %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2()

Conduct a tri-gram analysis.
- Extract tri-grams from your text data.
- Remove tri-grams containing stop words or non-alphabetic terms.
- Present the frequency of tri-grams in a table.
- Discuss any noteworthy tri-grams you come across.
- If no meaningful tri-grams are found, you may analyze bi-grams as well. However, you still need to show results of the tri-grams.

drugs_ngram <- drugs %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)

drugs_ngram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()

paired_words	n
NA	565
a lot of	27
i don t	27
ask me anything	24
when i was	22
i have a	21
i was a	12
i ended up	11
i was in	11
a drug addict	10
and i was	10
at the time	10
i ve been	10
i found out	9
i had a	9
i m a	9
i m just	9
i m not	9
i went to	9
in front of	9

drugs_ngram_trio <- drugs_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# filter rows where there are stop words under word 1 column and word 2 column
drugs_ngram_filter <- drugs_ngram_trio %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>% 
  # drop non-alphabet-only strings
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]")& str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)

drugs_ngram_filter %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2)&stri_enc_isascii(word3))

# Sort the new bi-gram (n=2) counts:
drugs_counts <- drugs_ngram_filter %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

head(drugs_counts, 20) %>% knitr::kable()

word1	word2	word3	n
happened	feel	free	2
initial	phone	call	2
shareutm_medium	ios_apputm_name	iossmf	2
utm_source	shareutm_medium	ios_apputm_name	2
0800a.m	eastern	time	1
11am	1pm	est	1
24h	mcdonalds	im	1
26th	birthday	drinking	1
2cb	dmt	psilocybin	1
2h30	trip	sitting	1
_by_showing_my_girlfriend_my_actual_strength	utm_source	shareutm_medium	1
_girl_is_cringe_the_guy_is_chad	utm_source	shareutm_medium	1
abortion	centre	laws	1
absolutely	dysfunctional	hysterical	1
absolutely	popped	positive	1
abuse	ama	edit	1
abusive	alcoholic	father	1
accelerated	spiritual	growth	1
accidental	fentanyl	od	1
accidentally	mother	missed	1

drugs_ngram_bi <- drugs_ngram %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

## Warning: Expected 2 pieces. Additional pieces discarded in 35101 rows [219, 220, 221,
## 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
## 238, ...].

drugs_ngram_bi_filter <- drugs_ngram_bi %>%
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word) %>% 
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]"))


drugs_ngram_bi_filter %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2))

# Sort the new bi-gram (n=2) counts:
drugs_bi_counts <- drugs_ngram_bi_filter %>%
  count(word1, word2) %>%
  arrange(desc(n))

head(drugs_bi_counts, 20) %>% knitr::kable()

word1	word2	n
drug	addict	11
drug	dealers	7
feel	free	6
hard	drugs	6
mental	health	6
drug	dealer	5
multiple	times	5
days	ago	4
drug	tests	4
illegal	drugs	4
nights	week	4
parking	lot	4
phone	call	4
prescription	drugs	4
recreational	drugs	4
sell	drugs	4
smoke	weed	4
afab	people	3
ama	edit	3
ama	update	3

Bi-gram Analysis

Since it was challenging to extract meaningful insights from tri-grams, I performed a bi-gram analysis instead, which provided more interpretable results. Unsurprisingly, familiar terms associated with drugs appeared frequently, such as “drug addict,” “drug dealer,” “the feeling when using drugs,” and “hard drugs.”

Perform a sentiment analysis on your text data using a dictionary method that accommodates negations.
- You are welcome to apply a deep learning-based model to enrich your analysis, but employing the dictionary method is imperative.

library(sentimentr)
sentiment_drugs <- sentiment(drugs$text) %>%
  arrange(desc(sentiment)) 
                    
head(sentiment_drugs, 10) %>% 
  knitr::kable()

element_id	sentence_id	word_count	sentiment
588	50	85	2.1389343
424	52	39	1.2213775
654	5	15	1.1618950
445	47	7	1.0545209
628	5	20	1.0285913
571	13	5	1.0230011
445	54	11	0.9768968
546	10	2	0.9545942
628	9	2	0.9545942
475	13	34	0.9175174

Display 10 sample texts alongside their sentiment scores and evaluate the credibility of the sentiment analysis outcomes.

set.seed(123) 

drugs_sentiment_10 <- drugs %>%
  filter(nzchar(text) & !grepl("http[s]?://", text)) %>%
  mutate(sentiment_score = sapply(text, function(text) {
    sentiment(text)$sentiment[1]
  }))%>%
  mutate(sentiment_pn = case_when(sentiment_score > 0 ~ 'positive',
                                  sentiment_score == 0 ~ 'neutral',
                                  TRUE ~ 'negative'))

drugs_sentiment_sample <- drugs_sentiment_10 %>%
  sample_n(10) %>%
  mutate(title = strtrim(title, 50),
         text = strtrim(text, 50)) %>%
  arrange(desc(sentiment_score))
  

drugs_sentiment_sample %>%
  select(title, sentiment_score, sentiment_pn) %>%
  knitr::kable()

title	sentiment_score	sentiment_pn
My bf is diagnosed ASPD (diagnosed as being an act	0.2309401	positive
I was a high level drug supplier for 5 years. Neve	0.1788854	positive
My mom was a sociopath and manipulative drug addic	0.0000000	neutral
Ask a old man anything.	0.0000000	neutral
what is up with Ozempic??	-0.0416025	negative
I went through a major traumatic event that change	-0.0589256	negative
To buy drugs from the wrong phone number	-0.2041241	negative
Why is it that conservatives want less regulation	-0.2453739	negative
I was paralyzed for 2 and a half months and remain	-0.4244373	negative
Do the drug addicts realize the hell they are livi	-0.9009344	negative

Discuss intriguing insights derived from the sentiment analysis, supporting your observations with at least two plots.

drugs_sentiment_plot <- drugs_sentiment_10 %>% select(date_utc, title, text,subreddit, sentiment_score)

ggplot(drugs_sentiment_plot, aes(x = sentiment_score)) +
  geom_density(fill = "lightblue", alpha = 0.9) +
  labs(title = "Distribution of Sentiment", x = "Sentiment_score", y = "Density") +
  theme_dark()

drugs_sentiment_plot <- drugs_sentiment_10 %>% select(date_utc, title, text,subreddit, sentiment_score, sentiment_pn)

drugs_sentiment_year <- drugs_sentiment_10 %>%
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date)) %>%
  mutate(year = year(date),
         time = timestamp %>%
           anytime(tz = anytime:::getTZ()) %>%
           str_split('-| |:') %>%
           sapply(function(x) as.numeric(x[4])))

drugs_sentiment_year %>%
  ggplot(aes(x = year, fill = sentiment_pn)) +
  geom_bar(position = 'stack') +
  scale_x_continuous(breaks = seq(min(drugs_sentiment_year$year),
                                  max(drugs_sentiment_year$year),
                                  by = 1)) +
  scale_fill_brewer(palette = 'PuRd', direction = -1)

pn_ratio<- drugs_sentiment_year %>%
  group_by(year) %>%
  summarise(total_count = n(),
            negative_count = sum(sentiment_pn == "negative", na.rm = TRUE),
            positive_count = sum(sentiment_pn == "positive", na.rm = TRUE),
            neutral_count = sum(sentiment_pn == "neutral", na.rm = TRUE)) %>%
  mutate(negative_ratio = negative_count / total_count,
         positive_ratio = positive_count / total_count,
         neutral_ratio = neutral_count / total_count)

pn_ratio_plot <- pn_ratio %>%
  select(year, negative_ratio, positive_ratio, neutral_ratio) %>%
  pivot_longer(cols = c(negative_ratio, positive_ratio, neutral_ratio),
               names_to = "sentiment",
               values_to = "ratio")

ggplot(pn_ratio_plot, aes(x = year, y = ratio, color = sentiment, alpha = 0.5)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_x_continuous(breaks = seq(min(pn_ratio_plot$year), max(pn_ratio_plot$year), by = 1)) +
  labs(title = "Yearly Negative, Neutral and Positive Sentiment Ratios",
       x = "Year",
       y = "Sentiment Ratio",
       color = "Sentiment") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

However, when examining the overall proportion of each sentiment type in 2024—amid a drastic increase in the number of opinions—it is clear that the total number of comments has risen across all sentiment categories (negative, positive, and neutral).

Major3

Sujin Lee

2024-11-24

R Markdown

Bi-gram Analysis

However, when examining the overall proportion of each sentiment type in 2024—amid a drastic increase in the number of opinions—it is clear that the total number of comments has risen across all sentiment categories (negative, positive, and neutral).