library(tm)
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(textdata)
## Warning: package 'textdata' was built under R version 4.1.3
library(tidyr)
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.1.3
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
tidy_books
jane_austen_sentiment
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
pride_prejudice
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"
p_and_p_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
I really wanted to try sentiment analysis on WSB (r/wallstreetbets), a very popular subreddit responsible for a lot of the unusual retail trading positions of late!
raw <- read.csv("WSB_Posts/reddit_wsb.csv", header = TRUE)
raw
The Loughran-McDonald lexicon terms all look like rather useful financial-statement words, but I really want to know how they hold up on Wall Street Bets style data (https://www.reddit.com/r/wallstreetbets/)!
get_sentiments("loughran")
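Just to get a sense of the lexicon's shape, here is a quick illustrative count (not part of the original analysis) of how many terms fall into each of its sentiment categories:
# How many Loughran-McDonald terms fall into each sentiment category?
get_sentiments("loughran") %>%
  count(sentiment, sort = TRUE)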
s <- strsplit(raw$body, split = " ")
wsb_codex <- data.frame(
  ID = rep(raw$id, sapply(s, length)),
  word = gsub("[^[:alnum:][:space:]]", "", str_trim(tolower(unlist(s)))))
wsb_codex
At this point we have one word per row, so let's make a word cloud of our data!
wsb_codex %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 300))
## Joining, by = "word"
It looks pretty interesting! I love how you can see AMC, GME, and Short, all of which were heavily discussed topics on the subreddit.
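As a quick sanity check on that impression, here is a small sketch counting a few of those terms directly (they were lowercased when wsb_codex was built, so we look for the lowercase forms):
# Rough check: how often do a few of the hot tickers/terms actually appear?
wsb_codex %>%
  filter(word %in% c("gme", "amc", "short")) %>%
  count(word, sort = TRUE)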
wsb_sentiment_counts <- wsb_codex %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, ID, sentiment, sort = TRUE) %>%
  rename(word_count = n) %>%
  ungroup()
## Joining, by = "word"
wsb_sentiment_counts
First, let's pull out each sentiment category from the Loughran set:
wsb_neg <- get_sentiments("loughran") %>%
  filter(sentiment == "negative")
wsb_pos <- get_sentiments("loughran") %>%
  filter(sentiment == "positive")
wsb_unc <- get_sentiments("loughran") %>%
  filter(sentiment == "uncertainty")
wsb_lit <- get_sentiments("loughran") %>%
  filter(sentiment == "litigious")
Now let's work through an example on the data, focusing on the negative set:
head(wsb_codex)
neg_codex <- wsb_codex %>%
  semi_join(wsb_neg) %>%
  group_by(ID) %>%
  summarize(neg_words = n())
## Joining, by = "word"
neg_codex <- distinct(neg_codex, ID, .keep_all = TRUE)
head(neg_codex)
neg_codex <- select(neg_codex, c("ID","neg_words"))
neg_codex
So now we will turn that into a function:
# Count how many words from the supplied lexicon subset appear in each post.
sentiment_aug <- function(codex, purpose) {
  out_codex <- codex %>%
    semi_join(purpose) %>%
    group_by(ID) %>%
    summarize(words = n())
  out_codex <- distinct(out_codex, ID, .keep_all = TRUE)
  out_codex <- select(out_codex, c("ID", "words"))
  out_codex
}
And just to ensure the function applies properly, let's compare its output against the data frame we built manually (the only expected difference is the column name):
test <- sentiment_aug(wsb_codex,wsb_neg)
## Joining, by = "word"
test
all_equal(test, neg_codex)
## Cols in `y` but not `x`: `neg_words`.
## Cols in `x` but not `y`: `words`.
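The mismatch reported above is only the column name. Renaming the generic words column before comparing should make the two frames come back as equal:
# After aligning the column name, the two data frames should compare as equal.
all_equal(rename(test, neg_words = words), neg_codex)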
What we will do here is a somewhat messy series of steps: augment the data frame with a word count for each sentiment category, then tie everything together by post ID. (A more compact alternative is sketched after the final table below.)
final_codex <- sentiment_aug(wsb_codex, wsb_neg)
## Joining, by = "word"
final_codex <- final_codex %>%
  rename(neg_words = words) %>%
  left_join(sentiment_aug(wsb_codex, wsb_pos))
## Joining, by = "word"
## Joining, by = "ID"
final_codex <- final_codex %>%
  rename(positive = words) %>%
  left_join(sentiment_aug(wsb_codex, wsb_unc))
## Joining, by = "word"
## Joining, by = "ID"
final_codex <- final_codex %>%
  rename(uncertain = words) %>%
  left_join(sentiment_aug(wsb_codex, wsb_lit))
## Joining, by = "word"
## Joining, by = "ID"
final_codex <- final_codex %>% rename(lit = words)
final_codex
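For comparison, the same per-post counts could likely be built in a single pass by mirroring the count() plus pivot_wider() pattern used on the Austen data above; a sketch using the existing wsb_codex:
# Sketch: count every Loughran sentiment per post in one pass, then spread the
# categories into columns (posts with no lexicon hits simply drop out).
wsb_codex %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  count(ID, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)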
At this point, we have managed to attach sentiment to posts on the popular WSB subreddit. This could have a lot of fun uses, especially if you factor in a ticker symbol filter!
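To illustrate what that filter might look like, here is a minimal sketch that keeps only the sentiment rows for posts whose body mentions a single ticker (GME is just an example choice):
# Sketch: IDs of posts mentioning "gme", used to filter the sentiment table.
gme_ids <- wsb_codex %>%
  filter(word == "gme") %>%
  distinct(ID)
final_codex %>%
  semi_join(gme_ids, by = "ID")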
I'd love to see this sentiment plotted over time, but I didn't have enough time to complete it! A rough sketch of how it might work is below.
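Assuming the Kaggle CSV carries a post timestamp column (the column name timestamp below is an assumption about that dataset, not something used elsewhere in this analysis), daily net sentiment could be sketched roughly like this:
# Sketch: daily net (positive minus negative) Loughran sentiment.
# Assumes raw has a parseable timestamp column; missing counts are treated as zero.
raw %>%
  transmute(ID = id, date = as.Date(timestamp)) %>%
  inner_join(final_codex, by = "ID") %>%
  group_by(date) %>%
  summarize(net_sentiment = sum(positive, na.rm = TRUE) -
              sum(neg_words, na.rm = TRUE)) %>%
  ggplot(aes(date, net_sentiment)) +
  geom_col()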
WSB Post Data: https://www.kaggle.com/datasets/gpreda/reddit-wallstreetsbets-posts/code
Silge, Julia, and David Robinson. “2 Sentiment Analysis with Tidy Data.” Text Mining with R, https://www.tidytextmining.com/sentiment.html.
Finn Årup Nielsen, “A new ANEW: evaluation of a word list for sentiment analysis in microblogs”, Proceedings of the ESWC2011 Workshop on ‘Making Sense of Microposts’: Big things come in small packages. Volume 718 in CEUR Workshop Proceedings: 93-98. 2011 May. Matthew Rowe, Milan Stankovic, Aba-Sah Dadzie, Mariann Hardey (editors)
This dataset was first published in Minqing Hu and Bing Liu, “Mining and summarizing customer reviews.”, Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD-2004), 2004.
This dataset was published in Loughran, T. and McDonald, B. (2011), “When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10-Ks.” The Journal of Finance, 66: 35-65.
This dataset was published in Saif Mohammad and Peter Turney. (2013), “Crowdsourcing a Word-Emotion Association Lexicon.” Computational Intelligence, 29(3): 436-465.