The Sentiments datasets
There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text.
AFINN, bing, nrc
bing categorizes words in a binary fashion into positive or negative. nrc categorizes words into positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust. AFINN assigns a score between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
The function get_sentiments() allows us to get a specific sentiment lexicon with the measures for each one.
# Install textdata only when it is not already available, so re-running the
# script does not reinstall the package every time.
if (!requireNamespace("textdata", quietly = TRUE)) {
  install.packages("textdata")
}
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(tidytext)
library(textdata)
# Fetch the AFINN lexicon (word + numeric value) and display it.
affin <- get_sentiments(lexicon = "afinn")
print(affin)
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
Let's look at bing.
# Fetch the bing lexicon (word + positive/negative label) and display it.
bing <- get_sentiments(lexicon = "bing")
print(bing)
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
And lastly nrc
# Fetch the nrc lexicon (word + sentiment category) and display it.
nrc <- get_sentiments(lexicon = "nrc")
print(nrc)
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
These lexicons were created either through crowdsourcing (for example, via Amazon Mechanical Turk) or by the labor of one of the authors, and then validated with crowdsourcing.
Let's look at the words with a joy score from NRC.
library(gutenbergr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Download the four texts by Gutenberg ID from the given mirror.
darwin <- gutenberg_download(
  gutenberg_id = c(944, 1227, 1228, 2300),
  mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg"
)

# Within each text: number the lines, keep a running count of lines that start
# with "chapter" followed by a digit or a roman-numeral letter (case-insensitive),
# then split the text into one word per row.
tidy_books <- darwin %>%
  group_by(gutenberg_id) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
Let's use the book title instead of the Gutenberg ID (GID).
# Replace the numeric ID in the first column with a readable `book` title.
# The mapping is done in one step so the column is never compared against
# numbers after part of it has already been converted to text.
# NOTE: output recorded further down in this document still shows the old,
# misspelled "Belat…" label for the fourth book; it predates this fix.
tidy_books <- tidy_books %>%
  rename(book = 1) %>%
  mutate(
    book = case_match(
      book,
      944 ~ "The Voyage of the Beagle",
      1227 ~ "The Expression of the Emotions in Man and Animals",
      1228 ~ "On the origin of species By Means of Natural selection",
      # Typo fixed: "Belation" -> "Relation".
      2300 ~ "The Descent of Man, and selection in Relation to sex",
      # Any other ID is kept, as text.
      .default = as.character(book)
    )
  )
tidy_books
## # A tibble: 786,575 × 4
## book linenumber chapter word
## <chr> <int> <int> <chr>
## 1 The Voyage of the Beagle 1 0 the
## 2 The Voyage of the Beagle 1 0 voyage
## 3 The Voyage of the Beagle 1 0 of
## 4 The Voyage of the Beagle 1 0 the
## 5 The Voyage of the Beagle 1 0 beagle
## 6 The Voyage of the Beagle 1 0 by
## 7 The Voyage of the Beagle 2 0 charles
## 8 The Voyage of the Beagle 2 0 darwin
## 9 The Voyage of the Beagle 8 0 about
## 10 The Voyage of the Beagle 8 0 the
## # ℹ 786,565 more rows
Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First, let's use NRC.
# Keep only the NRC entries tagged with the "joy" sentiment.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent joy words in The Voyage of the Beagle.
# The title must match the label stored in `book` exactly: the comparison is
# case-sensitive, and the previous "The voyage of the Beagle" (lower-case "v")
# matched no rows. The empty 0 x 2 tibble recorded below predates this fix.
tidy_books %>%
  filter(book == "The Voyage of the Beagle") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 0 × 2
## # ℹ 2 variables: word <chr>, n <int>
We can also examine how sentiment changes throughout a work.
library(tidyr)
# Net bing sentiment (positive minus negative word count) per book, computed
# over consecutive 80-line sections.
Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
Now let's plot it.
library(ggplot2)
# One panel per book: net sentiment plotted against section index, with an
# independent x axis for each panel.
ggplot(
  data = Charles_Darwin_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
There are several options for sentiment lexicons, and you might want some more information on which is appropriate for your purpose. Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of The Voyage of the Beagle (TVOTB).
library(tidyr)
# Keep only the rows that belong to The Voyage of the Beagle and display them.
voyage <- filter(tidy_books, book == "The Voyage of the Beagle")
print(voyage)
## # A tibble: 208,118 × 4
## book linenumber chapter word
## <chr> <int> <int> <chr>
## 1 The Voyage of the Beagle 1 0 the
## 2 The Voyage of the Beagle 1 0 voyage
## 3 The Voyage of the Beagle 1 0 of
## 4 The Voyage of the Beagle 1 0 the
## 5 The Voyage of the Beagle 1 0 beagle
## 6 The Voyage of the Beagle 1 0 by
## 7 The Voyage of the Beagle 2 0 charles
## 8 The Voyage of the Beagle 2 0 darwin
## 9 The Voyage of the Beagle 8 0 about
## 10 The Voyage of the Beagle 8 0 the
## # ℹ 208,108 more rows
Let's again use integer division (`%/%`) to define larger sections of the text that span multiple lines, and we can use the same pattern with `count()`, `pivot_wider()`, and `mutate()` to find the net sentiment in each of these sections of text.
# AFINN: sum the word scores within each 80-line section of `voyage`.
# Note that this reuses the name `affin`, replacing whatever it held before.
affin <- voyage %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
# Net sentiment (positive minus negative word count) per 80-line section of
# `voyage`, computed separately with the bing lexicon and with the
# positive/negative subset of the nrc lexicon.
bing_and_nrc <- bind_rows(
  voyage %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  voyage %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      # Words repeat in the text and some words appear more than once in the
      # filtered lexicon, so the join is many-to-many by design. Declaring it
      # removes the dplyr warning recorded below, which predates this change.
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1154 of `x` matches multiple rows in `y`.
## ℹ Row 4245 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
We can now estimate the net sentiment (positive − negative) in each chunk of the novel text for each lexicon (dictionary). Let's bind them all together and visualize with ggplot.
# Stack the per-method results and draw one panel per lexicon, each with its
# own y axis.
ggplot(
  data = bind_rows(affin, bing_and_nrc),
  mapping = aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
Let's look at the counts based on each dictionary.
# Number of positive and negative entries in the nrc lexicon.
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# Number of entries per sentiment label in the bing lexicon.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# How often each bing-lexicon word occurs across all books, most frequent first.
bing_word_counts <- count(
  inner_join(tidy_books, get_sentiments("bing")),
  word, sentiment,
  sort = TRUE
) %>%
  ungroup()
## Joining with `by = join_by(word)`
print(bing_word_counts)
## # A tibble: 2,492 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 great positive 1226
## 2 well positive 855
## 3 like positive 813
## 4 good positive 487
## 5 doubt negative 414
## 6 wild negative 317
## 7 respect positive 310
## 8 remarkable positive 295
## 9 important positive 281
## 10 bright positive 258
## # ℹ 2,482 more rows
This can be shown visually, and we can pipe straight into ggplot2
# Bar chart of the ten most frequent words for each bing sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # Order the words by count so the bars are sorted within the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  # `scales` is spelled out in full rather than relying on partial matching
  # of the abbreviated `scale`.
  facet_wrap(~sentiment, scales = "free_y") +
  # Capitalisation matches the axis label used by the later version of this plot.
  labs(x = "Contribution to Sentiment", y = NULL)
Let's spot an anomaly in the dataset.
# Put four hand-picked words, tagged with the lexicon label "custom", in front
# of the standard stop-word table, then display the result.
custom_stop_words <- tibble(
  word = c("wild", "dark", "great", "like"),
  lexicon = "custom"
) %>%
  bind_rows(stop_words)
print(custom_stop_words)
## # A tibble: 1,153 × 2
## word lexicon
## <chr> <chr>
## 1 wild custom
## 2 dark custom
## 3 great custom
## 4 like custom
## 5 a SMART
## 6 a's SMART
## 7 able SMART
## 8 about SMART
## 9 above SMART
## 10 according SMART
## # ℹ 1,143 more rows
# Same top-ten chart as before, but with the custom stop words removed first.
# Previously `custom_stop_words` was built and never applied, so this plot was
# identical to the unfiltered one.
bing_word_counts %>%
  anti_join(custom_stop_words, by = "word") %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # Order the words by count so the bars are sorted within the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  # `scales` is spelled out in full rather than relying on partial matching
  # of the abbreviated `scale`.
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to Sentiment", y = NULL)
word clouds!
We can see that tidy text mining and sentiment analysis work well with ggplot2, but having our data in tidy format leads to other nice graphing techniques. Let's use the wordcloud package!
library (wordcloud)
## Loading required package: RColorBrewer
# Drop stop words, count what is left, and draw a cloud of at most 100 words.
count(anti_join(tidy_books, stop_words), word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`
## Warning in wordcloud(word, n, max.words = 100): species could not be fit on
## page. It will not be plotted.
Let's also look at comparison.cloud(), which may require turning the data frame into a matrix.
We can change to matrix using the acast() function.
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Count bing-lexicon words by sentiment, reshape to a word-by-sentiment matrix
# (missing combinations filled with 0), and draw the comparison cloud.
count(
  inner_join(tidy_books, get_sentiments("bing")),
  word, sentiment,
  sort = TRUE
) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
## Joining with `by = join_by(word)`
Looking at units beyond words
Lots of useful work can be done by tokenizing at the word level, but sometimes it's nice to look at different units of text. For example, we can look beyond just unigrams.
Ex I am not having a good day.
# Negative entries of the bing lexicon.
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of words in every chapter of every book.
wordcounts <- summarize(group_by(tidy_books, book, chapter), words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
# For each book, find the chapter (excluding chapter 0) with the highest share
# of bing-negative words.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  filter(chapter != 0) %>%
  left_join(wordcounts, by = join_by(book, chapter)) %>%
  mutate(ratio = negativewords / words) %>%
  # The result of summarize() is still grouped by book, so this keeps the
  # top-ratio chapter within each book.
  slice_max(ratio, n = 1) %>%
  ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 5
## book chapter negativewords words ratio
## <chr> <int> <int> <int> <dbl>
## 1 On the origin of species By Means of Natur… 3 5 86 0.0581
## 2 The Descent of Man, and selection in Belat… 20 4 87 0.0460
## 3 The Expression of the Emotions in Man and … 10 249 4220 0.0590
## 4 The Voyage of the Beagle 10 375 11202 0.0335