# Example code from "Text Mining with R", Chapter 2
# Citation: Silge, J., & Robinson, D. (2017). Text Mining with R: A Tidy Approach. O'Reilly Media.
# What are the most common joy words in Emma?
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(textdata)
library(tidyr)
library(ggplot2)
# Build a one-word-per-row table of all six novels. `linenumber` is the line's
# position within its book; `chapter` is a running count of the chapter
# headings seen so far in that book.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
head(tidy_books)
## # A tibble: 6 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
# Now that the text is in a tidy format with one word per row, we are ready to do the sentiment analysis. First, let’s use the NRC lexicon and filter() for the joy words. Next, let’s filter() the data frame with the text from the books for the words from Emma and then use inner_join() to perform the sentiment analysis.
# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")
print(nrc_joy)
## # A tibble: 687 × 2
## word sentiment
## <chr> <chr>
## 1 absolution joy
## 2 abundance joy
## 3 abundant joy
## 4 accolade joy
## 5 accompaniment joy
## 6 accomplish joy
## 7 accomplished joy
## 8 achieve joy
## 9 achievement joy
## 10 acrobat joy
## # ℹ 677 more rows
# Count the NRC "joy" words that occur in Emma, most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
# Show the first rows of the tokenised corpus again before the next step.
tidy_books %>% head()
## # A tibble: 6 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
# We then use pivot_wider() so that we have negative and positive sentiment in separate columns, and lastly calculate a net sentiment (positive - negative)
# Net Bing sentiment (positive minus negative word count) for every block of
# 80 source lines in each novel.
# The join key is spelled out instead of being guessed from shared column
# names, and the many-to-many relationship is declared explicitly: some words
# match more than one lexicon row, and each match is meant to be counted.
jane_austen_sentiment <- tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
head(jane_austen_sentiment)
## # A tibble: 6 × 5
## book index negative positive sentiment
## <fct> <dbl> <int> <int> <int>
## 1 Sense & Sensibility 0 16 32 16
## 2 Sense & Sensibility 1 19 53 34
## 3 Sense & Sensibility 2 12 31 19
## 4 Sense & Sensibility 3 15 31 16
## 5 Sense & Sensibility 4 16 34 18
## 6 Sense & Sensibility 5 16 51 35
# Now we can plot these sentiment scores across the plot trajectory of each novel.
# One panel per novel; each bar is the net sentiment of one 80-line block.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
# Comparing the three sentiment dictionaries:
# Now filter Pride & Prejudice book
# Restrict the tokenised corpus to a single novel.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
pride_prejudice %>% head()
## # A tibble: 6 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
# Now create a net sentiment for each part of the book using each of the lexicons
# AFINN assigns each word a numeric score in `value`; sum the scores within
# each block of 80 source lines.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
afinn %>% head()
## # A tibble: 6 × 3
## index sentiment method
## <dbl> <dbl> <chr>
## 1 0 29 AFINN
## 2 1 0 AFINN
## 3 2 20 AFINN
## 4 3 30 AFINN
## 5 4 62 AFINN
## 6 5 66 AFINN
# Net sentiment per 80-line block of Pride & Prejudice for the two
# categorical lexicons (Bing, and NRC restricted to positive/negative).
# Both joins name their key and declare the many-to-many relationship, so the
# fan-out from words that match several lexicon rows is explicit rather than
# reported as an unexpected-relationship warning.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(
      get_sentiments("bing"),
      by = "word",
      relationship = "many-to-many"
    ) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word",
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
head(bing_and_nrc)
## # A tibble: 6 × 5
## method index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 Bing et al. 0 7 21 14
## 2 Bing et al. 1 20 19 -1
## 3 Bing et al. 2 16 20 4
## 4 Bing et al. 3 19 31 12
## 5 Bing et al. 4 23 47 24
## 6 Bing et al. 5 15 49 34
# Plot net sentiment:
# Stack the three per-lexicon results and draw one panel per method.
ggplot(
  bind_rows(afinn, bing_and_nrc),
  aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
# Let’s look briefly at how many positive and negative words are in these lexicons.
# Number of positive and negative entries in the NRC lexicon.
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# Number of entries per sentiment label in the Bing lexicon.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# Most common positive and negative words:
# How often each Bing-labelled word occurs across all novels, most frequent
# first. The join names its key and declares the many-to-many relationship so
# the duplicate lexicon matches are explicit rather than warned about.
# count() on an ungrouped table already returns an ungrouped result, so no
# trailing ungroup() is needed.
bing_word_counts <- tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE)
head(bing_word_counts)
## # A tibble: 6 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
# Top ten words per sentiment, drawn as horizontal bars ordered by count.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
# Now add word “miss” to custom words:
# Prepend a one-row custom entry for "miss" to the standard stop-word table.
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)
custom_stop_words %>% head()
## # A tibble: 6 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
# Wordclouds:
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the 100 most frequent non-stop-words across all novels.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining with `by = join_by(word)`
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Comparison cloud of negative vs. positive Bing words.
# acast() reshapes the counts into a word-by-sentiment matrix (0 where a word
# has no count for a label), which is the input comparison.cloud() is given.
# The join names its key and declares the many-to-many relationship so the
# duplicate lexicon matches are explicit rather than warned about.
tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
# Looking at units beyond just words:
# Tokenise Pride & Prejudice with the sentence tokenizer instead of by word.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  sentence,
  text,
  token = "sentences"
)
p_and_p_sentences %>% head()
## # A tibble: 6 × 1
## sentence
## <chr>
## 1 pride and prejudice
## 2 by jane austen
## 3 chapter 1
## 4 it is a truth universally acknowledged, that a single man in possession
## 5 of a good fortune, must be in want of a wife.
## 6 however little known the feelings or views of such a man may be on his
# Inspect the second tokenised sentence.
p_and_p_sentences[["sentence"]][2]
## [1] "by jane austen"
# Split each novel into chapter-sized chunks by tokenising on the chapter
# heading pattern; grouping by book keeps the split within each novel.
# NOTE(review): regex alternation binds loosest, so this pattern reads as
# `Chapter` OR `CHAPTER [\dIVXLC]`; only the upper-case form requires a
# following numeral. Confirm that is intended before changing it.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()
austen_chapters %>% head()
## # A tibble: 6 × 2
## book chapter
## <fct> <chr>
## 1 Sense & Sensibility "sense and sensibility\n\nby jane austen\n\n(1811)\n\n\n\…
## 2 Sense & Sensibility "\n\n\nthe family of dashwood had long been settled in su…
## 3 Sense & Sensibility "\n\n\nmrs. john dashwood now installed herself mistress …
## 4 Sense & Sensibility "\n\n\nmrs. dashwood remained at norland several months; …
## 5 Sense & Sensibility "\n\n\n\"what a pity it is, elinor,\" said marianne, \"th…
## 6 Sense & Sensibility "\n\n\nno sooner was her answer dispatched, than mrs. das…
# Number of chunks produced for each novel.
count(austen_chapters, book, name = "chapters")
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# Show the first chapter chunks again.
austen_chapters %>% head()
## # A tibble: 6 × 2
## book chapter
## <fct> <chr>
## 1 Sense & Sensibility "sense and sensibility\n\nby jane austen\n\n(1811)\n\n\n\…
## 2 Sense & Sensibility "\n\n\nthe family of dashwood had long been settled in su…
## 3 Sense & Sensibility "\n\n\nmrs. john dashwood now installed herself mistress …
## 4 Sense & Sensibility "\n\n\nmrs. dashwood remained at norland several months; …
## 5 Sense & Sensibility "\n\n\n\"what a pity it is, elinor,\" said marianne, \"th…
## 6 Sense & Sensibility "\n\n\nno sooner was her answer dispatched, than mrs. das…
# Negative half of the Bing lexicon, used below to filter tokens.
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")
bingnegative %>% head()
## # A tibble: 6 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
# Total number of word tokens in every chapter of every novel. The result
# stays grouped by `book`, as the recorded message below reports.
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarise(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
wordcounts %>% head()
## # A tibble: 6 × 3
## # Groups: book [1]
## book chapter words
## <fct> <int> <int>
## 1 Sense & Sensibility 0 7
## 2 Sense & Sensibility 1 1571
## 3 Sense & Sensibility 2 1970
## 4 Sense & Sensibility 3 1538
## 5 Sense & Sensibility 4 1952
## 6 Sense & Sensibility 5 1030
# For each novel, find the chapter with the highest share of negative words.
# summarise() leaves the data grouped by `book`, which is what makes
# slice_max() pick one chapter per novel; chapter 0 is filtered out first.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarise(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
# Show the tokenised Austen corpus once more before switching books.
tidy_books %>% head()
## # A tibble: 6 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
# Now, let’s use the gutenbergr library to analyze another book, “Moby Dick” by Herman Melville, using a similar approach.
library(gutenbergr)
# Download Project Gutenberg text 2701 from the given mirror.
moby_dick <- gutenberg_download(
  gutenberg_id = 2701,
  mirror = "http://www.gutenberg.lib.md.us/"
)
moby_dick %>% head()
## # A tibble: 6 × 2
## gutenberg_id text
## <int> <chr>
## 1 2701 "MOBY-DICK;"
## 2 2701 ""
## 3 2701 "or, THE WHALE."
## 4 2701 ""
## 5 2701 "By Herman Melville"
## 6 2701 ""
# Tokenise Moby Dick into one word per row.
# Punctuation is replaced by a space instead of being deleted: deleting it
# fuses words joined by a hyphen or dash (the title "MOBY-DICK" became the
# single token "mobydick"), and fused tokens cannot match any lexicon entry.
# Straight and curly apostrophes are kept so contractions stay intact.
# NOTE: the printed output recorded after this chunk was captured before this
# change; re-run the script to refresh it.
tidy_moby_dick <- moby_dick %>%
  mutate(
    text = str_replace_all(text, "[^[:alnum:][:space:]'\u2019]", " ")
  ) %>%
  unnest_tokens(word, text)
head(tidy_moby_dick)
## # A tibble: 6 × 2
## gutenberg_id word
## <int> <chr>
## 1 2701 mobydick
## 2 2701 or
## 3 2701 the
## 4 2701 whale
## 5 2701 by
## 6 2701 herman
# Perform sentiment analysis:
# Using NRC lexicon for joy words
# Rebuild the NRC "joy" word list and count its words in Moby Dick, most
# frequent first.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")
tidy_moby_dick %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 409 × 2
## word n
## <chr> <int>
## 1 good 195
## 2 god 118
## 3 found 115
## 4 true 85
## 5 sun 83
## 6 present 79
## 7 green 48
## 8 mighty 47
## 9 art 41
## 10 hope 34
## # ℹ 399 more rows
# Net Bing sentiment along the course of Moby Dick.
# `index` is computed from row_number() AFTER the join, so each bin holds 80
# lexicon-matched words in text order (not 80 lines of text as in the Austen
# analysis above).
# The join names its key and declares the many-to-many relationship so the
# duplicate lexicon matches are explicit rather than warned about.
moby_dick_sentiment <- tidy_moby_dick %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(index = row_number() %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
head(moby_dick_sentiment)
## # A tibble: 6 × 4
## index negative positive sentiment
## <dbl> <int> <int> <int>
## 1 0 43 36 -7
## 2 1 40 40 0
## 3 2 40 40 0
## 4 3 28 52 24
## 5 4 48 32 -16
## 6 5 49 31 -18
# Bar chart of net sentiment per bin.
moby_dick_sentiment %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Sentiment Analysis of Moby Dick",
    x = "Index",
    y = "Sentiment Score"
  )
# Compare sentiment dictionaries:
# AFINN lexicon
# Sum AFINN scores per bin of 80 matched words; the bin index is derived from
# the row position after the join.
afinn <- tidy_moby_dick %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = row_number() %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
# Bing and NRC lexicons
# Net sentiment per bin of 80 matched words for the Bing and NRC lexicons.
# The bin index is computed separately inside each lexicon's branch, before
# the rows are stacked. Numbering the stacked rows instead would make the NRC
# bins continue where the Bing bins stop, so the methods would not share a
# common x axis in the comparison plot.
# Both joins name their key and declare the many-to-many relationship so the
# duplicate lexicon matches are explicit rather than warned about.
bing_and_nrc <- bind_rows(
  tidy_moby_dick %>%
    inner_join(
      get_sentiments("bing"),
      by = "word",
      relationship = "many-to-many"
    ) %>%
    mutate(method = "Bing et al.", index = row_number() %/% 80),
  tidy_moby_dick %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word",
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC", index = row_number() %/% 80)
) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Stack the per-lexicon results and draw one panel per method.
ggplot(
  bind_rows(afinn, bing_and_nrc),
  aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y") +
  labs(
    title = "Sentiment Analysis Comparison for Moby Dick",
    x = "Index",
    y = "Sentiment Score"
  )
# Now, let’s use the sentimentr package to perform sentiment analysis on “Moby Dick”:
library(sentimentr)
library(gutenbergr)
# Fetch the raw text again so this section starts from the unprocessed lines.
moby_dick <- gutenberg_download(
  gutenberg_id = 2701,
  mirror = "http://www.gutenberg.lib.md.us/"
)
moby_dick %>% head()
## # A tibble: 6 × 2
## gutenberg_id text
## <int> <chr>
## 1 2701 "MOBY-DICK;"
## 2 2701 ""
## 3 2701 "or, THE WHALE."
## 4 2701 ""
## 5 2701 "By Herman Melville"
## 6 2701 ""
# prepare the text data:
# Prepare the text data: split the novel into sentences.
# Punctuation is left in place because the sentence tokenizer needs ".", "!"
# and "?" to find boundaries; stripping it first left one pseudo-sentence per
# source line. Grouping by `gutenberg_id` lets the tokenizer work on the
# book's text as a whole, so a sentence may span several source lines (the
# same group_by/unnest_tokens/ungroup pattern is used for the Austen chapters
# earlier in this script).
# NOTE: the printed output recorded after this chunk was captured before this
# change; re-run the script to refresh it.
tidy_moby_dick <- moby_dick %>%
  group_by(gutenberg_id) %>%
  unnest_tokens(sentence, text, token = "sentences") %>%
  ungroup()
head(tidy_moby_dick)

# Perform sentiment analysis using sentimentr.
# The text is passed through get_sentences() first, as sentimentr recommends
# for raw character vectors. sentimentr does its own sentence splitting, which
# may disagree with the tokenizer above, so sentiment_by() is used: it yields
# one averaged score per input element, which keeps the scores aligned
# row-for-row with tidy_moby_dick.
sentiment_scores <- sentiment_by(get_sentences(tidy_moby_dick$sentence))

# Add the per-sentence score to the original data.
tidy_moby_dick <- tidy_moby_dick %>%
  mutate(sentiment = sentiment_scores$ave_sentiment)
head(tidy_moby_dick)
## # A tibble: 6 × 3
## gutenberg_id sentence sentiment
## <int> <chr> <dbl>
## 1 2701 mobydick 0
## 2 2701 or the whale 0
## 3 2701 by herman melville 0
## 4 2701 contents 0
## 5 2701 etymology 0
## 6 2701 extracts supplied by a subsublibrarian 0
# aggregate sentiment scores by index
# Total sentiment for each run of 80 consecutive rows.
moby_dick_sentiment <- tidy_moby_dick %>%
  group_by(index = row_number() %/% 80) %>%
  summarise(sentiment = sum(sentiment))
moby_dick_sentiment %>% head()
## # A tibble: 6 × 2
## index sentiment
## <dbl> <dbl>
## 1 0 0.287
## 2 1 1.24
## 3 2 -0.424
## 4 3 0.223
## 5 4 0.201
## 6 5 -1.25
# Bar chart of the aggregated sentimentr scores.
moby_dick_sentiment %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Sentiment Analysis of Moby Dick",
    x = "Index",
    y = "Sentiment Score"
  )
# Conclusion:
# The graph above shows the emotional ups and downs in Moby Dick.
# Overall, the emotional tone of Moby Dick swings between positive and negative, leaning more toward the negative side.