This is an exploration of sentiment analysis using R packages. The code is taken from the book Text Mining with R: A Tidy Approach 1.
2.1 The sentiments datasets
library(tidytext)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
2.2 Sentiment analysis with inner join
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
2.3 Comparing the three sentiment dictionaries
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
* We can get a count of the positive and negative words in the different lexicons. Even though NRC gives an overall more positive impression of the work, its lexicon actually contains more negative terms than positive ones, roughly 3 negative words for every 2 positive.
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
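To make that ratio concrete, here is a small check (not from the book) that tallies both lexicons and computes negatives per positive; with the counts above it works out to roughly 1.4 for NRC and about 2.4 for bing.
# negative-to-positive ratio for the NRC and bing lexicons
bind_rows(
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    count(sentiment) %>%
    mutate(lexicon = "nrc"),
  get_sentiments("bing") %>%
    count(sentiment) %>%
    mutate(lexicon = "bing")) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(neg_per_pos = negative / positive)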
2.4 Most common positive and negative words
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
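The book stops at defining custom_stop_words; here is a minimal sketch of how it could be applied, assuming we only want to drop the custom ‘miss’ entry rather than the full stop-word list.
# re-count sentiment words with the custom "miss" entry removed
tidy_books %>%
  anti_join(filter(custom_stop_words, lexicon == "custom"), by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)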
2.5 Wordclouds
After removing stop words, the most common word in the Austen text is ‘time’.
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
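To check that claim without reading it off the cloud, a quick sketch (not in the original) prints the top counts after the same stop-word removal.
# top words after removing stop words, as a table instead of a cloud
tidy_books %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 5)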
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
2.6 Looking at units beyond just words
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
Here we load the text that we will use in the second half of this data exploration. We use the ‘gutenbergr’ package to load some books by Mark Twain.
library(gutenbergr)
library(SentimentAnalysis)
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# list Twain's works to look up the Gutenberg IDs used below
temp <- gutenberg_works(author == "Twain, Mark")
twain <- gutenberg_download(c("74", "76", "86", "1837"))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#split the books up to remove the index and preface
t1 <- twain %>% filter(gutenberg_id == 74)
t1 <- t1[459:nrow(t1),]
t2 <- twain %>% filter(gutenberg_id == 76)
t2 <- t2[514:nrow(t2),]
t3 <- twain %>% filter(gutenberg_id == 86)
t3 <- t3[332:nrow(t3),]
t4 <- twain %>% filter(gutenberg_id == 1837)
t4 <- t4[444:nrow(t4),]
twain_books <- bind_rows(t1, t2, t3, t4)
twain_books %>% group_by(gutenberg_id) %>% summarise(n())
## # A tibble: 4 × 2
## gutenberg_id `n()`
## <int> <int>
## 1 74 8413
## 2 76 11495
## 3 86 12580
## 4 1837 7706
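The row offsets above (459, 514, 332, 444) were found by inspecting the raw text. As an alternative, a small sketch of how one might list candidate chapter-heading rows using the same chapter regex as below; note that the table of contents can also match, so the result still needs a manual check.
# list candidate chapter-heading rows for one book (e.g. Tom Sawyer, id 74)
twain %>%
  filter(gutenberg_id == 74) %>%
  pull(text) %>%
  str_which(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
  head()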
2.2 Sentiment analysis with inner join
Before we can do sentiment analysis we need to further organize the data. The Twain data is not as clean as the Austen data: there are a number of rows we don't want included (preface, table of contents, table of illustrations).
tidy_books <- twain_books %>%
group_by(gutenberg_id) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(
text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)
))) %>%
ungroup() %>%
unnest_tokens(word, text) %>% mutate(
title = case_when(
gutenberg_id == 74 ~ "The Adventures of Tom Sawyer",
gutenberg_id == 76 ~ "Adventures of Huckleberry Finn",
gutenberg_id == 86 ~ "A Connecticut Yankee in King Arthur's Court",
gutenberg_id == 1837 ~ "The Prince and the Pauper"
)
)
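The case_when() above gets repeated later when we compute sentiment. A lookup table is one way to write the id-to-title mapping once; this is just a sketch with a hypothetical twain_titles tibble, not something the original code uses.
# map Gutenberg IDs to titles once, then join wherever a title column is needed
twain_titles <- tibble(
  gutenberg_id = c(74, 76, 86, 1837),
  title = c("The Adventures of Tom Sawyer",
            "Adventures of Huckleberry Finn",
            "A Connecticut Yankee in King Arthur's Court",
            "The Prince and the Pauper"))
# e.g. twain_books %>% left_join(twain_titles, by = "gutenberg_id")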
We use the ‘nrc’ lexicon to analyze ‘joy’ in The Adventures of Tom Sawyer. We find that the most common words associated with ‘joy’ are ‘good’ and ‘found’.
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(gutenberg_id == 74) %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 280 × 2
## word n
## <chr> <int>
## 1 good 105
## 2 found 83
## 3 hope 30
## 4 money 30
## 5 mighty 26
## 6 church 24
## 7 treasure 24
## 8 kind 23
## 9 glad 21
## 10 finally 19
## # … with 270 more rows
We use the bing lexicon to calculate the positive and negative sentiment for four works by Mark Twain.
library(tidyr)
twain_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(gutenberg_id, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative, title = case_when(
gutenberg_id == 74 ~ "The Adventures of Tom Sawyer",
gutenberg_id == 76 ~ "Adventures of Huckleberry Finn",
gutenberg_id == 86 ~ "A Connecticut Yankee in King Arthur's Court",
gutenberg_id == 1837 ~ "The Prince and the Pauper"
)
)
## Joining, by = "word"
Here we plot the sentiment in 80-line segments. Of the four chosen works, Adventures of Huckleberry Finn appears to be the most positive and The Adventures of Tom Sawyer the most negative.
library(ggplot2)
ggplot(twain_sentiment, aes(index, sentiment, fill = title)) +
geom_col(show.legend = FALSE) +
facet_wrap(~title, ncol = 2, scales = "free_x")
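As a rough check on that reading of the plot, a short sketch (not in the original) sums the per-chunk net sentiment for each title.
# net and average per-chunk sentiment for each work
twain_sentiment %>%
  group_by(title) %>%
  summarise(net = sum(sentiment),
            mean_per_chunk = mean(sentiment)) %>%
  arrange(desc(mean_per_chunk))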
2.3 Comparing the three sentiment dictionaries
tom_sawyer <- tidy_books %>%
filter(title == "The Adventures of Tom Sawyer")
tom_sawyer
## # A tibble: 71,125 × 5
## gutenberg_id linenumber chapter word title
## <int> <int> <int> <chr> <chr>
## 1 74 2 1 chapter The Adventures of Tom Sawyer
## 2 74 2 1 i The Adventures of Tom Sawyer
## 3 74 5 1 tom The Adventures of Tom Sawyer
## 4 74 7 1 no The Adventures of Tom Sawyer
## 5 74 7 1 answer The Adventures of Tom Sawyer
## 6 74 9 1 tom The Adventures of Tom Sawyer
## 7 74 11 1 no The Adventures of Tom Sawyer
## 8 74 11 1 answer The Adventures of Tom Sawyer
## 9 74 13 1 what’s The Adventures of Tom Sawyer
## 10 74 13 1 gone The Adventures of Tom Sawyer
## # … with 71,115 more rows
We take three separate lexicons and calculate the sentiment in 80-line segments for The Adventures of Tom Sawyer.
afinn <- tom_sawyer %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
tom_sawyer %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
tom_sawyer %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
We bind each lexicon’s sentiment and plot them side by side. The three approaches give surprisingly similar plots.
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
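To put a number on "surprisingly similar", here is a sketch (not from the book) that correlates the per-chunk scores across the three methods; chunks missing from one lexicon become NA, hence pairwise.complete.obs.
# correlation of per-chunk sentiment scores across the three methods
bind_rows(afinn, bing_and_nrc) %>%
  select(method, index, sentiment) %>%
  pivot_wider(names_from = method, values_from = sentiment) %>%
  select(-index) %>%
  cor(use = "pairwise.complete.obs")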
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
2.4 Most common positive and negative words
When we count positive and negative words again, ‘well’ and ‘good’ are the most common. I wonder if that is because common idioms contain those words. The most common negative word is ‘poor’. Mark Twain struggled with money throughout his life, and I wonder if his own financial insecurity crept into his work.
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,616 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 well positive 924
## 2 good positive 762
## 3 like positive 649
## 4 right positive 600
## 5 great positive 315
## 6 enough positive 306
## 7 poor negative 295
## 8 pretty positive 236
## 9 work positive 233
## 10 dead negative 229
## # … with 2,606 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("nigger"),
lexicon = c("custom")),
stop_words)
2.5 Wordclouds
library(wordcloud)
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
library(reshape2)
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
2.6 Looking at units beyond just words
# split The Adventures of Tom Sawyer (t1) into sentences
tom_sentences <- tibble(text = t1$text) %>%
unnest_tokens(sentence, text, token = "sentences")
tom_sentences$sentence[2]
## [1] "“tom!”"
twain_chapters <- twain_books %>%
group_by(gutenberg_id) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
twain_chapters %>%
group_by(gutenberg_id) %>%
summarise(chapters = n())
## # A tibble: 4 × 2
## gutenberg_id chapters
## <int> <int>
## 1 74 36
## 2 76 43
## 3 86 45
## 4 1837 47
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(gutenberg_id, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(gutenberg_id, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
## # A tibble: 4 × 5
## gutenberg_id chapter negativewords words ratio
## <int> <int> <int> <int> <dbl>
## 1 74 24 25 411 0.0608
## 2 76 13 76 2056 0.0370
## 3 86 17 170 3167 0.0537
## 4 1837 26 81 1393 0.0581
Now let’s add an additional lexicon: ‘DictionaryGI’, from the SentimentAnalysis package. It is a lexicon of opinionated words from the Harvard-IV dictionary as used in the General Inquirer software, a tool developed for textual content analysis.
We can use this new lexicon to calculate the percentage of words that are negative, just as we did with bing. The DictionaryGI lexicon gives rates similar to bing.
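DictionaryGI is a list with negative and positive character vectors. The as.data.frame() calls below work because the column they create is literally named DictionaryGI$negative, which is why the semi_join() spells that out; a slightly tidier sketch would build plain tibbles with a word column.
# tidier alternative: tibbles with a plain "word" column
GI_neg_tbl <- tibble(word = DictionaryGI$negative)
GI_pos_tbl <- tibble(word = DictionaryGI$positive)
# e.g. tidy_books %>% semi_join(GI_neg_tbl, by = "word")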
GI_neg <- as.data.frame(DictionaryGI$negative)
GI_pos <- as.data.frame(DictionaryGI$positive)
wordcounts <- tidy_books %>%
group_by(gutenberg_id, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(GI_neg, by=c('word'= 'DictionaryGI$negative')) %>%
group_by(gutenberg_id, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
## # A tibble: 4 × 5
## gutenberg_id chapter negativewords words ratio
## <int> <int> <int> <int> <dbl>
## 1 74 24 28 411 0.0681
## 2 76 13 108 2056 0.0525
## 3 86 30 224 3605 0.0621
## 4 1837 28 87 1257 0.0692
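The two tables above show only the single most negative chapter per book, so a direct comparison is awkward. A sketch (not in the original) of an overall negative-word rate per book under each lexicon makes the "similar to bing" point more directly.
# overall negative-word rate per book: bing vs DictionaryGI
book_totals <- tidy_books %>% count(gutenberg_id, name = "words")
tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  count(gutenberg_id, name = "bing_negative") %>%
  left_join(tidy_books %>%
              semi_join(GI_neg, by = c("word" = "DictionaryGI$negative")) %>%
              count(gutenberg_id, name = "gi_negative"),
            by = "gutenberg_id") %>%
  left_join(book_totals, by = "gutenberg_id") %>%
  mutate(bing_rate = bing_negative / words,
         gi_rate = gi_negative / words)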
With R it is relatively easy to perform sentiment analysis that gives you a flavor for different works and authors. You can take a semi-structured corpus and transform and analyse it rapidly.
There are some limitations to this approach: these lexicons were created from modern works, so the results can only be taken so far. I think it makes sense to limit this kind of analysis to modern works, or to compare authors only to the work of their contemporaries.
I felt that each of the lexicons performed equally well on my Mark Twain corpus; no lexicon gave results that were out of line with the others. For the example Austen text, the NRC lexicon gave results that were far more positive than bing or AFINN, so I might not choose NRC for analysis of non-modern texts.
Citation: