Introduction

This is an exploration of sentiment analysis using R packages. The code is taken from the book Text Mining with R: A Tidy Approach [1].

Method

2.1 The sentiments datasets

library(tidytext)

get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,875 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,865 more rows
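
Note that, in recent versions of tidytext, the AFINN and NRC lexicons are distributed through the textdata package and prompt for a one-time download on first use, while the bing lexicon ships with tidytext itself. A minimal setup sketch, assuming the packages are not yet installed:

# One-time setup (sketch): tidytext plus textdata for the AFINN and NRC lexicons
install.packages(c("tidytext", "textdata"))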

2.2 Sentiment analysis with inner join

library(janeaustenr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

2.3 Comparing the three sentiment dictionaries

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

  • We can get a count of the positive and negative words in the different lexicons. Even though NRC gives an overall very positive impression of the work, the lexicon itself contains more negative terms: roughly 3 negative words for every 2 positive words.

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3318
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
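
To make the 3-to-2 claim above concrete, here is a quick sketch that divides the two NRC counts printed above:

# Ratio of negative to positive entries in the NRC lexicon (sketch)
get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment) %>% 
  summarise(neg_to_pos = n[sentiment == "negative"] / n[sentiment == "positive"])
# 3318 / 2308 is about 1.44, i.e. roughly 3 negative entries for every 2 positive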

2.4 Most common positive and negative words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # … with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # … with 1,140 more rows

2.5 Wordclouds

The most common word, after removing stop words, is ‘time’.

library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

2.6 Looking at units beyond just words

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Now with a new text…

Here we load the text that we will use in the second half of this data exploration. We use the ‘gutenbergr’ package to load some books by Mark Twain.

  • We look to see what works are available by Mark Twain (a short sketch of inspecting the results follows the download output below).
  • Then we load the four works of interest.
  • The books require some filtering; we don’t want any tables of contents or prefaces.
library(gutenbergr)
library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
## 
##     write
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
temp <- gutenberg_works(author == "Twain, Mark")
twain <- gutenberg_download(c("74", "76", "86", "1837"))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#split the books up to remove the index and preface
t1 <- twain %>% filter(gutenberg_id == 74)
t1 <- t1[459:nrow(t1),]

t2 <- twain %>% filter(gutenberg_id == 76)
t2 <- t2[514:nrow(t2),]

t3 <- twain %>% filter(gutenberg_id == 86)
t3 <- t3[332:nrow(t3),]

t4 <- twain %>% filter(gutenberg_id == 1837)
t4 <- t4[444:nrow(t4),]

twain_books <- bind_rows(t1, t2, t3, t4)

twain_books %>% group_by(gutenberg_id) %>% summarise(n())
## # A tibble: 4 × 2
##   gutenberg_id `n()`
##          <int> <int>
## 1           74  8413
## 2           76 11495
## 3           86 12580
## 4         1837  7706
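
Although temp is never printed above, it can be used to confirm which titles sit behind the Gutenberg IDs passed to gutenberg_download(). A minimal sketch (the column selection is only illustrative; gutenberg_works() returns many more fields):

temp %>% 
  filter(gutenberg_id %in% c(74, 76, 86, 1837)) %>% 
  select(gutenberg_id, title)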

2.2 Sentiment analysis with inner join

Before we can do sentiment analysis we need to further organize the data. The Twain data is not as clean as the Austen data: there are a number of rows we don’t want included (preface, table of contents, table of illustrations).

  • We group by book and generate the line number from the row number
  • We use a clever trick to calculate the chapter: str_detect() searches for chapter headings and we take the cumulative sum of the matches (a toy illustration follows the code below)
  • We use the unnest_tokens() function, which results in a tidy data frame of one word per row
  • We also add in the book titles based on their ‘gutenberg_id’
tidy_books <- twain_books %>%
  group_by(gutenberg_id) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(
           text,
           regex("^chapter [\\divxlc]",
                 ignore_case = TRUE)
         ))) %>%
  ungroup() %>%
  unnest_tokens(word, text) %>% mutate(
    title = case_when(
      gutenberg_id == 74 ~ "The Adventures of Tom Sawyer",
      gutenberg_id == 76 ~ "Adventures of Huckleberry Finn",
      gutenberg_id == 86 ~ "A Connecticut Yankee in King Arthur's Court",
      gutenberg_id == 1837 ~ "The Prince and the Pauper"
    )
  )
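
A toy illustration of the chapter trick described above, on a made-up vector of lines (demo_lines is hypothetical, purely for demonstration):

demo_lines <- c("CHAPTER I", "some text", "more text", "Chapter 2", "closing text")
cumsum(str_detect(demo_lines, regex("^chapter [\\divxlc]", ignore_case = TRUE)))
## [1] 1 1 1 2 2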

We use the ‘nrc’ lexicon to analyze the ‘joy’ in the book The Adventures of Tom Sawyer.

  • We use the get_sentiments() function to get the ‘nrc’ lexicon
  • We filter down to just the words that imply or relate to ‘joy’
  • We inner join on the words of the book and the lexicon words related to ‘joy’
  • Last we do a count on those words

We find that the most common words associated with ‘joy’ are ‘good’ and ‘found’.

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(gutenberg_id == 74) %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 280 × 2
##    word         n
##    <chr>    <int>
##  1 good       105
##  2 found       83
##  3 hope        30
##  4 money       30
##  5 mighty      26
##  6 church      24
##  7 treasure    24
##  8 kind        23
##  9 glad        21
## 10 finally     19
## # … with 270 more rows

We use the bing lexicon to calculate the positive and negative sentiment for four works by Mark Twain.

  • We join in the Bing lexicon
  • We chunk up our text into 80 line segments
  • We use the ‘pivot_wider’ function to split the sentiment into two separate columns
  • We calculate the sentiment by subtracting the negative from the positive
library(tidyr)

twain_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(gutenberg_id, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative,
         title = case_when(
      gutenberg_id == 74 ~ "The Adventures of Tom Sawyer",
      gutenberg_id == 76 ~ "Adventures of Huckleberry Finn",
      gutenberg_id == 86 ~ "A Connecticut Yankee in King Arthur's Court",
      gutenberg_id == 1837 ~ "The Prince and the Pauper"
    )
  )
## Joining, by = "word"
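
The index = linenumber %/% 80 step is plain integer division: consecutive line numbers share the same index until a new 80-line block starts. A tiny illustration:

c(0, 1, 79, 80, 81, 160) %/% 80
## [1] 0 0 0 1 1 2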

Here we plot the sentiment across 80-line segments. We find the most positive of the four chosen works to be Adventures of Huckleberry Finn and the most negative to be The Adventures of Tom Sawyer.

library(ggplot2)

ggplot(twain_sentiment, aes(index, sentiment, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, ncol = 2, scales = "free_x")
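
As a rough numeric check of that reading (a sketch, not part of the original analysis), we can compare the mean chunk sentiment per title:

twain_sentiment %>%
  group_by(title) %>%
  summarise(mean_sentiment = mean(sentiment)) %>%
  arrange(desc(mean_sentiment))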

2.3 Comparing the three sentiment dictionaries

  • We filter down to just one title, The Adventures of Tom Sawyer
tom_sawyer <- tidy_books %>% 
  filter(title == "The Adventures of Tom Sawyer")

tom_sawyer
## # A tibble: 71,125 × 5
##    gutenberg_id linenumber chapter word    title                       
##           <int>      <int>   <int> <chr>   <chr>                       
##  1           74          2       1 chapter The Adventures of Tom Sawyer
##  2           74          2       1 i       The Adventures of Tom Sawyer
##  3           74          5       1 tom     The Adventures of Tom Sawyer
##  4           74          7       1 no      The Adventures of Tom Sawyer
##  5           74          7       1 answer  The Adventures of Tom Sawyer
##  6           74          9       1 tom     The Adventures of Tom Sawyer
##  7           74         11       1 no      The Adventures of Tom Sawyer
##  8           74         11       1 answer  The Adventures of Tom Sawyer
##  9           74         13       1 what’s  The Adventures of Tom Sawyer
## 10           74         13       1 gone    The Adventures of Tom Sawyer
## # … with 71,115 more rows

We take three separate lexicons and calculate the sentiment across 80-line segments of The Adventures of Tom Sawyer.

  • We inner join the lexicons to our tidy corpus
  • We group by the index (80 line segments)
  • We calculate sentiment for each index (varies by lexicon)
afinn <- tom_sawyer %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  tom_sawyer %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tom_sawyer %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"

We bind each lexicon’s sentiment and plot them side-by-side. The three approaches give surprisingly similar plots.

bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
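
To put a number on ‘surprisingly similar’, one option (a sketch, not part of the original analysis) is to correlate the per-index sentiment scores of the three methods:

bind_rows(afinn, 
          bing_and_nrc) %>%
  select(method, index, sentiment) %>%
  pivot_wider(names_from = method, values_from = sentiment) %>%
  select(-index) %>%
  cor(use = "pairwise.complete.obs")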

  • Like we did with the Austen works, we can get counts of the positive and negative words for each lexicon.
get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3318
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

2.4 Most common positive and negative words

When we count the positive and negative words, ‘well’ and ‘good’ are again the most common. I wonder if that is because common idioms contain those words? The most common negative word is ‘poor’. Mark Twain struggled with money throughout his life, and I wonder if his own financial insecurity crept into his work.

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,616 × 3
##    word   sentiment     n
##    <chr>  <chr>     <int>
##  1 well   positive    924
##  2 good   positive    762
##  3 like   positive    649
##  4 right  positive    600
##  5 great  positive    315
##  6 enough positive    306
##  7 poor   negative    295
##  8 pretty positive    236
##  9 work   positive    233
## 10 dead   negative    229
## # … with 2,606 more rows
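
One way to probe the idiom hunch above is to look at two-word phrases beginning with ‘well’ or ‘good’. The bigram pass below is a sketch, not part of the original analysis, and twain_bigrams is a hypothetical name:

twain_bigrams <- twain_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(str_detect(bigram, "^(well|good) ")) %>%
  count(bigram, sort = TRUE)

head(twain_bigrams)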
  • When we plot the negative and positive word frequencies we see that ‘well’ and ‘good’ are very close in frequency, and ‘poor’ is not an outlier among the negative words: its frequency is very close to the other top-10 negative words.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

  • I’m adding a word to my stop word list that I do not want to appear in my word cloud. It’s worth noting that this word appears in the top 100 most frequently used non-stop words.
custom_stop_words <- bind_rows(tibble(word = c("nigger"),  
                                      lexicon = c("custom")), 
                               stop_words)

2.5 Wordclouds

  • We can create a word cloud, anti-joining our custom stop word list so that those words are not included. ‘Time’ and ‘Tom’ are our most common words.
library(wordcloud)

tidy_books %>%
  anti_join(custom_stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

  • We can sort our word cloud by sentiment and we get results consistent with our frequency bar plots.
library(reshape2)

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

2.6 Looking at units beyond just words

  • We have the ability to choose a token other than ‘word’
tom <- as.list(t1)

tom_sentences <- tibble(text = tom$text) %>% 
  unnest_tokens(sentence, text, token = "sentences")
tom_sentences$sentence[2]
## [1] "“tom!”"
twain_chapters <- twain_books %>%
  group_by(gutenberg_id) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

twain_chapters %>% 
  group_by(gutenberg_id) %>% 
  summarise(chapters = n())
## # A tibble: 4 × 2
##   gutenberg_id chapters
##          <int>    <int>
## 1           74       36
## 2           76       43
## 3           86       45
## 4         1837       47
  • We can use the bing lexicon to find, for each work, the chapter with the highest proportion of negative words. Interestingly, the most negative chapters of the four chosen Twain works all fall in a similar range, roughly 4-6% negative words (a sketch of the overall per-work proportions follows the output below).
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(gutenberg_id, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(gutenberg_id, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
## # A tibble: 4 × 5
##   gutenberg_id chapter negativewords words  ratio
##          <int>   <int>         <int> <int>  <dbl>
## 1           74      24            25   411 0.0608
## 2           76      13            76  2056 0.0370
## 3           86      17           170  3167 0.0537
## 4         1837      26            81  1393 0.0581
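
The table above reports only the single most negative chapter of each work. As a sketch (not part of the original analysis), the overall per-work proportion of negative words can be computed the same way without grouping by chapter:

tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  count(title, name = "negativewords") %>%
  left_join(count(tidy_books, title, name = "words"), by = "title") %>%
  mutate(ratio = negativewords / words)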

Now let’s add an additional lexicon: ‘DictionaryGI’, a dictionary of opinionated words from the Harvard-IV dictionary as used in the General Inquirer software, a program developed for textual content analysis.

We can use this new lexicon to calculate the proportion of negative words, just like we did with bing. The DictionaryGI lexicon gives us rates similar to bing.
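
Before building the word lookup tables below, it helps to peek at the lexicon’s structure. A minimal sketch: DictionaryGI is a list of character vectors, including $negative and $positive.

# Inspect the DictionaryGI object shipped with SentimentAnalysis (sketch)
str(DictionaryGI, list.len = 2)
head(DictionaryGI$negative)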

GI_neg <- tibble(word = DictionaryGI$negative)
GI_pos <- tibble(word = DictionaryGI$positive)


wordcounts <- tidy_books %>%
  group_by(gutenberg_id, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
tidy_books %>%
  semi_join(GI_neg, by = "word") %>%
  group_by(gutenberg_id, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
## # A tibble: 4 × 5
##   gutenberg_id chapter negativewords words  ratio
##          <int>   <int>         <int> <int>  <dbl>
## 1           74      24            28   411 0.0681
## 2           76      13           108  2056 0.0525
## 3           86      30           224  3605 0.0621
## 4         1837      28            87  1257 0.0692

Conclusion

With R it is relatively easy to perform sentiment analysis that gives you a flavor for different works and authors. You can take a semi-structured corpus and transform and analyse it rapidly.

There are some limitations to this approach: these lexicons were built from modern texts, so the results can only be taken so far. I think it makes sense to limit this kind of analysis to modern works, or to compare authors only against the work of their contemporaries.

I felt that each of the lexicons performed equally well for my Mark Twain corpus; no lexicon gave results that were out of line with the others. For the example Austen text, the NRC lexicon gave results that were far more positive than bing or AFINN, so I might not choose NRC for analysis of non-modern texts.

Citation:

  1. Silge, Julia, and David Robinson. “2 Sentiment Analysis with Tidy Data.” Text Mining with R: A Tidy Approach, O’Reilly Media, Sebastopol, CA, 2017.