Week 10 Assignment - NLP

Sentiment Analysis with Tidy Data

Exercises from tidytextmining.com

The following exercises recreate the steps described in “Sentiment Analysis with Tidy Data” at https://www.tidytextmining.com/sentiment.html.

First, we load and compare three common sentiment analysis lexicons: AFINN, Bing, and NRC.

The AFINN lexicon “assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.”

library(tidytext)   # get_sentiments(), unnest_tokens()
library(tidyverse)  # dplyr, stringr, tidyr, ggplot2

get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
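
As a quick sanity check on the AFINN scale, we can pull out the words with the most extreme scores (an exploratory sketch; the cutoff of ±4 is arbitrary):

# words at the extremes of the AFINN scale (cutoff of 4 chosen arbitrarily)
get_sentiments("afinn") %>%
  filter(abs(value) >= 4) %>%
  arrange(value)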

The Bing lexicon “categorizes words in a binary fashion into positive and negative categories.”

get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows

The NRC lexicon “categorizes words in a binary fashion into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.”

get_sentiments("nrc")
## # A tibble: 13,875 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,865 more rows
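
Because NRC assigns a word to every category it matches, the same word can appear on several rows. A quick exploratory count shows how large each category is:

# number of words assigned to each NRC category
get_sentiments("nrc") %>%
  count(sentiment, sort = TRUE)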

Using the janeaustenr dataset of Jane Austen novels, we’ll tokenize the text into individual words and inner-join them to these sentiment lexicons.

library(janeaustenr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
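
As a quick check on the tokenized result, we can count how many word tokens each novel contributes (a small exploratory sketch):

# number of word tokens per novel after tokenization
tidy_books %>%
  count(book)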

Examining the sentiment scores from a single NRC category, “joy”:

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows

Examining net sentiment scores from Bing over the course of each novel:

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
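
The index = linenumber %/% 80 step uses integer division to bin every 80 lines of text into one section, so each bar in the plot summarizes a chunk of roughly that size. A minimal illustration of the binning:

# integer division groups consecutive line numbers into 80-line sections
tibble(linenumber = c(1, 79, 80, 159, 160)) %>%
  mutate(index = linenumber %/% 80)
# lines 1-79 fall in index 0, lines 80-159 in index 1, line 160 starts index 2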

Comparing net sentiment scores from the three lexicons over the course of the novel “Pride and Prejudice.”

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")

bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

“Both the Bing and NRC lexicons have more negative than positive words, but the ratio of negative to positive words is higher” in Bing than in NRC.

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3318
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
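
To make the ratio claim concrete, we can compute it directly from these counts (a small sketch using pivot_wider):

# negative-to-positive word ratio for each lexicon
bind_rows(
  get_sentiments("bing") %>% count(sentiment) %>% mutate(lexicon = "bing"),
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    count(sentiment) %>%
    mutate(lexicon = "nrc")) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(ratio = negative / positive)

With the counts shown above, that works out to roughly 2.4 negative words per positive word in Bing versus roughly 1.4 in NRC.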

Examining the most common positive and negative words:

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # … with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Adding custom stopwords:

custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")), stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # … with 1,140 more rows
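
As a quick check of the effect, we can remove the custom stop words before counting Bing matches; “miss” should then drop out of the negative list (an exploratory sketch):

# recount Bing sentiment words after removing the custom stop words
tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)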

Creating wordclouds:

library(wordcloud)

tidy_books %>%
  anti_join(custom_stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

Creating comparison wordclouds:

library(reshape2)

tidy_books %>%
  anti_join(custom_stop_words) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

Tokenizing entire sentences with unnest_tokens:

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"

Tokenizing with regex patterns:

austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

Identifying the most negative chapters:

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup() %>%
  arrange(desc(ratio))
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Pride & Prejudice        34           111  2104 0.0528
## 2 Northanger Abbey         21           149  2982 0.0500
## 3 Sense & Sensibility      43           161  3405 0.0473
## 4 Mansfield Park           46           173  3685 0.0469
## 5 Emma                     15           151  3340 0.0452
## 6 Persuasion                4            62  1807 0.0343

Extending the code: Lupin

Now, we extend this code with a sentiment analysis of the novels of Maurice Leblanc and the adventures of his “gentleman burglar” Arsène Lupin. We also add one more NLP library and lexicon to the analysis, syuzhet, which is commonly used to analyze sentiment in humanities texts.
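
Before using it, we can peek at the syuzhet lexicon itself via get_sentiment_dictionary(); like AFINN it is a table of word/value pairs, but with finer-grained decimal scores (a quick exploratory look):

library(syuzhet)
# word/value pairs, similar in shape to AFINN but with decimal scores
head(get_sentiment_dictionary(dictionary = "syuzhet"), 10)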

Load Lupin novels from Project Gutenberg and tidy up

library(gutenbergr)
library(syuzhet)
# load lupin
book_ids_leblanc <- 
  gutenberg_works(author == "Leblanc, Maurice") %>%
  filter(language=='en') %>%
  slice(1:3) # adjust number of books
book_ids_leblanc
## # A tibble: 3 × 8
##   gutenberg_id title   author  gutenberg_autho… language gutenberg_books… rights
##          <int> <chr>   <chr>              <int> <chr>    <chr>            <chr> 
## 1         1563 The Cr… Leblan…             1358 en       Crime Fiction    Publi…
## 2         4014 Arsene… Leblan…             1358 en       Crime Fiction    Publi…
## 3         4017 The Ho… Leblan…             1358 en       Crime Fiction    Publi…
## # … with 1 more variable: has_text <lgl>
lupin_books <- gutenberg_download(book_ids_leblanc) 
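
As an optional variant (not required for the analysis below), gutenberg_download() can carry book metadata along via its meta_fields argument, which would let us filter by title later instead of by numeric gutenberg_id; the object name lupin_books_titled is just an illustrative placeholder:

# optional variant: keep each book's title alongside the downloaded text
lupin_books_titled <- gutenberg_download(book_ids_leblanc, meta_fields = "title")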

lupin_tidybooks <- 
  lupin_books %>%
  group_by(gutenberg_id) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, 
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup()
lupin_tidybooks
## # A tibble: 29,001 × 4
##    gutenberg_id text                                          linenumber chapter
##           <int> <chr>                                              <int>   <int>
##  1         1563 "THE CRYSTAL STOPPER"                                  1       0
##  2         1563 ""                                                     2       0
##  3         1563 "by Maurice LeBlanc"                                   3       0
##  4         1563 ""                                                     4       0
##  5         1563 ""                                                     5       0
##  6         1563 ""                                                     6       0
##  7         1563 ""                                                     7       0
##  8         1563 "CHAPTER I. THE ARRESTS"                               8       1
##  9         1563 ""                                                     9       1
## 10         1563 "The two boats fastened to the little pier t…         10       1
## # … with 28,991 more rows

Tokenize words and get counts

lupin_words <- lupin_tidybooks %>%
  select(text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
lupin_words
## # A tibble: 10,532 × 2
##    word           n
##    <chr>      <int>
##  1 lupin       1217
##  2 duke         751
##  3 guerchard    648
##  4 door         451
##  5 daubrecq     445
##  6 beautrelet   404
##  7 eyes         325
##  8 time         321
##  9 it’s         306
## 10 cried        291
## # … with 10,522 more rows

Join to sentiment analysis lexicons

lupin_words_sa_afinn <-
  lupin_words %>%
  inner_join(get_sentiments('afinn')) %>%
  rename(sa_afin=value)

lupin_words_sa_bing <-
  lupin_words %>%
  inner_join(get_sentiments('bing')) %>%
  rename(sa_bing=sentiment)

lupin_words_sa_nrc <-
  lupin_words %>%
  inner_join(get_sentiments('nrc')) %>%
  rename(sa_nrc=sentiment)

lupin_words_sa_syuz <- 
  lupin_words %>%
  inner_join(get_sentiment_dictionary(dictionary="syuzhet")) %>%
  rename(sa_syuz=value)

lupin_words_sa_afinn
lupin_words_sa_bing
lupin_words_sa_nrc
lupin_words_sa_syuz
## # A tibble: 1,130 × 3
##    word        n sa_afin
##    <chr>   <int>   <dbl>
##  1 cried     291      -2
##  2 matter    100       1
##  3 dear       93       2
##  4 grace      88       1
##  5 leave      80      -1
##  6 doubt      77      -1
##  7 stopped    69      -1
##  8 lost       68      -3
##  9 poor       68      -2
## 10 death      67      -2
## # … with 1,120 more rows
## # A tibble: 1,906 × 3
##    word       n sa_bing 
##    <chr>  <int> <chr>   
##  1 grace     88 positive
##  2 doubt     77 negative
##  3 saint     75 positive
##  4 lost      68 negative
##  5 poor      68 negative
##  6 death     67 negative
##  7 fell      67 negative
##  8 master    67 positive
##  9 hollow    60 negative
## 10 dead      59 negative
## # … with 1,896 more rows
## # A tibble: 5,496 × 3
##    word          n sa_nrc      
##    <chr>     <int> <chr>       
##  1 duke        751 positive    
##  2 time        321 anticipation
##  3 inspector   158 positive    
##  4 letter      157 anticipation
##  5 found       148 joy         
##  6 found       148 positive    
##  7 found       148 trust       
##  8 word        126 positive    
##  9 word        126 trust       
## 10 words       125 anger       
## # … with 5,486 more rows
## # A tibble: 3,174 × 3
##    word          n sa_syuz
##    <chr>     <int>   <dbl>
##  1 cried       291   -1   
##  2 inspector   158    0.25
##  3 found       148    0.6 
##  4 police      117   -0.25
##  5 sir          96    0.25
##  6 dear         93    0.5 
##  7 wait         81   -0.25
##  8 leave        80   -0.25
##  9 doubt        77   -0.75
## 10 saint        75    0.5 
## # … with 3,164 more rows
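
The four joined tables differ noticeably in how many rows survive the join, which gives a rough sense of each lexicon’s coverage of the Lupin vocabulary (exploratory sketch; note that NRC rows are word-category pairs, so a word can appear more than once):

# rows matched by each lexicon (NRC counts word-category pairs)
c(afinn = nrow(lupin_words_sa_afinn),
  bing  = nrow(lupin_words_sa_bing),
  nrc   = nrow(lupin_words_sa_nrc),
  syuz  = nrow(lupin_words_sa_syuz))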

Examining the sentiment scores from a single NRC category, such as “anticipation”:

lupin_nrc_anticipation <- lupin_words_sa_nrc %>% 
  filter(sa_nrc == "anticipation")
lupin_nrc_anticipation
## # A tibble: 376 × 3
##    word       n sa_nrc      
##    <chr>  <int> <chr>       
##  1 time     321 anticipation
##  2 letter   157 anticipation
##  3 wait      81 anticipation
##  4 saint     75 anticipation
##  5 coming    70 anticipation
##  6 child     68 anticipation
##  7 death     67 anticipation
##  8 hope      61 anticipation
##  9 top       57 anticipation
## 10 ready     56 anticipation
## # … with 366 more rows

Switching gears: the syuzhet library has built-in functions for sentence and word parsing, and it includes the AFINN, Bing, and NRC lexicons. We can use these to compare net sentiment scores from all four lexicons at once for the Lupin novel “The Crystal Stopper.”

# get_sentences() and get_sentiment() from the syuzhet library
lupin_crystal <- lupin_books %>% filter(gutenberg_id==1563)

lupin_cs_sentences <- get_sentences(lupin_crystal$text)

lupin_cs_sentences_sa_afinn <- get_sentiment(lupin_cs_sentences, method='afinn')
lupin_cs_sentences_sa_bing <- get_sentiment(lupin_cs_sentences, method='bing')
lupin_cs_sentences_sa_nrc <- get_sentiment(lupin_cs_sentences, method='nrc')
lupin_cs_sentences_sa_syuz <- get_sentiment(lupin_cs_sentences)

Because the four lexicons/methods use different scales, we’ll apply the sign() function to reduce each sentence’s rating to -1, 0, or 1, indicating whether the sentence is negative, neutral, or positive. We’ll also group the sentences into segments of 120 (a number chosen arbitrarily to keep the plot readable).
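
A minimal illustration of both steps:

# sign() collapses scores to -1, 0, or 1; %/% 120 bins sentences into segments
sign(c(-2.5, 0, 0.75))        # -1  0  1
c(1, 119, 120, 240) %/% 120   #  0  0  1  2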

lupin_cs_sentences_sa_all <- 
  data.frame(lupin_cs_sentences,
             sign(lupin_cs_sentences_sa_afinn),
             sign(lupin_cs_sentences_sa_bing),
             sign(lupin_cs_sentences_sa_nrc),
             sign(lupin_cs_sentences_sa_syuz)) %>% 
  mutate(index = row_number()) %>% 
  mutate(segment = index %/% 120)

names(lupin_cs_sentences_sa_all) <- 
  c('sentence','afinn','bing','nrc','syuz','index','segment')
head(lupin_cs_sentences_sa_all, 10)
##               sentence afinn bing nrc syuz index segment
## 1  THE CRYSTAL STOPPER     0    0   1    1     1       0
## 2                          0    0   0    0     2       0
## 3   by Maurice LeBlanc     0    0   0    0     3       0
## 4                          0    0   0    0     4       0
## 5                          0    0   0    0     5       0
## 6                          0    0   0    0     6       0
## 7                          0    0   0    0     7       0
## 8           CHAPTER I.     0    0   0    0     8       0
## 9          THE ARRESTS    -1    0   0   -1     9       0
## 10                         0    0   0    0    10       0

Finally, we’ll sum the sentiment values per segment, reshape with pivot_longer(), and plot the results:

lupin_cs_sentences_sa_sum <- 
  lupin_cs_sentences_sa_all %>%
  select(!sentence) %>%
  group_by(segment) %>%
  summarize(afinn=sum(afinn), bing=sum(bing), nrc=sum(nrc), syuz=sum(syuz)) %>%
  pivot_longer(!segment, names_to='lexicon')
ggplot(lupin_cs_sentences_sa_sum, aes(x=segment,y=value,fill=lexicon)) +
  geom_col(show.legend=FALSE) + 
  facet_wrap(~ lexicon)


References

Silge, Julia, and David Robinson. “Sentiment Analysis with Tidy Data.” Text Mining with R: A Tidy Approach. Accessed December 28, 2021. https://www.tidytextmining.com/sentiment.html