Week 10 Assignment - NLP
Sentiment Analysis with Tidy Data
Exercises from tidytextmining.com
The following exercises recreate the steps described in “Sentiment Analysis with Tidy Data” at https://www.tidytextmining.com/sentiment.html.
First, we load and compare three common sentiment analysis lexicons (AFINN, Bing, and NRC).
# packages used throughout: tidytext supplies get_sentiments() and unnest_tokens();
# dplyr, tidyr, stringr, and ggplot2 handle the wrangling and plotting
library(tidytext)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
The AFINN lexicon “assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.”
get_sentiments("afinn")## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
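As a quick illustrative aside (not part of the original walkthrough), counting words at each score shows the shape of that -5 to 5 scale:
# how many AFINN words carry each integer score
get_sentiments("afinn") %>%
  count(value)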
The Bing lexicon “categorizes words in a binary fashion into positive and negative categories.”
get_sentiments("bing")## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
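One illustrative lookup that becomes relevant later: Bing codes the word “miss” as negative, which skews results for Austen novels.
# check how Bing classifies a single word
get_sentiments("bing") %>%
  filter(word == "miss")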
The NRC lexicon “categorizes words in a binary fashion into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.”
get_sentiments("nrc")## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
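Because NRC assigns one row per word-category pair, a single word can appear under several sentiments at once. A quick illustrative count shows how the 13,875 pairs distribute across the ten categories:
# rows per NRC category (a word may contribute to several)
get_sentiments("nrc") %>%
  count(sentiment, sort = TRUE)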
Using the janeaustenr dataset of Jane Austen novels, we’ll tokenize individual words and inner-join them to these sentiment lexicons.
library(janeaustenr)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # count headings like "Chapter 1" or "CHAPTER IV" as a running chapter number
    chapter = cumsum(str_detect(text,
                                regex("^chapter [\\divxlc]",
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
Examining the sentiment scores from a single NRC category, “joy”:
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
Examining net sentiment scores from Bing over the course of each novel; linenumber %/% 80 uses integer division to bin the text into 80-line sections:
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
Comparing net sentiment scores from the three lexicons over the course of the novel “Pride and Prejudice.”
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
“Both the Bing and NRC lexicons have more negative than positive words, but the ratio of negative to positive words is higher” in Bing than in NRC.
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
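As a quick check on that claim (an illustrative aside), the negative-to-positive ratios from the counts above work out to roughly 1.4 for NRC versus 2.4 for Bing:
# negative-to-positive word ratio per lexicon
3318 / 2308 # NRC: ~1.44
4781 / 2005 # Bing: ~2.38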
Examine the most common positive and negative words:
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
Adding custom stop words: “miss” tops the negative list above, but in Austen it is mostly a title for young, unmarried women rather than a genuinely negative term, so we add it to the stop-word list.
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")), stop_words)## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
Creating wordclouds:
library(wordcloud)
tidy_books %>%
anti_join(custom_stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
Creating comparison wordclouds:
library(reshape2)
tidy_books %>%
anti_join(custom_stop_words) %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
Tokenizing entire sentences with unnest_tokens:
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
Tokenizing with regex patterns:
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
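As an illustrative cross-check, the running chapter counter built earlier with cumsum() and str_detect() should yield comparable counts:
# compare against the chapter column in tidy_books
tidy_books %>%
  group_by(book) %>%
  summarise(chapters = max(chapter))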
Identifying the most negative chapters:
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup() %>%
arrange(desc(ratio))
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Pride & Prejudice 34 111 2104 0.0528
## 2 Northanger Abbey 21 149 2982 0.0500
## 3 Sense & Sensibility 43 161 3405 0.0473
## 4 Mansfield Park 46 173 3685 0.0469
## 5 Emma 15 151 3340 0.0452
## 6 Persuasion 4 62 1807 0.0343
Extending the code: Lupin
Now, we extend this code with a sentiment analysis of the novels of Maurice Leblanc and the adventures of his “gentleman burglar” Arsène Lupin. We also add one more NLP library and lexicon to the analysis, syuzhet, which is commonly used to analyze sentiment in humanities texts.
Load Lupin novels from Project Gutenberg and tidy up
library(gutenbergr)
library(syuzhet)
# load lupin
book_ids_leblanc <-
gutenberg_works(author == "Leblanc, Maurice") %>%
filter(language=='en') %>%
slice(1:3) # adjust number of books
## # A tibble: 3 × 8
## gutenberg_id title author gutenberg_autho… language gutenberg_books… rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 1563 The Cr… Leblan… 1358 en Crime Fiction Publi…
## 2 4014 Arsene… Leblan… 1358 en Crime Fiction Publi…
## 3 4017 The Ho… Leblan… 1358 en Crime Fiction Publi…
## # … with 1 more variable: has_text <lgl>
lupin_books <- gutenberg_download(book_ids_leblanc)
lupin_tidybooks <-
lupin_books %>%
group_by(gutenberg_id) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup()
## # A tibble: 29,001 × 4
## gutenberg_id text linenumber chapter
## <int> <chr> <int> <int>
## 1 1563 "THE CRYSTAL STOPPER" 1 0
## 2 1563 "" 2 0
## 3 1563 "by Maurice LeBlanc" 3 0
## 4 1563 "" 4 0
## 5 1563 "" 5 0
## 6 1563 "" 6 0
## 7 1563 "" 7 0
## 8 1563 "CHAPTER I. THE ARRESTS" 8 1
## 9 1563 "" 9 1
## 10 1563 "The two boats fastened to the little pier t… 10 1
## # … with 28,991 more rows
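Note that gutenberg_download() returns only the gutenberg_id and text columns. If human-readable labels were wanted later, a sketch like the following (not used below) would attach titles from the metadata tibble:
# illustrative: join book titles onto the tidied text
lupin_tidybooks %>%
  left_join(book_ids_leblanc %>% select(gutenberg_id, title),
            by = "gutenberg_id")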
Tokenize words and get counts
lupin_words <- lupin_tidybooks %>%
select(text) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE)
## # A tibble: 10,532 × 2
## word n
## <chr> <int>
## 1 lupin 1217
## 2 duke 751
## 3 guerchard 648
## 4 door 451
## 5 daubrecq 445
## 6 beautrelet 404
## 7 eyes 325
## 8 time 321
## 9 it’s 306
## 10 cried 291
## # … with 10,522 more rows
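One artifact in the counts above: “it’s” survives the anti_join because Project Gutenberg texts use a curly apostrophe, which doesn’t match the straight quote in stop_words. A small normalization step (an illustrative fix, not applied below) would catch it:
# replace curly apostrophes before tokenizing so contractions match stop_words
lupin_tidybooks %>%
  select(text) %>%
  mutate(text = str_replace_all(text, "’", "'")) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)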
Join to sentiment analysis lexicons
lupin_words_sa_afinn <-
lupin_words %>%
inner_join(get_sentiments('afinn')) %>%
rename(sa_afin=value)
lupin_words_sa_bing <-
lupin_words %>%
inner_join(get_sentiments('bing')) %>%
rename(sa_bing=sentiment)
lupin_words_sa_nrc <-
lupin_words %>%
inner_join(get_sentiments('nrc')) %>%
rename(sa_nrc=sentiment)
lupin_words_sa_syuz <-
lupin_words %>%
inner_join(get_sentiment_dictionary(dictionary="syuzhet")) %>%
rename(sa_syuz=value)
## # A tibble: 1,130 × 3
## word n sa_afin
## <chr> <int> <dbl>
## 1 cried 291 -2
## 2 matter 100 1
## 3 dear 93 2
## 4 grace 88 1
## 5 leave 80 -1
## 6 doubt 77 -1
## 7 stopped 69 -1
## 8 lost 68 -3
## 9 poor 68 -2
## 10 death 67 -2
## # … with 1,120 more rows
## # A tibble: 1,906 × 3
## word n sa_bing
## <chr> <int> <chr>
## 1 grace 88 positive
## 2 doubt 77 negative
## 3 saint 75 positive
## 4 lost 68 negative
## 5 poor 68 negative
## 6 death 67 negative
## 7 fell 67 negative
## 8 master 67 positive
## 9 hollow 60 negative
## 10 dead 59 negative
## # … with 1,896 more rows
## # A tibble: 5,496 × 3
## word n sa_nrc
## <chr> <int> <chr>
## 1 duke 751 positive
## 2 time 321 anticipation
## 3 inspector 158 positive
## 4 letter 157 anticipation
## 5 found 148 joy
## 6 found 148 positive
## 7 found 148 trust
## 8 word 126 positive
## 9 word 126 trust
## 10 words 125 anger
## # … with 5,486 more rows
## # A tibble: 3,174 × 3
## word n sa_syuz
## <chr> <int> <dbl>
## 1 cried 291 -1
## 2 inspector 158 0.25
## 3 found 148 0.6
## 4 police 117 -0.25
## 5 sir 96 0.25
## 6 dear 93 0.5
## 7 wait 81 -0.25
## 8 leave 80 -0.25
## 9 doubt 77 -0.75
## 10 saint 75 0.5
## # … with 3,164 more rows
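With the numeric lexicons joined, a rough corpus-level score can be computed by weighting each word’s value by its frequency (an illustrative aside using the columns renamed above):
# frequency-weighted net sentiment across all three Lupin novels
lupin_words_sa_afinn %>%
  summarize(net_afinn = sum(n * sa_afin))
lupin_words_sa_syuz %>%
  summarize(net_syuzhet = sum(n * sa_syuz))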
Examining the sentiment scores from a single NRC category, such as “anticipation”:
lupin_nrc_anticipation <- lupin_words_sa_nrc %>%
filter(sa_nrc == "anticipation")## # A tibble: 376 × 3
## word n sa_nrc
## <chr> <int> <chr>
## 1 time 321 anticipation
## 2 letter 157 anticipation
## 3 wait 81 anticipation
## 4 saint 75 anticipation
## 5 coming 70 anticipation
## 6 child 68 anticipation
## 7 death 67 anticipation
## 8 hope 61 anticipation
## 9 top 57 anticipation
## 10 ready 56 anticipation
## # … with 366 more rows
Switching gears: the syuzhet library has built-in functions for sentence and word parsing, and it bundles the afinn, bing, and nrc lexicons alongside its own. We can use these to compare net sentiment scores from all four lexicons at once for the Lupin novel “The Crystal Stopper.”
# get_sentences() and get_sentiment() from the syuzhet library
lupin_crystal <- lupin_books %>% filter(gutenberg_id==1563)
lupin_cs_sentences <- get_sentences(lupin_crystal$text)
lupin_cs_sentences_sa_afinn <- get_sentiment(lupin_cs_sentences, method='afinn')
lupin_cs_sentences_sa_bing <- get_sentiment(lupin_cs_sentences, method='bing')
lupin_cs_sentences_sa_nrc <- get_sentiment(lupin_cs_sentences, method='nrc')
lupin_cs_sentences_sa_syuz <- get_sentiment(lupin_cs_sentences) # method defaults to "syuzhet"
Since the four lexicons/methods use different scales, we’ll use the sign() function to record whether a given sentiment rating is negative, zero, or positive. We’ll also group the sentences into segments of 120 (a size chosen arbitrarily for a readable plot).
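As a quick base-R illustration of the mapping:
# sign() collapses any numeric score to -1, 0, or 1
sign(c(-2.5, 0, 1.2))
## [1] -1  0  1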
lupin_cs_sentences_sa_all <-
data.frame(lupin_cs_sentences,
sign(lupin_cs_sentences_sa_afinn),
sign(lupin_cs_sentences_sa_bing),
sign(lupin_cs_sentences_sa_nrc),
sign(lupin_cs_sentences_sa_syuz)) %>%
mutate(index = row_number()) %>%
mutate(segment = index %/% 120)
names(lupin_cs_sentences_sa_all) <-
c('sentence','afinn','bing','nrc','syuz','index','segment')
## sentence afinn bing nrc syuz index segment
## 1 THE CRYSTAL STOPPER 0 0 1 1 1 0
## 2 0 0 0 0 2 0
## 3 by Maurice LeBlanc 0 0 0 0 3 0
## 4 0 0 0 0 4 0
## 5 0 0 0 0 5 0
## 6 0 0 0 0 6 0
## 7 0 0 0 0 7 0
## 8 CHAPTER I. 0 0 0 0 8 0
## 9 THE ARRESTS -1 0 0 -1 9 0
## 10 0 0 0 0 10 0
Finally, we’ll sum the sentiment values per segment, pivot to long format with pivot_longer(), and plot the results:
lupin_cs_sentences_sa_sum <-
lupin_cs_sentences_sa_all %>%
select(!sentence) %>%
group_by(segment) %>%
summarize(afinn=sum(afinn), bing=sum(bing), nrc=sum(nrc), syuz=sum(syuz)) %>%
pivot_longer(!segment, names_to='lexicon')
ggplot(lupin_cs_sentences_sa_sum, aes(x=segment,y=value,fill=lexicon)) +
geom_col(show.legend=FALSE) +
facet_wrap(~ lexicon)
References
Silge, Julia, and David Robinson. “Sentiment Analysis with Tidy Data.” Text Mining with R: A Tidy Approach. Accessed December 28, 2021. https://www.tidytextmining.com/sentiment.html