library(tidyverse)
library(dplyr)
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tm)
library(textdata)
library(gutenbergr)
library(stopwords)
library(ggplot2)
library(wordcloud)
library(lexicon)

Introduction into sentiments and text analysis

From Chapter 2 in “Text Mining with R: The Tidy approach”, We learn about sentiment analysis. Each word is ranked on sentiments, which determines if a term has a positive or negative feeling associated.

Tidy text provides three lexicon’s to measure word association assignment: ncr, bing, and AFINN. Below, the chapter provided their examples of the use of all three lexicons on their corpus of choice janeaustenr. These examples are from Chapter two, which can be found below 1.

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Extension of Chapter Two

Corpus with complex emotions

For our extension, We will use a romance book called “The Story of the Lover” by Hutchins Hapgood. Let’s say for the sentiments, we want to view only the terms that surround negative emotions like anger.

Lets use the nrc lexicon and retrieve the most common words from the book which involves anger. From our pull, We see a plot that could be about another partner as terms like jealously, possession and infidelity ranked in the top ten of the list.

story_lover <- gutenberg_download(67706)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
story_lover<-story_lover%>%ungroup() %>%
  unnest_tokens(word, text)

story_lover<-story_lover%>% filter(!(word %in% stopwords(source = "snowball")))

nrc_anger<-get_sentiments("nrc")%>%filter(sentiment=="anger")

story_lover %>%
  inner_join(nrc_anger) %>%
  count(word, sort = TRUE)%>%slice_max(n, n = 10)%>%ggplot(aes(n, word)) +
  geom_col(show.legend = FALSE) +theme_light()
## Joining, by = "word"

Expansions of lexicons

For our new lexicon, I will use the lexicon package to retrieve a new lexicon. For our analysis, Lets see the most common terms associated with sensations. I will use the key regressive imagery table to best reflect the term.

Sensations like sight and touch were most used in the novel compared the other three senses.

imagery<-key_regressive_imagery
imagery<-imagery%>%filter(category=="sensation")
imagery$regex<-str_replace_all(imagery$regex,"[\\\\b]","")
names(imagery)<-c("x","y","z","a","word")

story_lover %>%
  inner_join(imagery) %>%
  count(word, sort = TRUE)%>%slice_max(n, n = 10)%>%with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

Conclusion

For my corpus, the key regressive imagery lexicon was more granular compared to the lexicons provided by tidytext. There was more control on the specific terms/feelings in the listings than bing’s eight emotions. The lexicon can better defined emotions like romance or senses like taste than the lists provided.


  1. Robinson, Julia Silge and David. “2 Sentiment Analysis with Tidy Data: Text Mining with R.” 2 Sentiment Analysis with Tidy Data | Text Mining with R, https://www.tidytextmining.com/sentiment.html. ↩︎

LS0tDQp0aXRsZTogIkhvbWV3b3JrIDEwIg0KYXV0aG9yOiAiVnlhbm5hIEhpbGwiDQpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiDQpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydA0KLS0tDQoNCmBgYHtyIGxvYWQtcGFja2FnZXMsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KbGlicmFyeShqYW5lYXVzdGVucikNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHN0cmluZ3IpDQpsaWJyYXJ5KHRtKQ0KbGlicmFyeSh0ZXh0ZGF0YSkNCmxpYnJhcnkoZ3V0ZW5iZXJncikNCmxpYnJhcnkoc3RvcHdvcmRzKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQpsaWJyYXJ5KGxleGljb24pDQpgYGANCg0KIyMjIEludHJvZHVjdGlvbiBpbnRvIHNlbnRpbWVudHMgYW5kIHRleHQgYW5hbHlzaXMNCg0KRnJvbSBDaGFwdGVyIDIgaW4gIlRleHQgTWluaW5nIHdpdGggUjogVGhlIFRpZHkgYXBwcm9hY2giLCBXZSBsZWFybiBhYm91dCBzZW50aW1lbnQgYW5hbHlzaXMuIEVhY2ggd29yZCBpcyByYW5rZWQgb24gc2VudGltZW50cywgd2hpY2ggZGV0ZXJtaW5lcyBpZiBhIHRlcm0gaGFzIGEgcG9zaXRpdmUgb3IgbmVnYXRpdmUgZmVlbGluZyBhc3NvY2lhdGVkLiANCg0KVGlkeSB0ZXh0IHByb3ZpZGVzIHRocmVlIGxleGljb24ncyB0byBtZWFzdXJlIHdvcmQgYXNzb2NpYXRpb24gYXNzaWdubWVudDogbmNyLCBiaW5nLCBhbmQgQUZJTk4uIEJlbG93LCB0aGUgY2hhcHRlciBwcm92aWRlZCB0aGVpciBleGFtcGxlcyBvZiB0aGUgdXNlIG9mIGFsbCB0aHJlZSBsZXhpY29ucyBvbiB0aGVpciBjb3JwdXMgb2YgY2hvaWNlIGphbmVhdXN0ZW5yLiBUaGVzZSBleGFtcGxlcyBhcmUgZnJvbSBDaGFwdGVyIHR3bywgd2hpY2ggY2FuIGJlIGZvdW5kIGJlbG93IF5bUm9iaW5zb24sIEp1bGlhIFNpbGdlIGFuZCBEYXZpZC4g4oCcMiBTZW50aW1lbnQgQW5hbHlzaXMgd2l0aCBUaWR5IERhdGE6IFRleHQgTWluaW5nIHdpdGggUi7igJ0gMiBTZW50aW1lbnQgQW5hbHlzaXMgd2l0aCBUaWR5IERhdGEgfCBUZXh0IE1pbmluZyB3aXRoIFIsIGh0dHBzOi8vd3d3LnRpZHl0ZXh0bWluaW5nLmNvbS9zZW50aW1lbnQuaHRtbC4gXS4NCg0KYGBge3IgY29kZS1jaHVuay1sYWJlbH0NCnRpZHlfYm9va3MgPC0gYXVzdGVuX2Jvb2tzKCkgJT4lDQogIGdyb3VwX2J5KGJvb2spICU+JQ0KICBtdXRhdGUoDQogICAgbGluZW51bWJlciA9IHJvd19udW1iZXIoKSwNCiAgICBjaGFwdGVyID0gY3Vtc3VtKHN0cl9kZXRlY3QodGV4dCwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHJlZ2V4KCJeY2hhcHRlciBbXFxkaXZ4bGNdIiwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGlnbm9yZV9jYXNlID0gVFJVRSkpKSkgJT4lDQogIHVuZ3JvdXAoKSAlPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQ0KDQpiaW5nX3dvcmRfY291bnRzIDwtIHRpZHlfYm9va3MgJT4lDQogIGlubmVyX2pvaW4oZ2V0X3NlbnRpbWVudHMoImJpbmciKSkgJT4lDQogIGNvdW50KHdvcmQsIHNlbnRpbWVudCwgc29ydCA9IFRSVUUpICU+JQ0KICB1bmdyb3VwKCkNCg0KYmluZ193b3JkX2NvdW50cyAlPiUNCiAgZ3JvdXBfYnkoc2VudGltZW50KSAlPiUNCiAgc2xpY2VfbWF4KG4sIG4gPSAxMCkgJT4lIA0KICB1bmdyb3VwKCkgJT4lDQogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lDQogIGdncGxvdChhZXMobiwgd29yZCwgZmlsbCA9IHNlbnRpbWVudCkpICsNCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKw0KICBmYWNldF93cmFwKH5zZW50aW1lbnQsIHNjYWxlcyA9ICJmcmVlX3kiKSArDQogIGxhYnMoeCA9ICJDb250cmlidXRpb24gdG8gc2VudGltZW50IiwNCiAgICAgICB5ID0gTlVMTCkNCmBgYA0KDQojIyMgRXh0ZW5zaW9uIG9mIENoYXB0ZXIgVHdvDQoNCiMjIyMgQ29ycHVzIHdpdGggY29tcGxleCBlbW90aW9ucw0KDQpGb3Igb3VyIGV4dGVuc2lvbiwgV2Ugd2lsbCB1c2UgYSByb21hbmNlIGJvb2sgY2FsbGVkICJUaGUgU3Rvcnkgb2YgdGhlIExvdmVyIiBieSBIdXRjaGlucyBIYXBnb29kLiBMZXQncyBzYXkgZm9yIHRoZSBzZW50aW1lbnRzLCB3ZSB3YW50IHRvIHZpZXcgb25seSB0aGUgdGVybXMgdGhhdCBzdXJyb3VuZCBuZWdhdGl2ZSBlbW90aW9ucyBsaWtlIGFuZ2VyLg0KDQpMZXRzIHVzZSB0aGUgbnJjIGxleGljb24gYW5kIHJldHJpZXZlIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyBmcm9tIHRoZSBib29rIHdoaWNoIGludm9sdmVzIGFuZ2VyLiBGcm9tIG91ciBwdWxsLCBXZSBzZWUgYSBwbG90IHRoYXQgY291bGQgYmUgYWJvdXQgYW5vdGhlciBwYXJ0bmVyIGFzIHRlcm1zIGxpa2UgamVhbG91c2x5LCBwb3NzZXNzaW9uIGFuZCBpbmZpZGVsaXR5IHJhbmtlZCBpbiB0aGUgdG9wIHRlbiBvZiB0aGUgbGlzdC4NCg0KYGBge3J9DQpzdG9yeV9sb3ZlciA8LSBndXRlbmJlcmdfZG93bmxvYWQoNjc3MDYpDQoNCnN0b3J5X2xvdmVyPC1zdG9yeV9sb3ZlciU+JXVuZ3JvdXAoKSAlPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQ0KDQpzdG9yeV9sb3Zlcjwtc3RvcnlfbG92ZXIlPiUgZmlsdGVyKCEod29yZCAlaW4lIHN0b3B3b3Jkcyhzb3VyY2UgPSAic25vd2JhbGwiKSkpDQoNCm5yY19hbmdlcjwtZ2V0X3NlbnRpbWVudHMoIm5yYyIpJT4lZmlsdGVyKHNlbnRpbWVudD09ImFuZ2VyIikNCg0Kc3RvcnlfbG92ZXIgJT4lDQogIGlubmVyX2pvaW4obnJjX2FuZ2VyKSAlPiUNCiAgY291bnQod29yZCwgc29ydCA9IFRSVUUpJT4lc2xpY2VfbWF4KG4sIG4gPSAxMCklPiVnZ3Bsb3QoYWVzKG4sIHdvcmQpKSArDQogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICt0aGVtZV9saWdodCgpDQogIA0KYGBgYA0KDQojIyMjIEV4cGFuc2lvbnMgb2YgbGV4aWNvbnMNCg0KRm9yIG91ciBuZXcgbGV4aWNvbiwgSSB3aWxsIHVzZSB0aGUgbGV4aWNvbiBwYWNrYWdlIHRvIHJldHJpZXZlIGEgbmV3IGxleGljb24uIEZvciBvdXIgYW5hbHlzaXMsIExldHMgc2VlIHRoZSBtb3N0IGNvbW1vbiB0ZXJtcyBhc3NvY2lhdGVkIHdpdGggc2Vuc2F0aW9ucy4gSSB3aWxsIHVzZSB0aGUga2V5IHJlZ3Jlc3NpdmUgaW1hZ2VyeSB0YWJsZSB0byBiZXN0IHJlZmxlY3QgdGhlIHRlcm0uDQoNClNlbnNhdGlvbnMgbGlrZSBzaWdodCBhbmQgdG91Y2ggd2VyZSBtb3N0IHVzZWQgaW4gdGhlIG5vdmVsIGNvbXBhcmVkIHRoZSBvdGhlciB0aHJlZSBzZW5zZXMuDQoNCmBgYHtyfQ0KaW1hZ2VyeTwta2V5X3JlZ3Jlc3NpdmVfaW1hZ2VyeQ0KaW1hZ2VyeTwtaW1hZ2VyeSU+JWZpbHRlcihjYXRlZ29yeT09InNlbnNhdGlvbiIpDQppbWFnZXJ5JHJlZ2V4PC1zdHJfcmVwbGFjZV9hbGwoaW1hZ2VyeSRyZWdleCwiW1xcXFxiXSIsIiIpDQpuYW1lcyhpbWFnZXJ5KTwtYygieCIsInkiLCJ6IiwiYSIsIndvcmQiKQ0KDQpzdG9yeV9sb3ZlciAlPiUNCiAgaW5uZXJfam9pbihpbWFnZXJ5KSAlPiUNCiAgY291bnQod29yZCwgc29ydCA9IFRSVUUpJT4lc2xpY2VfbWF4KG4sIG4gPSAxMCklPiV3aXRoKHdvcmRjbG91ZCh3b3JkLCBuLCBtYXgud29yZHMgPSAxMDApKQ0KYGBgYA0KDQojIyMgQ29uY2x1c2lvbg0KDQpGb3IgbXkgY29ycHVzLCB0aGUga2V5IHJlZ3Jlc3NpdmUgaW1hZ2VyeSBsZXhpY29uIHdhcyBtb3JlIGdyYW51bGFyIGNvbXBhcmVkIHRvIHRoZSBsZXhpY29ucyBwcm92aWRlZCBieSB0aWR5dGV4dC4gVGhlcmUgd2FzIG1vcmUgY29udHJvbCBvbiB0aGUgc3BlY2lmaWMgdGVybXMvZmVlbGluZ3MgaW4gdGhlIGxpc3RpbmdzIHRoYW4gYmluZydzIGVpZ2h0IGVtb3Rpb25zLiBUaGUgbGV4aWNvbiBjYW4gYmV0dGVyIGRlZmluZWQgZW1vdGlvbnMgbGlrZSByb21hbmNlIG9yIHNlbnNlcyBsaWtlIHRhc3RlIHRoYW4gdGhlIGxpc3RzIHByb3ZpZGVkLg==