Introduction

The goal of this document is to take the existing sentiment analysis code from “Text Mining with R” and apply it to a new corpus and a new lexicon.

Bring in packages.

library(tidytext)
library(textdata)
library(janeaustenr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(gutenbergr)

Book Example

Word counts.

Tidy the selected corpus into a data frame. A random sample of lines is taken to reduce processing time; note that sampling drops lines, so chapter counts are approximate.

tidy_books <- austen_books() %>%
  mutate(row = row_number()) %>% # remember the original line order
  sample_n(10000) %>%            # sample lines to reduce processing time
  arrange(row) %>%               # sampling shuffles rows; restore order so line
                                 # numbers and chapter detection stay meaningful
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
      ignore_case = TRUE
    )))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
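
The chapter pattern matches lines that begin with “chapter” followed by an Arabic digit or a Roman-numeral character, case-insensitively. A quick illustration:

# The pattern catches both numbering styles, case-insensitively
str_detect(
  c("CHAPTER 1", "Chapter IV", "the chapter ended"),
  regex("^chapter [\\divxlc]", ignore_case = TRUE)
)
## [1]  TRUE  TRUE FALSE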

Word counts for words tagged “joy” in the NRC lexicon

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 161 x 2
##    word        n
##    <chr>   <int>
##  1 good       45
##  2 hope       25
##  3 happy      23
##  4 young      23
##  5 friend     18
##  6 found      17
##  7 love       17
##  8 kind       16
##  9 pretty     15
## 10 present    14
## # … with 151 more rows

Sentiment score

Sentiment scores grouped into 80-line buckets.

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
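
The %/% operator is integer division, so linenumber %/% 80 assigns each line to an 80-line bucket:

# Lines 0-79 fall in bucket 0, lines 80-159 in bucket 1, and so on
c(0, 79, 80, 159, 160) %/% 80
## [1] 0 0 1 1 2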

Plot the sentiment trends.

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Recreation

Charles Dickens’s Children’s Books

# Select Dickens titles on the "Children" bookshelf, excluding collected "Works"
dickens_works <- gutenberg_works(author == "Dickens, Charles", !str_detect(title, "Works")) %>%
  filter(str_detect(gutenberg_bookshelf, "Children"))
dickens_works_ids <- dickens_works$gutenberg_id
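
As a quick sanity check (a minimal sketch; the exact titles returned depend on the current Gutenberg metadata), we can list what the filter selected:

# Inspect which books survived the author and bookshelf filters
dickens_works %>%
  select(gutenberg_id, title)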

Word counts.

Tidy the selected corpus into a data frame.

dickens_childrens_books <- gutenberg_download(dickens_works_ids) %>%
  left_join(dickens_works) %>% # attach titles and metadata by gutenberg_id
  rename(book = title) %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
      ignore_case = TRUE
    )))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text) %>%
  select(book, linenumber, chapter, word)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Joining, by = "gutenberg_id"

Word counts for words tagged “joy” in the NRC lexicon

Words that you would expect in children’s books appear, such as “young” and “daughter.” However, interesting call-outs include “money” and “church,” which may not be associated with children’s books these days.

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

dickens_childrens_books %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 399 x 2
##    word         n
##    <chr>    <int>
##  1 good       554
##  2 young      350
##  3 found      224
##  4 money      160
##  5 merry      159
##  6 baby       146
##  7 love       146
##  8 daughter   128
##  9 church     123
## 10 child      121
## # … with 389 more rows
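
If call-outs like “money” feel mis-classified for a children’s corpus, they can be anti-joined out before counting. A minimal sketch, where the exclusion list is purely illustrative:

# Hypothetical exclusion list for terms that read oddly as "joy" here
odd_joy <- tibble(word = c("money", "church"))

dickens_childrens_books %>%
  inner_join(nrc_joy, by = "word") %>%
  anti_join(odd_joy, by = "word") %>%
  count(word, sort = TRUE)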

Sentiment score: “bing” lexicon

Sentiment scores grouped into 80-line buckets.

dickens_childrens_books_sentiment <- dickens_childrens_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"

Plot the sentiment trends. Example plot using the “bing” lexicon.

ggplot(filter(dickens_childrens_books_sentiment,book == "Holiday Romance"), aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x")

Sentiment score: DictionaryHE

Re-run the analysis with a different lexicon: DictionaryHE from the SentimentAnalysis package (Henry’s dictionary, originally developed for financial text).

library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
## 
##     write
# Load the DictionaryHE lexicon from the SentimentAnalysis package and reshape
# it into the tidy (word, sentiment) format that inner_join() expects
dict_HE_raw <- loadDictionaryHE()
dict.HE <- bind_rows(
  tibble(word = dict_HE_raw[["positiveWords"]], sentiment = "positive"),
  tibble(word = dict_HE_raw[["negativeWords"]], sentiment = "negative")
)
dickens_childrens_books_sentiment_HE <- dickens_childrens_books %>%
  inner_join(dict.HE) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
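
To get a feel for the new lexicon’s coverage (the exact counts depend on the installed SentimentAnalysis version):

# How many positive and negative terms does DictionaryHE contribute?
dict.HE %>%
  count(sentiment)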

Plot the sentiment trends.

ggplot(filter(dickens_childrens_books_sentiment_HE,book == "Holiday Romance"), aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x")

Conclusion

We can see how the choice of lexicon can affect the results in a major way, not only in absolute values but also in trends. This is not surprising here: DictionaryHE was built for financial text, so its coverage of a children’s corpus differs sharply from that of a general-purpose lexicon like “bing.”
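
To make the comparison concrete, a minimal sketch that stacks the two sentiment data frames built above and plots them side by side for the same book:

# Label each score with its source lexicon, then facet by lexicon
comparison <- bind_rows(
  mutate(dickens_childrens_books_sentiment, lexicon = "bing"),
  mutate(dickens_childrens_books_sentiment_HE, lexicon = "DictionaryHE")
) %>%
  filter(book == "Holiday Romance")

ggplot(comparison, aes(index, sentiment, fill = lexicon)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~lexicon, ncol = 1, scales = "free_y")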

Citation

Julia Silge and David Robinson. “Text Mining with R.” https://www.tidytextmining.com.