To start this lab I follow along with the sentiment analysis tutorial from the Text Mining with R (tidytextmining) textbook, available here: https://www.tidytextmining.com/sentiment.html.

Silge, Julia, and David Robinson. “2 Sentiment Analysis with Tidy Data.” Text Mining with R: A Tidy Approach, O’Reilly, Beijing, 2017, https://www.tidytextmining.com/sentiment.html. Accessed 8 Apr. 2022.

I begin by loading the required libraries, including the interesting janeaustenr library, which bundles the full text of Jane Austen's novels.
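The setup chunk isn't echoed in this write-up, but judging from the functions called throughout the lab, it amounts to something like:

library(tidytext)     # unnest_tokens(), get_sentiments()
library(janeaustenr)  # austen_books(), prideprejudice
library(dplyr)        # pipes, joins, count(), etc.
library(stringr)      # str_detect(), regex()
library(tidyr)        # pivot_wider()
library(ggplot2)      # plotting
library(wordcloud)    # wordcloud(), comparison.cloud()
library(reshape2)     # acast()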

The code below tokenizes each of Austen's books into a tidy one-word-per-row format, keeping track of the line number and chapter each word came from.

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                            ignore_case = TRUE)))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_books

Next it’s time to filter for words considered “joy” words with the help of the NRC lexicon, a dictionary that tags words with sentiment categories (joy, anger, trust, and so on, plus positive and negative).

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"

Here I create a new dataframe containing the sentiments across all the books, this time using the Bing lexicon, which categorizes words in a binary way as either negative or positive. The integer division linenumber %/% 80 buckets the text into 80-line sections, so each row of the result holds the positive and negative word counts, and their difference, for one section of one book.

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment

I plot the 80-line sections on the x-axis against the net sentiment (positive minus negative) on the y-axis, one panel per book:

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Next I compare how three lexicons (AFINN, Bing, and NRC) track sentiment across a single book. AFINN assigns each word a numeric score, while Bing and NRC label words as positive or negative, so the pipelines differ slightly.

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Returning to the full corpus, I count which individual words contribute most to each Bing sentiment category:

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

One anomaly the chart surfaces: “miss” is coded as negative in the Bing lexicon, but in Austen it is mostly a title for young, unmarried women. Following the tutorial, I build a custom stop-word list that adds “miss” to the standard one:

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
Next, a wordcloud of the most common words across all of the books, with standard stop words removed:

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
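The custom list isn't strictly needed for this cloud, but swapping it in would drop “miss” as well; a minimal sketch:

tidy_books %>%
  anti_join(custom_stop_words) %>%   # also removes the custom "miss" entry
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))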

The comparison cloud below is similar, but splits the words by Bing sentiment so the negative and positive words are shaded differently:

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

Tokens don't have to be single words. Here I tokenize the raw text of Pride & Prejudice into one sentence per row:

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")

p_and_p_sentences$sentence[2]
## [1] "by jane austen"
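One caveat from the tutorial: the sentence tokenizer can stumble over UTF-8 punctuation, such as curly quotes in dialogue. If that happens, converting the encoding in a mutate() before unnesting helps, something like:

tibble(text = prideprejudice) %>% 
  mutate(text = iconv(text, to = "latin1")) %>%  # normalize the encoding first
  unnest_tokens(sentence, text, token = "sentences")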
unnest_tokens() can also split text on a regex; here it breaks each book into chapters:

austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
Finally: which chapter of each book is the most negative? I pull the negative words from the Bing lexicon, count the total words per chapter (so long chapters aren't unfairly penalized), and then rank chapters by their ratio of negative words.

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups`
## argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.

For my own analysis I will be using a corpus of the Harry Potter books, provided by the harrypotter package.

# Make sure a recent enough devtools is available before installing from GitHub
if (packageVersion("devtools") < "1.6") {
  install.packages("devtools")
}

devtools::install_github("bradleyboehmke/harrypotter")
## Skipping install of 'harrypotter' from a github remote, the SHA1 (51f71461) has not changed since last install.
##   Use `force = TRUE` to force installation
library(harrypotter)
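Each book in the package comes as a character vector with one element per chapter (per the package README). A quick way to eyeball that structure:

length(deathly_hallows)            # number of chapters
substr(deathly_hallows[1], 1, 60)  # the opening of chapter one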

I'm interested only in the last book, my favorite of the series. Here I split it out on a sentence-per-row basis while labeling the chapters.

hallows <- deathly_hallows

hallows_table <- tibble(chapter = seq_along(hallows), text = hallows) %>% 
  unnest_tokens(sentence, text, token = "sentences")

hallows_table

For the next step I use the sentimentr package to score the sentiment of each sentence.

library(sentimentr)

hallows_sent <- hallows_table %>%  
  mutate(sentiment = round(sentiment_by(get_sentences(sentence))$ave_sentiment, 3))

Each row, which holds one sentence, now also carries a rounded sentiment score for that sentence (sentiment_by() averages within each element, and each element here is a single sentence).

hallows_sent

To gauge how negative or positive the book is chapter by chapter, I sum the sentence scores within each chapter and plot the result.

hallows_ratio <- hallows_sent %>%  
  group_by(chapter) %>% 
  summarize(chap_sentiment = sum(sentiment))

hallows_ratio

ggplot(hallows_ratio, aes(x = chapter, y = chap_sentiment)) +
  geom_col(show.legend = FALSE)

It seems like the book is exceedingly negative for the most part (which checks out considering You-Know-Who is in power for most of it). We can compare this to the earlier books in the series by re-doing the analysis for each one:

potter_book <- philosophers_stone

book_table <- tibble(chapter = seq_along(potter_book), text = potter_book) %>% 
  unnest_tokens(sentence, text, token = "sentences")

book_sent <- book_table %>%  
  mutate(sentiment = round(sentiment_by(get_sentences(sentence))$ave_sentiment, 3))

book_ratio <- book_sent %>%  
  group_by(chapter) %>% 
  summarize(chap_sentiment = sum(sentiment))

print(ggplot(book_ratio, aes(x = chapter, y = chap_sentiment)) +
  geom_col(show.legend = FALSE))

potter_book <- chamber_of_secrets

book_table <- tibble(chapter = seq_along(potter_book), text = potter_book) %>% 
  unnest_tokens(sentence, text, token = "sentences")

book_sent <- book_table %>%  
  mutate(sentiment = round(sentiment_by(get_sentences(sentence))$ave_sentiment, 3))

book_ratio <- book_sent %>%  
  group_by(chapter) %>% 
  summarize(chap_sentiment = sum(sentiment))

print(ggplot(book_ratio, aes(x = chapter, y = chap_sentiment)) +
  geom_col(show.legend = FALSE))

potter_book <- prisoner_of_azkaban

book_table <- tibble(chapter = seq_along(potter_book), text = potter_book) %>% 
  unnest_tokens(sentence, text, token = "sentences")

book_sent <- book_table %>%  
  mutate(sentiment = round(sentiment_by(get_sentences(sentence))$ave_sentiment, 3))

book_ratio <- book_sent %>%  
  group_by(chapter) %>% 
  summarize(chap_sentiment = sum(sentiment))

print(ggplot(book_ratio, aes(x = chapter, y = chap_sentiment)) +
  geom_col(show.legend = FALSE))

potter_book <- goblet_of_fire

book_table <- tibble(chapter = seq_along(potter_book), text = potter_book) %>% 
  unnest_tokens(sentence, text, token = "sentences")

book_sent <- book_table %>%  
  mutate(sentiment = round(sentiment_by(get_sentences(sentence))$ave_sentiment, 3))

book_ratio <- book_sent %>%  
  group_by(chapter) %>% 
  summarize(chap_sentiment = sum(sentiment))

print(ggplot(book_ratio, aes(x = chapter, y = chap_sentiment)) +
  geom_col(show.legend = FALSE))

potter_book <- order_of_the_phoenix

book_table <- tibble(chapter = seq_along(potter_book), text = potter_book) %>% 
  unnest_tokens(sentence, text, token = "sentences")

book_sent <- book_table %>%  
  mutate(sentiment = round(sentiment_by(get_sentences(sentence))$ave_sentiment, 3))

book_ratio <- book_sent %>%  
  group_by(chapter) %>% 
  summarize(chap_sentiment = sum(sentiment))

print(ggplot(book_ratio, aes(x = chapter, y = chap_sentiment)) +
  geom_col(show.legend = FALSE))

potter_book <- half_blood_prince

book_table <- tibble(chapter = seq_along(potter_book), text = potter_book) %>% 
  unnest_tokens(sentence, text, token = "sentences")

book_sent <- book_table %>%  
  mutate(sentiment = round(sentiment_by(get_sentences(sentence))$ave_sentiment, 3))

book_ratio <- book_sent %>%  
  group_by(chapter) %>% 
  summarize(chap_sentiment = sum(sentiment))

print(ggplot(book_ratio, aes(x = chapter, y = chap_sentiment)) +
  geom_col(show.legend = FALSE))

(If I had realized I would be comparing all of the books, I would have done the sentiment analysis on all of them from the start).
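For reference, a small helper (hypothetical, but mirroring the pipeline above) would collapse each repeated block into a single call:

# Tokenize a book into sentences, score each sentence with sentimentr,
# sum the scores by chapter, and plot the per-chapter totals.
plot_book_sentiment <- function(potter_book) {
  book_ratio <- tibble(chapter = seq_along(potter_book), text = potter_book) %>%
    unnest_tokens(sentence, text, token = "sentences") %>%
    mutate(sentiment = sentiment_by(get_sentences(sentence))$ave_sentiment) %>%
    group_by(chapter) %>%
    summarize(chap_sentiment = sum(sentiment))

  ggplot(book_ratio, aes(x = chapter, y = chap_sentiment)) +
    geom_col(show.legend = FALSE)
}

plot_book_sentiment(half_blood_prince)  # e.g., one call per book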