# install.packages("gutenbergr")

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(knitr)
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(gutenbergr) 

I start by getting the primary example code from Chapter 2 working, then extend it with a different corpus and incorporate at least one additional sentiment lexicon.

2.1 The sentiments dataset

# need to install the textdata package in order to run get_sentiments() on "afinn" (and later "nrc" and "loughran")
# install.packages("textdata")
get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,891 more rows

2.2 Sentiment analysis with inner join

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # … with 293 more rows
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
## # A tibble: 920 x 5
##    book                index negative positive sentiment
##    <fct>               <dbl>    <dbl>    <dbl>     <dbl>
##  1 Sense & Sensibility     0       16       32        16
##  2 Sense & Sensibility     1       19       53        34
##  3 Sense & Sensibility     2       12       31        19
##  4 Sense & Sensibility     3       15       31        16
##  5 Sense & Sensibility     4       16       34        18
##  6 Sense & Sensibility     5       16       51        35
##  7 Sense & Sensibility     6       24       40        16
##  8 Sense & Sensibility     7       23       51        28
##  9 Sense & Sensibility     8       30       40        10
## 10 Sense & Sensibility     9       15       19         4
## # … with 910 more rows
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

2.3 Comparing the three sentiment dictionaries

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(pride_prejudice %>% 
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          pride_prejudice %>% 
                            inner_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

# sentiment of the lexicon nrc
get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
# sentiment of the lexicon bing
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

2.4 Most common positive and negative words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # … with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

custom_stop_words <- bind_rows(tibble(word = c("miss"), 
                                          lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 x 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # … with 1,140 more rows

2.5 Wordclouds

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

2.6 Looking at units beyond just words

PandP_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")

PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

These are the chapters with the most sad words in each book, normalized for number of words in the chapter. What is happening in these chapters? In Chapter 43 of Sense and Sensibility Marianne is seriously ill, near death, and in Chapter 34 of Pride and Prejudice Mr. Darcy proposes for the first time (so badly!). Chapter 46 of Mansfield Park is almost the end, when everyone learns of Henry’s scandalous adultery, Chapter 15 of Emma is when horrifying Mr. Elton proposes, and in Chapter 21 of Northanger Abbey Catherine is deep in her Gothic faux fantasy of murder, etc. Chapter 4 of Persuasion is when the reader gets the full flashback of Anne refusing Captain Wentworth and how sad she was and what a terrible mistake she realized it to be.

Now it’s my turn to use a different corpus and incorporate an additional sentiment lexicon, Loughran.
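
Before that, a note: the Loughran-McDonald lexicon labels words with several categories besides positive and negative (for example litigious and uncertainty); only its positive and negative entries matter for the net-sentiment calculation later on. A quick way to peek at its categories:

# peek at the sentiment categories in the Loughran-McDonald lexicon (needs textdata)
get_sentiments("loughran") %>%
  count(sentiment, sort = TRUE)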

gutenberg_works()
## # A tibble: 40,737 x 8
##    gutenberg_id title author gutenberg_autho… language gutenberg_books… rights
##           <int> <chr> <chr>             <int> <chr>    <chr>            <chr> 
##  1            0  <NA> <NA>                 NA en       <NA>             Publi…
##  2            1 "The… Jeffe…             1638 en       United States L… Publi…
##  3            2 "The… Unite…                1 en       American Revolu… Publi…
##  4            3 "Joh… Kenne…             1666 en       <NA>             Publi…
##  5            4 "Lin… Linco…                3 en       US Civil War     Publi…
##  6            5 "The… Unite…                1 en       American Revolu… Publi…
##  7            6 "Giv… Henry…                4 en       American Revolu… Publi…
##  8            7 "The… <NA>                 NA en       <NA>             Publi…
##  9            8 "Abr… Linco…                3 en       US Civil War     Publi…
## 10            9 "Abr… Linco…                3 en       US Civil War     Publi…
## # … with 40,727 more rows, and 1 more variable: has_text <lgl>

I decided to use the text of “A Sentimental Journey Through France and Italy” by Laurence Sterne, which appears in the Harvard Classics / Best Books Ever Listings / Banned Books from Anne Haight’s List bookshelves.
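
One way the gutenberg_id could have been looked up (just a sketch; gutenberg_works() accepts dplyr-style filter conditions on the Gutenberg metadata):

# look up the ID by title; ID 804 is what gets downloaded below
gutenberg_works(str_detect(title, "Sentimental Journey")) %>%
  select(gutenberg_id, title, author)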

A_Sentimental_Journey_Thru_Fr_and_It <- gutenberg_download(804)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
A_Sentimental_Journey_Thru_Fr_and_It
## # A tibble: 4,353 x 2
##    gutenberg_id text                                                   
##           <int> <chr>                                                  
##  1          804 "                                    A"                
##  2          804 "                           SENTIMENTAL JOURNEY"       
##  3          804 "                                 THROUGH"             
##  4          804 "                            FRANCE AND ITALY;"        
##  5          804 ""                                                     
##  6          804 ""                                                     
##  7          804 "                              BY MR. YORICK."         
##  8          804 ""                                                     
##  9          804 "                     [THE REV. LAURENCE STERNE, M.A.]"
## 10          804 ""                                                     
## # … with 4,343 more rows
# found that the book has a repeated word CALAIS, which can serve as a chapter marker
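
A quick way to confirm that, reusing the same regex that marks the chapters below:

# count the lines that start with CALAIS, the repeated heading used as a chapter marker
A_Sentimental_Journey_Thru_Fr_and_It %>%
  filter(str_detect(text, regex("^CALAIS", ignore_case = TRUE))) %>%
  count()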

tidy_book <- A_Sentimental_Journey_Thru_Fr_and_It %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^CALAIS", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
unique(tidy_book$chapter)
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
# summary stats of linenumber which is the row_number()
summary(tidy_book$linenumber)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1140    2222    2202    3264    4353
tidy_sentiment <- tidy_book %>%
  inner_join(get_sentiments("bing")) %>%
  count(gutenberg_id, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
tidy_sentiment
## # A tibble: 55 x 5
##    gutenberg_id index negative positive sentiment
##           <int> <dbl>    <dbl>    <dbl>     <dbl>
##  1          804     0       16       17         1
##  2          804     1       29       17       -12
##  3          804     2       24       26         2
##  4          804     3       15       16         1
##  5          804     4        9       29        20
##  6          804     5       28       16       -12
##  7          804     6        8       21        13
##  8          804     7       35       18       -17
##  9          804     8       29       16       -13
## 10          804     9        9       15         6
## # … with 45 more rows
# plot of sentiment per gutenberg_id (only one book here, so a single facet)
ggplot(tidy_sentiment, aes(index, sentiment, fill = gutenberg_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~gutenberg_id, ncol = 2, scales = "free_x")

# Comparing the four sentiment dictionaries
afinn <- tidy_book %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc_and_lough <- bind_rows(
  tidy_book %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_book %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))) %>%
    mutate(method = "NRC"),
  # loughran also carries categories beyond positive/negative; they are joined in
  # here but ignored by the positive - negative calculation below
  tidy_book %>%
    inner_join(get_sentiments("loughran")) %>%
    mutate(method = "Loughran")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc_and_lough) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

What stood out to me is that AFINN and Bing et al. match in overall pattern, while NRC is a little different. Loughran sits almost entirely below zero, the opposite of the other three; the lexicon counts below suggest why.

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
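
The same count for the Loughran lexicon’s positive and negative entries (a quick check) helps explain why its curve above sits almost entirely below zero; the Loughran negative list is considerably larger than its positive list.

# sentiment of the lexicon loughran (positive/negative entries only)
get_sentiments("loughran") %>%
  filter(sentiment %in% c("positive",
                          "negative")) %>%
  count(sentiment)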

Most common positive and negative words

bing_word_counts <- tidy_book %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 756 x 3
##    word   sentiment     n
##    <chr>  <chr>     <int>
##  1 good   positive     72
##  2 poor   negative     68
##  3 well   positive     44
##  4 better positive     36
##  5 heaven positive     34
##  6 like   positive     31
##  7 great  positive     28
##  8 master positive     26
##  9 scarce negative     26
## 10 love   positive     25
## # … with 746 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

No need to add custom stop words here; the default stop_words list is enough.
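
If a context-dependent word did need excluding here (the way “miss” did for Austen), the same pattern from earlier would apply. A hypothetical sketch using “master”, which in this book is mostly a form of address rather than praise:

# hypothetical example: exclude a context-dependent word the same way "miss" was handled above
custom_stop_words_sj <- bind_rows(tibble(word = c("master"),
                                         lexicon = c("custom")),
                                  stop_words)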

Wordclouds

tidy_book %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

tidy_book %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_book %>%
  group_by(gutenberg_id, chapter) %>%
  summarize(words = n())

# instead of removing chapter 0, I included it 
# I sorted the following results by the ratio of negative words in each chapter, highest first
negative_ratios <- tidy_book %>%
  semi_join(bingnegative) %>%
  group_by(gutenberg_id, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  arrange(desc(ratio)) %>%
  top_n(n()) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
negative_ratios
## # A tibble: 17 x 5
##    gutenberg_id chapter negativewords words   ratio
##           <int>   <int>         <int> <int>   <dbl>
##  1          804       4             7   137 0.0511 
##  2          804       3            19   408 0.0466 
##  3          804      11            23   581 0.0396 
##  4          804      10            25   697 0.0359 
##  5          804       9            20   563 0.0355 
##  6          804       2            21   602 0.0349 
##  7          804      14            12   361 0.0332 
##  8          804       7            16   489 0.0327 
##  9          804       1            10   317 0.0315 
## 10          804       6            15   498 0.0301 
## 11          804      15             6   220 0.0273 
## 12          804      16           772 32977 0.0234 
## 13          804       0             6   292 0.0205 
## 14          804       5            27  1499 0.0180 
## 15          804       8             7   544 0.0129 
## 16          804      13             3   269 0.0112 
## 17          804      12             4   496 0.00806

Looking at the top 8 of the 17 chapter groups (16 CALAIS-marked chapters plus chapter 0 for the front matter), most negative first:

ggplot(head(negative_ratios, 8), aes(x = reorder(chapter, -ratio), y = ratio, fill = ratio)) +
  geom_col(show.legend = TRUE) +
  labs(x = "Chapter", y = "Negative Ratios", fill = "Negative Ratios") +
  scale_y_continuous(labels = scales::percent) +
  geom_text(aes(label = paste0(round(100 * ratio, 2), "%")),
            position = position_dodge(width = 0.9), vjust = -0.25)

Looking at the 8 chapters with the lowest negative-word ratio, i.e. the most positive by this measure:

positive_ratios <- tidy_book %>%
  semi_join(bingnegative) %>%
  group_by(gutenberg_id, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  arrange(ratio) %>%
  top_n(n()) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
positive_ratios
## # A tibble: 17 x 5
##    gutenberg_id chapter negativewords words   ratio
##           <int>   <int>         <int> <int>   <dbl>
##  1          804      12             4   496 0.00806
##  2          804      13             3   269 0.0112 
##  3          804       8             7   544 0.0129 
##  4          804       5            27  1499 0.0180 
##  5          804       0             6   292 0.0205 
##  6          804      16           772 32977 0.0234 
##  7          804      15             6   220 0.0273 
##  8          804       6            15   498 0.0301 
##  9          804       1            10   317 0.0315 
## 10          804       7            16   489 0.0327 
## 11          804      14            12   361 0.0332 
## 12          804       2            21   602 0.0349 
## 13          804       9            20   563 0.0355 
## 14          804      10            25   697 0.0359 
## 15          804      11            23   581 0.0396 
## 16          804       3            19   408 0.0466 
## 17          804       4             7   137 0.0511
ggplot(head(positive_ratios, 8), aes(x = reorder(chapter, ratio), y = ratio, fill = ratio)) +
  geom_col(show.legend = TRUE) +
  labs(x = "Chapter", y = "Negative Ratio (lowest = most positive)",
       fill = "Negative Ratio") +
  scale_y_continuous(labels = scales::percent) +
  geom_text(aes(label = paste0(round(100 * ratio, 2), "%")),
            position = position_dodge(width = 0.9), vjust = -0.25)

Chapter 4 turns out to have the highest share of negative words, at 5.11%, while Chapter 12 has the lowest, at 0.81%. I intentionally picked this book, A Sentimental Journey Through France and Italy, expecting plenty of negative terms, and that expectation shows up most clearly with the Loughran lexicon. It turned out I picked the right lexicon to explore; it probably has a wider (lengthier) list of negative words to match against.
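
To see what drives Chapter 4’s high ratio, the Bing-negative words it matched can be listed directly from the objects already built above:

# list the Bing-negative words appearing in chapter 4, the most negative chapter by ratio
tidy_book %>%
  filter(chapter == 4) %>%
  semi_join(bingnegative, by = "word") %>%
  count(word, sort = TRUE)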

Citation: Silge, Julia, and David Robinson. Text Mining with R: A Tidy Approach. O’Reilly Media, Inc., 2017.