This is an exploration of sentiment analysis using R packages. The code is taken from the book Text Mining with R: A Tidy Approach 1.
2.1 The sentiments datasets
library(tidytext)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
2.2 Sentiment analysis with inner join
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
2.3 Comparing the three sentiment dictionaries
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
* We can get a count of the positive and negative words in the different lexicons. Even though NRC gives an overall more positive impression of the work, its lexicon actually contains more negative terms than positive ones, roughly 3 negative words for every 2 positive.
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
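To make that ratio concrete, here is a small check (not from the book) that tallies both lexicons and computes negatives per positive; with the counts above it works out to roughly 1.4 for NRC and about 2.4 for bing.
# negative-to-positive ratio for the NRC and bing lexicons
bind_rows(
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    count(sentiment) %>%
    mutate(lexicon = "nrc"),
  get_sentiments("bing") %>%
    count(sentiment) %>%
    mutate(lexicon = "bing")) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(neg_per_pos = negative / positive)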
2.4 Most common positive and negative words
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
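The book stops at defining custom_stop_words; here is a minimal sketch of how it could be applied, assuming we only want to drop the custom ‘miss’ entry rather than the full stop-word list.
# re-count sentiment words with the custom "miss" entry removed
tidy_books %>%
  anti_join(filter(custom_stop_words, lexicon == "custom"), by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)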
2.5 Wordclouds
After removing stop words, the most common word in the Austen text is ‘time’.
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
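To check that claim without reading it off the cloud, a quick sketch (not in the original) prints the top counts after the same stop-word removal.
# top words after removing stop words, as a table instead of a cloud
tidy_books %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 5)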
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
2.6 Looking at units beyond just words
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
Here we load the text that we will use in the second half of this data exploration. We use the ‘gutenbergr’ package to load some books by Mark Twain.
library(gutenbergr)
library(SentimentAnalysis)
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# list Twain's works to look up the Gutenberg IDs used below
temp <- gutenberg_works(author == "Twain, Mark")
twain <- gutenberg_download(c("74", "76", "86", "1837"))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#split the books up to remove the index and preface
t1 <- twain %>% filter(gutenberg_id == 74)
t1 <- t1[459:nrow(t1),]
t2 <- twain %>% filter(gutenberg_id == 76)
t2 <- t2[514:nrow(t2),]
t3 <- twain %>% filter(gutenberg_id == 86)
t3 <- t3[332:nrow(t3),]
t4 <- twain %>% filter(gutenberg_id == 1837)
t4 <- t4[444:nrow(t4),]
twain_books <- bind_rows(t1, t2, t3, t4)
twain_books %>% group_by(gutenberg_id) %>% summarise(n())
## # A tibble: 4 × 2
## gutenberg_id `n()`
## <int> <int>
## 1 74 8413
## 2 76 11495
## 3 86 12580
## 4 1837 7706
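The row offsets above (459, 514, 332, 444) were found by inspecting the raw text. As an alternative, a small sketch of how one might list candidate chapter-heading rows using the same chapter regex as below; note that the table of contents can also match, so the result still needs a manual check.
# list candidate chapter-heading rows for one book (e.g. Tom Sawyer, id 74)
twain %>%
  filter(gutenberg_id == 74) %>%
  pull(text) %>%
  str_which(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
  head()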
2.2 Sentiment analysis with inner join
Before we can do sentiment analysis we need to further organize the data. The Twain data is not as clean as the Austen data: there are a number of rows we don't want included (preface, table of contents, table of illustrations).
tidy_books <- twain_books %>%
group_by(gutenberg_id) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(
text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)
))) %>%
ungroup() %>%
unnest_tokens(word, text) %>% mutate(
title = case_when(
gutenberg_id == 74 ~ "The Adventures of Tom Sawyer",
gutenberg_id == 76 ~ "Adventures of Huckleberry Finn",
gutenberg_id == 86 ~ "A Connecticut Yankee in King Arthur's Court",
gutenberg_id == 1837 ~ "The Prince and the Pauper"
)
)
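The case_when() above gets repeated later when we compute sentiment. A lookup table is one way to write the id-to-title mapping once; this is just a sketch with a hypothetical twain_titles tibble, not something the original code uses.
# map Gutenberg IDs to titles once, then join wherever a title column is needed
twain_titles <- tibble(
  gutenberg_id = c(74, 76, 86, 1837),
  title = c("The Adventures of Tom Sawyer",
            "Adventures of Huckleberry Finn",
            "A Connecticut Yankee in King Arthur's Court",
            "The Prince and the Pauper"))
# e.g. twain_books %>% left_join(twain_titles, by = "gutenberg_id")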
We use the ‘nrc’ lexicon to analyze ‘joy’ in The Adventures of Tom Sawyer. We find that the most common words associated with ‘joy’ are ‘good’ and ‘found’.
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(gutenberg_id == 74) %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 280 × 2
## word n
## <chr> <int>
## 1 good 105
## 2 found 83
## 3 hope 30
## 4 money 30
## 5 mighty 26
## 6 church 24
## 7 treasure 24
## 8 kind 23
## 9 glad 21
## 10 finally 19
## # … with 270 more rows
We use the bing lexicon to calculate the positive and negative sentiment for four works by Mark Twain.
library(tidyr)
twain_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(gutenberg_id, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative, title = case_when(
gutenberg_id == 74 ~ "The Adventures of Tom Sawyer",
gutenberg_id == 76 ~ "Adventures of Huckleberry Finn",
gutenberg_id == 86 ~ "A Connecticut Yankee in King Arthur's Court",
gutenberg_id == 1837 ~ "The Prince and the Pauper"
)
)
## Joining, by = "word"
Here we plot the sentiment in 80-line segments. Of the four chosen works, Adventures of Huckleberry Finn appears to be the most positive and The Adventures of Tom Sawyer the most negative.
library(ggplot2)
ggplot(twain_sentiment, aes(index, sentiment, fill = title)) +
geom_col(show.legend = FALSE) +
facet_wrap(~title, ncol = 2, scales = "free_x")
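As a rough check on that reading of the plot, a short sketch (not in the original) sums the per-chunk net sentiment for each title.
# net and average per-chunk sentiment for each work
twain_sentiment %>%
  group_by(title) %>%
  summarise(net = sum(sentiment),
            mean_per_chunk = mean(sentiment)) %>%
  arrange(desc(mean_per_chunk))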
2.3 Comparing the three sentiment dictionaries
tom_sawyer <- tidy_books %>%
filter(title == "The Adventures of Tom Sawyer")
tom_sawyer
## # A tibble: 71,125 × 5
## gutenberg_id linenumber chapter word title
## <int> <int> <int> <chr> <chr>
## 1 74 2 1 chapter The Adventures of Tom Sawyer
## 2 74 2 1 i The Adventures of Tom Sawyer
## 3 74 5 1 tom The Adventures of Tom Sawyer
## 4 74 7 1 no The Adventures of Tom Sawyer
## 5 74 7 1 answer The Adventures of Tom Sawyer
## 6 74 9 1 tom The Adventures of Tom Sawyer
## 7 74 11 1 no The Adventures of Tom Sawyer
## 8 74 11 1 answer The Adventures of Tom Sawyer
## 9 74 13 1 what’s The Adventures of Tom Sawyer
## 10 74 13 1 gone The Adventures of Tom Sawyer
## # … with 71,115 more rows
We take three separate lexicons and calculate the sentiment in 80-line segments for The Adventures of Tom Sawyer.
afinn <- tom_sawyer %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
tom_sawyer %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
tom_sawyer %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
We bind each lexicon’s sentiment and plot them side by side. The three approaches give surprisingly similar plots.
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
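To put a number on "surprisingly similar", here is a sketch (not from the book) that correlates the per-chunk scores across the three methods; chunks missing from one lexicon become NA, hence pairwise.complete.obs.
# correlation of per-chunk sentiment scores across the three methods
bind_rows(afinn, bing_and_nrc) %>%
  select(method, index, sentiment) %>%
  pivot_wider(names_from = method, values_from = sentiment) %>%
  select(-index) %>%
  cor(use = "pairwise.complete.obs")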
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
2.4 Most common positive and negative words
When we count positive and negative words again, ‘well’ and ‘good’ are the most common. I wonder if that is because common idioms contain those words. The most common negative word is ‘poor’. Mark Twain struggled with money throughout his life, and I wonder if his own financial insecurity crept into his work.
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,616 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 well positive 924
## 2 good positive 762
## 3 like positive 649
## 4 right positive 600
## 5 great positive 315
## 6 enough positive 306
## 7 poor negative 295
## 8 pretty positive 236
## 9 work positive 233
## 10 dead negative 229
## # … with 2,606 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("nigger"),
lexicon = c("custom")),
stop_words)
2.5 Wordclouds
library(wordcloud)
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
library(reshape2)
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
2.6 Looking at units beyond just words
# split The Adventures of Tom Sawyer (t1) into sentences
tom_sentences <- tibble(text = t1$text) %>%
unnest_tokens(sentence, text, token = "sentences")
tom_sentences$sentence[2]
## [1] "“tom!”"
twain_chapters <- twain_books %>%
group_by(gutenberg_id) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
twain_chapters %>%
group_by(gutenberg_id) %>%
summarise(chapters = n())
## # A tibble: 4 × 2
## gutenberg_id chapters
## <int> <int>
## 1 74 36
## 2 76 43
## 3 86 45
## 4 1837 47
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(gutenberg_id, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(gutenberg_id, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
## # A tibble: 4 × 5
## gutenberg_id chapter negativewords words ratio
## <int> <int> <int> <int> <dbl>
## 1 74 24 25 411 0.0608
## 2 76 13 76 2056 0.0370
## 3 86 17 170 3167 0.0537
## 4 1837 26 81 1393 0.0581
Now let’s add an additional lexicon: ‘DictionaryGI’, from the SentimentAnalysis package. It is a lexicon of opinionated words from the Harvard-IV dictionary as used in the General Inquirer software, a tool developed for textual content analysis.
We can use this new lexicon to calculate the percentage of words that are negative, just as we did with bing. The DictionaryGI lexicon gives rates similar to bing.
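DictionaryGI is a list with negative and positive character vectors. The as.data.frame() calls below work because the column they create is literally named DictionaryGI$negative, which is why the semi_join() spells that out; a slightly tidier sketch would build plain tibbles with a word column.
# tidier alternative: tibbles with a plain "word" column
GI_neg_tbl <- tibble(word = DictionaryGI$negative)
GI_pos_tbl <- tibble(word = DictionaryGI$positive)
# e.g. tidy_books %>% semi_join(GI_neg_tbl, by = "word")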
GI_neg <- as.data.frame(DictionaryGI$negative)
GI_pos <- as.data.frame(DictionaryGI$positive)
wordcounts <- tidy_books %>%
group_by(gutenberg_id, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(GI_neg, by=c('word'= 'DictionaryGI$negative')) %>%
group_by(gutenberg_id, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("gutenberg_id", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## `summarise()` has grouped output by 'gutenberg_id'. You can override using the `.groups` argument.
## # A tibble: 4 × 5
## gutenberg_id chapter negativewords words ratio
## <int> <int> <int> <int> <dbl>
## 1 74 24 28 411 0.0681
## 2 76 13 108 2056 0.0525
## 3 86 30 224 3605 0.0621
## 4 1837 28 87 1257 0.0692
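The two tables above show only the single most negative chapter per book, so a direct comparison is awkward. A sketch (not in the original) of an overall negative-word rate per book under each lexicon makes the "similar to bing" point more directly.
# overall negative-word rate per book: bing vs DictionaryGI
book_totals <- tidy_books %>% count(gutenberg_id, name = "words")
tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  count(gutenberg_id, name = "bing_negative") %>%
  left_join(tidy_books %>%
              semi_join(GI_neg, by = c("word" = "DictionaryGI$negative")) %>%
              count(gutenberg_id, name = "gi_negative"),
            by = "gutenberg_id") %>%
  left_join(book_totals, by = "gutenberg_id") %>%
  mutate(bing_rate = bing_negative / words,
         gi_rate = gi_negative / words)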
With R it is relatively easy to perform sentiment analysis that gives you a flavor for different works and authors. You can take a semi-structured corpus and transform and analyse it rapidly.
There are some limitations to this approach: these lexicons were created from modern works, so the results can only be taken so far. I think it makes sense to limit this kind of analysis to modern works, or to compare authors only to the work of their contemporaries.
I felt that each of the lexicons performed equally well on my Mark Twain corpus; no lexicon gave results that were out of line with the others. For the example Austen text, the NRC lexicon gave results that were far more positive than bing or AFINN, so I might not choose NRC for analysis of non-modern texts.
Citation: