library(gutenbergr)
library(stringr)

# Download the Project Gutenberg work with ID 11. Later steps read its `text`
# column line by line (the chapter detection matches on line starts).
book <- gutenberg_download(gutenberg_id = 11)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://www.gutenberg.lib.md.us
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)

# Build a one-word-per-row table of the book.
# `chapter` is a running count of lines that start with "CHAPTER", so text
# before the first such line gets chapter 0. Words listed in `stop_words`
# are dropped.
book_words <- book %>%
  mutate(
    chapter = cumsum(str_detect(text, "^CHAPTER"))
  ) %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(stop_words, by = "word")
library(ggplot2)

# Horizontal bar chart of the 20 most frequent remaining words.
# `reorder()` sorts the factor levels by count so that, after the flip,
# the most frequent word sits at the top.
book_words %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip()

# Total AFINN sentiment contributed by each word, most negative first.
# Every occurrence of a word adds that word's AFINN rating, so the totals
# reflect both how strongly a word is rated and how often it appears.
#
# The AFINN rating column is called `value` in current tidytext releases
# (early releases called it `score`); summing `score` there fails with
# "object 'score' not found".
book_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  summarize(total_sentiment = sum(value)) %>%
  arrange(total_sentiment)
## # A tibble: 321 × 2
##           word total_sentiment
##          <chr>           <int>
## 1         mock            -112
## 2         poor             -54
## 3          mad             -45
## 4        cried             -40
## 5       afraid             -24
## 6        tears             -22
## 7     offended             -20
## 8  interrupted             -18
## 9      puzzled             -18
## 10       angry             -15
## # ... with 311 more rows
# Bar chart of the mean AFINN rating per chapter, with bars filled according
# to whether the chapter's mean is above zero.
#
# "mock", "mad" and "dear" are excluded before averaging -- presumably
# because in this book they act as names or forms of address rather than
# carrying sentiment (TODO confirm the intent). Chapter 0, the text before
# the first "CHAPTER" heading, is left out of the plot.
#
# The AFINN rating column is called `value` in current tidytext releases
# (early releases called it `score`); averaging `score` there fails with
# "object 'score' not found".
book_words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  filter(!word %in% c("mock", "mad", "dear")) %>%
  group_by(chapter) %>%
  summarize(average_sentiment = mean(value)) %>%
  filter(chapter > 0) %>%
  ggplot(aes(chapter, average_sentiment, fill = average_sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  theme_minimal()

# For each chapter, plot the words with the highest tf-idf, treating every
# chapter as one document. Chapter 0 (text before the first "CHAPTER"
# heading) is excluded.
book_words %>%
  filter(chapter > 0) %>%
  count(chapter, word) %>%
  bind_tf_idf(term = word, document = chapter, n = n) %>%
  group_by(chapter) %>%
  # top_n() keeps ties, so a chapter may contribute more than six words.
  top_n(n = 6, wt = tf_idf) %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(mapping = aes(x = word, y = tf_idf)) +
  geom_col() +
  # Free scales: each panel shows only its own words and its own tf-idf range.
  facet_wrap(facets = ~ chapter, scales = "free") +
  coord_flip()

# Count two-word sequences (bigrams) across the book, most frequent first.
# Note that the `word` column holds a whole bigram here, not a single word,
# and that stop words are not removed in this pipeline.
book %>%
  # `chapter` is not used by the count below; it is kept because some
  # tidytext versions may take the remaining columns into account when
  # joining lines for n-gram tokenisation (NOTE(review): confirm).
  mutate(
    chapter = cumsum(str_detect(text, "^CHAPTER"))
  ) %>%
  unnest_tokens(output = word, input = text, token = "ngrams", n = 2) %>%
  count(word, sort = TRUE)
## # A tibble: 14,728 × 2
##          word     n
##         <chr> <int>
## 1    said the   210
## 2      of the   133
## 3  said alice   116
## 4        in a    97
## 5     and the    82
## 6      in the    80
## 7      it was    76
## 8      to the    69
## 9   the queen    65
## 10     as she    61
## # ... with 14,718 more rows
library(widyr)

# Pairwise correlation between words, based on their per-chapter counts,
# restricted to words that occur at least 30 times in the whole book.
#
# The frequency threshold must be evaluated per word. In current dplyr,
# count() on ungrouped input returns an ungrouped table, so a bare
# filter(sum(n) >= 30) would compare the grand total against 30 and keep
# every row. Grouping by word makes the threshold apply to each word's
# total; ungroup() then hands pairwise_cor() a plain table.
book_words %>%
  count(word, chapter) %>%
  group_by(word) %>%
  filter(sum(n) >= 30) %>%
  ungroup() %>%
  pairwise_cor(word, chapter, n, sort = TRUE)
## # A tibble: 552 × 3
##       item1    item2 correlation
##       <chr>    <chr>       <dbl>
## 1    turtle     mock   0.9995947
## 2      mock   turtle   0.9995947
## 3     march     hare   0.9988637
## 4      hare    march   0.9988637
## 5    hatter dormouse   0.9907407
## 6  dormouse   hatter   0.9907407
## 7      mock  gryphon   0.9795810
## 8   gryphon     mock   0.9795810
## 9    turtle  gryphon   0.9789628
## 10  gryphon   turtle   0.9789628
## # ... with 542 more rows