library(gutenbergr)
library(stringr)
# Fetch the text of Project Gutenberg work #11 (one row per line of text).
# NOTE(review): ID 11 is presumably "Alice's Adventures in Wonderland" -- confirm.
book <- gutenberg_download(gutenberg_id = 11)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://www.gutenberg.lib.md.us
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
# Build a one-word-per-row table from `book`:
#   * `chapter` is the running count of lines that start with "CHAPTER", so
#     anything before the first such line gets chapter 0;
#   * `text` is split into individual `word` tokens;
#   * words listed in tidytext's `stop_words` are dropped.
book_words <- book %>%
  mutate(
    chapter = cumsum(str_detect(string = text, pattern = "^CHAPTER"))
  ) %>%
  unnest_tokens(word, text) %>%
  filter(!(word %in% stop_words$word))
library(ggplot2)
# Horizontal bar chart of the 20 most frequent remaining words, with the bars
# ordered by frequency.
book_words %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  # Turn `word` into a factor ordered by its count so the bars are sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip()

# Total AFINN sentiment contributed by each word, most negative first.
#
# The lexicon's numeric column is called `score` in older tidytext releases and
# `value` in current ones (where AFINN is provided through textdata). Normalise
# it to `value` so the pipeline runs against either instead of failing with
# "object 'score' not found" on an up-to-date install.
afinn <- get_sentiments("afinn")
names(afinn)[names(afinn) == "score"] <- "value"

book_words %>%
  inner_join(afinn, by = "word") %>%
  group_by(word) %>%
  summarize(total_sentiment = sum(value)) %>%
  arrange(total_sentiment)
## # A tibble: 321 × 2
## word total_sentiment
## <chr> <int>
## 1 mock -112
## 2 poor -54
## 3 mad -45
## 4 cried -40
## 5 afraid -24
## 6 tears -22
## 7 offended -20
## 8 interrupted -18
## 9 puzzled -18
## 10 angry -15
## # ... with 311 more rows
# Mean AFINN sentiment per chapter, drawn as one bar per chapter and coloured by
# whether the mean is positive.
#
# The lexicon's numeric column is called `score` in older tidytext releases and
# `value` in current ones (where AFINN is provided through textdata). Normalise
# it to `value` so the pipeline runs against either instead of failing with
# "object 'score' not found" on an up-to-date install.
afinn <- get_sentiments("afinn")
names(afinn)[names(afinn) == "score"] <- "value"

book_words %>%
  inner_join(afinn, by = "word") %>%
  # NOTE(review): these three words are excluded by hand, presumably because in
  # this book they act as names / forms of address rather than sentiment.
  filter(!word %in% c("mock", "mad", "dear")) %>%
  group_by(chapter) %>%
  summarize(average_sentiment = mean(value)) %>%
  # Chapter 0 is whatever precedes the first "CHAPTER" heading; leave it out.
  filter(chapter > 0) %>%
  ggplot(aes(chapter, average_sentiment, fill = average_sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  theme_minimal()

# For every chapter, plot the words with the highest tf-idf (treating each
# chapter as a document), one facet per chapter.
book_words %>%
  # Skip the material that precedes the first "CHAPTER" heading.
  filter(chapter > 0) %>%
  count(chapter, word) %>%
  bind_tf_idf(word, chapter, n) %>%
  group_by(chapter) %>%
  # Keep the 6 highest-tf-idf rows within each chapter (ties are all kept).
  top_n(n = 6, wt = tf_idf) %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(mapping = aes(x = word, y = tf_idf)) +
  geom_col() +
  facet_wrap(facets = ~ chapter, scales = "free") +
  coord_flip()

# Count two-word sequences (bigrams) in the book, most frequent first.
# The output column keeps the name `word` even though each entry is a bigram.
book %>%
  mutate(
    chapter = cumsum(str_detect(string = text, pattern = "^CHAPTER"))
  ) %>%
  unnest_tokens(word, text, token = "ngrams", n = 2) %>%
  count(word, sort = TRUE)
## # A tibble: 14,728 × 2
## word n
## <chr> <int>
## 1 said the 210
## 2 of the 133
## 3 said alice 116
## 4 in a 97
## 5 and the 82
## 6 in the 80
## 7 it was 76
## 8 to the 69
## 9 the queen 65
## 10 as she 61
## # ... with 14,718 more rows
library(widyr)
# Pairwise correlation, across chapters, of the per-chapter counts of words
# that occur at least 30 times in total.
#
# `sum(n)` has to be evaluated per word. count() on ungrouped input returns an
# ungrouped result in current dplyr, so without an explicit group_by() the
# filter would compare the grand total of all counts against 30 and keep every
# word. Group explicitly, then drop the grouping again before correlating.
book_words %>%
  count(word, chapter) %>%
  group_by(word) %>%
  filter(sum(n) >= 30) %>%
  ungroup() %>%
  pairwise_cor(word, chapter, n, sort = TRUE)
## # A tibble: 552 × 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 turtle mock 0.9995947
## 2 mock turtle 0.9995947
## 3 march hare 0.9988637
## 4 hare march 0.9988637
## 5 hatter dormouse 0.9907407
## 6 dormouse hatter 0.9907407
## 7 mock gryphon 0.9795810
## 8 gryphon mock 0.9795810
## 9 turtle gryphon 0.9789628
## 10 gryphon turtle 0.9789628
## # ... with 542 more rows