Tidy text mining of Jane Austen's novels

library(janeaustenr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
original_books <- austen_books() %>%
  group_by(book) %>%
  # annotate each line with its position and a running chapter count,
  # detected from headings such as "Chapter 1" or "CHAPTER IV"
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()
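
As a quick sanity check (a sketch not part of the original output), the running chapter counter can be summarised per book:

original_books %>%
  group_by(book) %>%
  summarise(chapters = max(chapter))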

Restructure the text into the tidy one-token-per-row format with unnest_tokens()

library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)
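
By default unnest_tokens() lowercases each token and strips punctuation; for illustration only, the call above is equivalent to spelling out those defaults:

original_books %>%
  unnest_tokens(word, text, token = "words", to_lower = TRUE)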

Remove stop words with an anti_join() against tidytext's stop_words dataset

data(stop_words)
tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining, by = "word"
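
The "Joining, by" message appears because anti_join() guessed the key column; as an optional variant (shown only as a sketch), passing the key explicitly silences it:

tidy_books %>%
  anti_join(stop_words, by = "word")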

Find the most common words across all the books combined

tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 x 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ... with 13,904 more rows
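
The same counting works within each novel by adding book to count() (a sketch; output not shown here):

tidy_books %>%
  count(book, word, sort = TRUE)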

Because the word counts are stored in a tidy data frame, we can pipe them directly into the ggplot2 package to visualize the most common words

library(ggplot2)
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
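
Instead of the hard n > 600 cutoff, dplyr's slice_max() can pick a fixed number of top words; this is a sketch of an alternative, not part of the original figure:

tidy_books %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()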

# Optional next step: a document-term matrix for packages such as tm or topicmodels.
# DocumentTermMatrix() expects a corpus, so cast the tidy counts with cast_dtm() instead.
#library(tm)
#dtm <- tidy_books %>% count(book, word) %>% cast_dtm(book, word, n)
#inspect(dtm)