library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(janeaustenr)
library(ggplot2)
# Plot the ten highest tf-idf words of each Jane Austen novel, one facet per
# book, with bars sorted from highest (top) to lowest (bottom) inside each
# facet.
austen_books() %>%
  # Split the `text` column into one lower-cased word per row (column `word`).
  unnest_tokens(word, text, token = "words") %>%
  # Occurrences of each word within each book; the count lands in column `n`.
  count(book, word) %>%
  # Adds the `tf`, `idf` and `tf_idf` columns, treating each book as a document.
  bind_tf_idf(word, book, n) %>%
  group_by(book) %>%
  # Rank on tf_idf explicitly rather than on "whatever column is last"; this
  # also means dplyr no longer prints the "Selecting by tf_idf" message.
  # Ties at the cut-off are kept, as they were with top_n().
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  ggplot(
    aes(
      # Order words by tf_idf separately within each book. A single global
      # factor misplaces any word that appears in more than one book's top ten.
      x = reorder_within(word, tf_idf, book),
      y = tf_idf,
      fill = book
    )
  ) +
  geom_col(show.legend = FALSE) +
  # Strips the per-book suffix that reorder_within() appends to axis labels.
  scale_x_reordered() +
  facet_wrap(~book, ncol = 3, scales = "free") +
  labs(x = "", y = "tf-idf") +
  theme(panel.background = element_blank()) +
  # Horizontal bars so the word labels stay readable.
  coord_flip()
The plot above shows that the 10 most important words (by tf-idf) in each of Jane Austen’s books are names of characters and places.