# The pipe operator `%>%` enables subsetting of data by chaining functions from packages like tidytext and the tidyverse.
# Names of the packages to attach for this script.
MJ <- c(
  "janeaustenr", "dplyr", "stringr", "tidytext",
  "tidyr", "ggplot2", "textdata", "wordcloud"
)
# Attach every package by name. character.only = TRUE makes library() read
# each element as a string; invisible() discards the list of search paths
# that lapply() would otherwise print once per package.
invisible(lapply(MJ, library, character.only = TRUE))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: RColorBrewer
# Number every line within its book and tag it with a running chapter count.
# A line bumps the chapter counter when it starts with "chapter " followed by
# a digit or one of the letters i, v, x, l, c (matched case-insensitively).
# The result stays grouped by book.
# NOTE(review): "lilnenumber" looks like a typo for "linenumber"; the name is
# kept because code outside this view may refer to it.
line.chapter <- austen_books() %>%
  group_by(book) %>%
  mutate(lilnenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  )
# Preview the first 12 rows.
head(line.chapter, n = 12)
## # A tibble: 12 x 4
## # Groups: book [1]
## text book lilnenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## 11 "" Sense & Sensibility 11 1
## 12 "" Sense & Sensibility 12 1
# Split the text into one word per row.
# unnest_tokens() collapses the text of a grouped data frame within each
# group before tokenizing, which discards the per-line columns (lilnenumber,
# chapter). Ungroup first so those columns survive, then restore the grouping
# by book so later steps see the same grouping as before.
line.chapter.oneword <- line.chapter %>%
  ungroup() %>%
  unnest_tokens(word, text) %>%
  group_by(book)
head(line.chapter.oneword)
## # A tibble: 6 x 4
## # Groups: book [1]
## book lilnenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
# Drop every token that appears in tidytext's stop_words lexicon.
# The join key is named explicitly so the join does not depend on (or report)
# whichever columns the two tables happen to share.
data("stop_words")
clean.jane <- line.chapter.oneword %>%
  anti_join(stop_words, by = "word")
head(clean.jane)
## # A tibble: 6 x 4
## # Groups: book [1]
## book lilnenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 sensibility
## 3 Sense & Sensibility 3 0 jane
## 4 Sense & Sensibility 3 0 austen
## 5 Sense & Sensibility 5 0 1811
## 6 Sense & Sensibility 10 1 chapter
# Tally how often each word occurs in each book, most frequent first.
# clean.jane is grouped by book, so naming book here only spells out the
# per-book counting that count() already performs on the grouped input.
bunker <- count(clean.jane, book, word, sort = TRUE)
# Show the ten most frequent book/word pairs.
head(bunker, 10)
## # A tibble: 10 x 3
## # Groups: book [6]
## book word n
## <fct> <chr> <int>
## 1 Mansfield Park fanny 816
## 2 Emma emma 786
## 3 Sense & Sensibility elinor 623
## 4 Emma miss 599
## 5 Pride & Prejudice elizabeth 597
## 6 Mansfield Park crawford 493
## 7 Sense & Sensibility marianne 492
## 8 Persuasion anne 447
## 9 Mansfield Park miss 432
## 10 Northanger Abbey catherine 428
# Bar chart of the words that occur more than 400 times within a single book.
clean.jane %>%
  # Count per book explicitly instead of relying on clean.jane's grouping.
  count(book, word, sort = TRUE) %>%
  # Drop the grouping so reorder() below ranks words across all books; on a
  # grouped tibble mutate() would reorder within each book separately and the
  # bars would not be sorted globally.
  ungroup() %>%
  filter(n > 400) %>%
  # A word can pass the filter for more than one book (e.g. "miss"), and
  # geom_col() stacks those rows into one bar. Order by the summed count so
  # the bar order matches the bar heights.
  mutate(word = reorder(word, n, FUN = sum)) %>%
  # Fill by book so stacked segments are distinguishable.
  ggplot(aes(word, n, fill = book)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()