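The pipe operator %>% passes the result on its left as the first argument of the function on its right, so data can be subset and transformed by chaining functions from packages like tidytext and the tidyverse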

LOAD PACKAGES INTO THE R SESSION

# Names of the packages this analysis attaches.
MJ <- c(
  "janeaustenr", "dplyr", "stringr", "tidytext",
  "tidyr", "ggplot2", "textdata", "wordcloud"
)
# Attach every package in MJ. character.only = TRUE tells library() that it
# is being handed the package name as a string. The list returned by lapply()
# is not assigned, so it is printed (one search-path snapshot per package).
lapply(X = MJ, FUN = library, character.only = TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: RColorBrewer
## [[1]]
## [1] "janeaustenr" "stats"       "graphics"    "grDevices"   "utils"      
## [6] "datasets"    "methods"     "base"       
## 
## [[2]]
## [1] "dplyr"       "janeaustenr" "stats"       "graphics"    "grDevices"  
## [6] "utils"       "datasets"    "methods"     "base"       
## 
## [[3]]
##  [1] "stringr"     "dplyr"       "janeaustenr" "stats"       "graphics"   
##  [6] "grDevices"   "utils"       "datasets"    "methods"     "base"       
## 
## [[4]]
##  [1] "tidytext"    "stringr"     "dplyr"       "janeaustenr" "stats"      
##  [6] "graphics"    "grDevices"   "utils"       "datasets"    "methods"    
## [11] "base"       
## 
## [[5]]
##  [1] "tidyr"       "tidytext"    "stringr"     "dplyr"       "janeaustenr"
##  [6] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
## [11] "methods"     "base"       
## 
## [[6]]
##  [1] "ggplot2"     "tidyr"       "tidytext"    "stringr"     "dplyr"      
##  [6] "janeaustenr" "stats"       "graphics"    "grDevices"   "utils"      
## [11] "datasets"    "methods"     "base"       
## 
## [[7]]
##  [1] "textdata"    "ggplot2"     "tidyr"       "tidytext"    "stringr"    
##  [6] "dplyr"       "janeaustenr" "stats"       "graphics"    "grDevices"  
## [11] "utils"       "datasets"    "methods"     "base"       
## 
## [[8]]
##  [1] "wordcloud"    "RColorBrewer" "textdata"     "ggplot2"      "tidyr"       
##  [6] "tidytext"     "stringr"      "dplyr"        "janeaustenr"  "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"

GET LINE NUMBER AND CHAPTER OF JANEAUSTENR

# Annotate every line of text with its position inside its book and with the
# chapter it belongs to.
#   lilnenumber : running row index within each book
#                 NOTE(review): looks like a typo for "linenumber"; kept
#                 unchanged because renaming would change the column name.
#   chapter     : running count of lines that start with "chapter " followed
#                 by a digit or one of the letters i, v, x, l, c (matched
#                 case-insensitively); lines before the first such heading
#                 get 0.
# The result is left grouped by `book`, so later verbs operate per book.
line.chapter <- austen_books() %>%
  group_by(book) %>%
  mutate(
    lilnenumber = row_number(),
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  )

# Show the first 12 annotated lines.
head(line.chapter, 12)
## # A tibble: 12 x 4
## # Groups:   book [1]
##    text                    book                lilnenumber chapter
##    <chr>                   <fct>                     <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility           1       0
##  2 ""                      Sense & Sensibility           2       0
##  3 "by Jane Austen"        Sense & Sensibility           3       0
##  4 ""                      Sense & Sensibility           4       0
##  5 "(1811)"                Sense & Sensibility           5       0
##  6 ""                      Sense & Sensibility           6       0
##  7 ""                      Sense & Sensibility           7       0
##  8 ""                      Sense & Sensibility           8       0
##  9 ""                      Sense & Sensibility           9       0
## 10 "CHAPTER 1"             Sense & Sensibility          10       1
## 11 ""                      Sense & Sensibility          11       1
## 12 ""                      Sense & Sensibility          12       1

ONE WORD PER ROW

# Tokenise the `text` column into a new `word` column, one token per row.
# The preview printed by head() shows the tokens in lower case and only the
# `book` and `word` columns.
line.chapter.oneword <- unnest_tokens(line.chapter, word, text)

head(line.chapter.oneword)
## # A tibble: 6 x 2
## # Groups:   book [1]
##   book                word       
##   <fct>               <chr>      
## 1 Sense & Sensibility sense      
## 2 Sense & Sensibility and        
## 3 Sense & Sensibility sensibility
## 4 Sense & Sensibility by         
## 5 Sense & Sensibility jane       
## 6 Sense & Sensibility austen

removing stop words

# Load the `stop_words` data set into the workspace.
data("stop_words")

# Keep only the rows of line.chapter.oneword that have no match in
# stop_words. No `by` is given, so the join key is chosen from the shared
# column names and reported in the message echoed below.
clean.jane <- anti_join(line.chapter.oneword, stop_words)
## Joining, by = "word"

head(clean.jane)
## # A tibble: 6 x 2
## # Groups:   book [1]
##   book                word       
##   <fct>               <chr>      
## 1 Sense & Sensibility sense      
## 2 Sense & Sensibility sensibility
## 3 Sense & Sensibility jane       
## 4 Sense & Sensibility austen     
## 5 Sense & Sensibility 1811       
## 6 Sense & Sensibility chapter

word count

# Tally how often each word occurs, most frequent first. clean.jane is still
# grouped by `book`, so the tally is per (book, word) pair rather than for
# the whole corpus.
bunker <- count(clean.jane, word, sort = TRUE)

# Ten most frequent (book, word) pairs.
head(bunker, 10)
## # A tibble: 10 x 3
## # Groups:   book [6]
##    book                word          n
##    <fct>               <chr>     <int>
##  1 Mansfield Park      fanny       816
##  2 Emma                emma        786
##  3 Sense & Sensibility elinor      623
##  4 Emma                miss        599
##  5 Pride & Prejudice   elizabeth   597
##  6 Mansfield Park      crawford    493
##  7 Sense & Sensibility marianne    492
##  8 Persuasion          anne        447
##  9 Mansfield Park      miss        432
## 10 Northanger Abbey    catherine   428

plot of common words

# Horizontal bar chart of the most frequent words.
#
# count() keeps the grouping of its input, so when clean.jane is grouped by
# `book` the counts (and the n > 400 threshold) are per book. Without the
# ungroup() below, reorder() would then run once per book and the combined
# factor levels would follow book order instead of frequency, leaving the
# bars unsorted.
clean.jane %>%
  count(word, sort = TRUE) %>%
  filter(n > 400) %>%
  # Drop the grouping so a single, global ordering of the words is computed.
  ungroup() %>%
  # A word can pass the filter in more than one book; geom_col() stacks those
  # rows, so order by the summed count to match the drawn bar height.
  mutate(word = reorder(word, n, FUN = sum)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  # Flip the axes so the words are read along the vertical axis.
  coord_flip()