# Packages this script depends on.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2",
  "ggplot2", "tidyr", "scales", "janeaustenr"
)

# Names of the packages that are already installed.
existing <- as.character(installed.packages()[, "Package"])

# Install only what is missing.
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}

# Attach with library() rather than require(): library() stops with an error
# when a package cannot be loaded, whereas require() only returns FALSE and
# lets the script carry on until a much less obvious failure later.
library(dplyr)
library(tidytext)
library(jiebaR)
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
library(janeaustenr)

# Section 1.2

# Four short lines of verse used as toy input for the tokenising examples.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)

# Show the raw character vector.
text
## [1] "Because I could not stop for Death -"  
## [2] "He kindly stopped for me -"            
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
# Wrap the character vector in a tibble with one row per line of text.
# The line index is derived from the vector itself instead of a hard-coded
# 1:4, so the two columns stay in step if `text` is ever edited.
text_df <- tibble(line = seq_along(text), text = text)

# Show the resulting two-column table.
text_df
## # A tibble: 4 x 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop for Death -  
## 2     2 He kindly stopped for me -            
## 3     3 The Carriage held but just Ourselves -
## 4     4 and Immortality
# Split the `text` column into tokens, one per row, stored in a new `word`
# column (the default token unit is "words").
unnest_tokens(text_df, output = word, input = text)
## # A tibble: 20 x 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 for        
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality
#unnest_tokens(Data frame, output, input, token = "words")
#token:Unit for tokenizing,"words" (default), "characters", "ngrams", "skip_ngrams", "sentences", "lines"...
# Open the Austen corpus in the data viewer. View() opens a viewer window,
# which is only useful in an interactive session, so skip it when the script
# is run in batch mode.
if (interactive()) {
  austen_books() %>% View()
}

# The six novels completed by Jane Austen; the data contains the columns
# `text` and `book`.

# Annotate every line of every book with its position and its chapter.
#   * linenumber: row number within the book, so the original line order is
#     kept after tokenising.
#   * chapter: running count of chapter headings seen so far. str_detect()
#     flags lines matching the regex and cumsum() accumulates those flags.
# Regex notes: `^` anchors at the start of the line, `\d` matches a digit,
# and `ivxlc` are the letters used in Roman numerals; ignore_case = TRUE
# makes the match case-insensitive.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup()

original_books
## # A tibble: 73,422 x 4
##    text                  book                linenumber chapter
##    <chr>                 <fct>                    <int>   <int>
##  1 SENSE AND SENSIBILITY Sense & Sensibility          1       0
##  2 ""                    Sense & Sensibility          2       0
##  3 by Jane Austen        Sense & Sensibility          3       0
##  4 ""                    Sense & Sensibility          4       0
##  5 (1811)                Sense & Sensibility          5       0
##  6 ""                    Sense & Sensibility          6       0
##  7 ""                    Sense & Sensibility          7       0
##  8 ""                    Sense & Sensibility          8       0
##  9 ""                    Sense & Sensibility          9       0
## 10 CHAPTER 1             Sense & Sensibility         10       1
## # ... with 73,412 more rows
# unnest_tokens(word, text) pulls the tokens out of the `text` column and
# stores them, one per row, in a new `word` column.
tidy_books <- unnest_tokens(original_books, output = word, input = text)

tidy_books
## # A tibble: 725,055 x 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ... with 725,045 more rows
# Load the stop_words dataset.
data(stop_words)

# anti_join() removes from tidy_books every row whose `word` appears in
# stop_words. The join key is spelled out so the intent is explicit and
# dplyr does not have to infer it from the shared column names.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

# Peek at the first rows of the stop-word table.
head(stop_words)
## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART

# The rows shown above come from the lexicon named SMART (lexicon == "SMART").

# Tally how often each word occurs, most frequent first.
count(tidy_books, word, sort = TRUE)
## # A tibble: 13,914 x 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ... with 13,904 more rows
# count(word, sort = TRUE) tallies the rows per word and sorts by frequency;
# the explicit pipeline below is the long-hand way of writing the same thing.
tidy_books %>%
  group_by(word) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## # A tibble: 13,914 x 2
##    word   count
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ... with 13,904 more rows
# The output above is the same as the count() result.
# Next: keep only the words seen more than 600 times and turn `word` into a
# factor whose levels are ordered by n.
count(tidy_books, word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n))
## # A tibble: 13 x 2
##    word          n
##    <fct>     <int>
##  1 miss       1855
##  2 time       1337
##  3 fanny       862
##  4 dear        822
##  5 lady        817
##  6 sir         806
##  7 day         797
##  8 emma        787
##  9 sister      727
## 10 house       699
## 11 elizabeth   687
## 12 elinor      623
## 13 hope        601
# filter(n > 600) keeps only part of the rows.
# word = reorder(word, n) re-orders `word` according to n.
library(ggplot2)

# Bar chart of the most frequent words.
count(tidy_books, word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL) +
  # coord_flip() swaps the x and y axes
  coord_flip()


# Example. Basic workflow: load the data -> remove the stop words ->
# count word frequencies -> plot.

# Download four texts from Project Gutenberg, selected by their ebook ids.
hgwells <- gutenberg_download(gutenberg_id = c(35, 36, 5230, 159))

# Books can be downloaded from http://www.gutenberg.org/ by their EBook No.

# Tokenise the downloaded texts into one word per row, then drop the stop
# words. The join key is given explicitly instead of leaving dplyr to infer
# it from the shared column names.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Most frequent remaining words.
tidy_hgwells %>%
  count(word, sort = TRUE)
## # A tibble: 11,769 x 2
##    word       n
##    <chr>  <int>
##  1 time     454
##  2 people   302
##  3 door     260
##  4 heard    249
##  5 black    232
##  6 stood    229
##  7 white    222
##  8 hand     218
##  9 kemp     213
## 10 eyes     210
## # ... with 11,759 more rows
# Download five more texts from Project Gutenberg by ebook id.
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))

# Tokenise into one word per row and drop the stop words, with the join key
# given explicitly instead of leaving dplyr to infer it.
tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Most frequent remaining words.
tidy_bronte %>%
  count(word, sort = TRUE)
## # A tibble: 23,050 x 2
##    word       n
##    <chr>  <int>
##  1 time    1065
##  2 miss     855
##  3 day      827
##  4 hand     768
##  5 eyes     713
##  6 night    647
##  7 heart    638
##  8 looked   601
##  9 door     592
## 10 half     586
## # ... with 23,040 more rows
# Stack the three tidy word tables into one, tagging every row with its
# author.
bind_rows(
  tidy_bronte %>% mutate(author = "Brontë Sisters"),
  tidy_hgwells %>% mutate(author = "H.G. Wells"),
  tidy_books %>% mutate(author = "Jane Austen")
)
## # A tibble: 535,379 x 6
##    gutenberg_id word       author         book  linenumber chapter
##           <int> <chr>      <chr>          <fct>      <int>   <int>
##  1          767 _facsimile Bronte Sisters <NA>          NA      NA
##  2          767 title      Bronte Sisters <NA>          NA      NA
##  3          767 page       Bronte Sisters <NA>          NA      NA
##  4          767 edition_   Bronte Sisters <NA>          NA      NA
##  5          767 _which     Bronte Sisters <NA>          NA      NA
##  6          767 issued_    Bronte Sisters <NA>          NA      NA
##  7          767 _together  Bronte Sisters <NA>          NA      NA
##  8          767 with_      Bronte Sisters <NA>          NA      NA
##  9          767 _wuthering Bronte Sisters <NA>          NA      NA
## 10          767 heights_   Bronte Sisters <NA>          NA      NA
## # ... with 535,369 more rows
# bind_rows() stacks the rows of the three authors' tables.
# str_extract() then keeps only the first run of lower-case letters and
# apostrophes in each token, and count() tallies the words per author.
bind_rows(
  tidy_bronte %>% mutate(author = "Brontë Sisters"),
  tidy_hgwells %>% mutate(author = "H.G. Wells"),
  tidy_books %>% mutate(author = "Jane Austen")
) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word)
## # A tibble: 48,018 x 3
##    author         word            n
##    <chr>          <chr>       <int>
##  1 Bronte Sisters a               8
##  2 Bronte Sisters a'most          4
##  3 Bronte Sisters aback           1
##  4 Bronte Sisters abaht           1
##  5 Bronte Sisters abandon         8
##  6 Bronte Sisters abandoned      23
##  7 Bronte Sisters abandoning      1
##  8 Bronte Sisters abandonment     5
##  9 Bronte Sisters abase           1
## 10 Bronte Sisters abasement       4
## # ... with 48,008 more rows
# Compare with the previous result: this shows what spread() does.
# Each author's word counts are converted to proportions, then spread()
# turns the author values into one proportion column per author.
bind_rows(
  mutate(tidy_bronte, author = "Brontë Sisters"),
  mutate(tidy_hgwells, author = "H.G. Wells"),
  mutate(tidy_books, author = "Jane Austen")
) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  # the grouping is only needed for the per-author sum; drop it before
  # reshaping so no grouping is carried into later steps
  ungroup() %>%
  select(-n) %>% # drop the n column
  spread(author, proportion)
## # A tibble: 28,909 x 4
##    word        `Bronte Sisters` `H.G. Wells` `Jane Austen`
##    <chr>                  <dbl>        <dbl>         <dbl>
##  1 a                 0.0000319     0.0000150    0.00000919
##  2 a'most            0.0000159    NA           NA         
##  3 a'n't            NA            NA            0.00000460
##  4 aback             0.00000398    0.0000150   NA         
##  5 abaht             0.00000398   NA           NA         
##  6 abandon           0.0000319     0.0000150   NA         
##  7 abandoned         0.0000916     0.000180     0.00000460
##  8 abandoning        0.00000398    0.0000450   NA         
##  9 abandonment       0.0000199     0.0000150   NA         
## 10 abart            NA             0.0000150   NA         
## # ... with 28,899 more rows
# Word proportions per author, reshaped so that Jane Austen keeps its own
# column while the other two authors are stacked into author/proportion rows.
frequency <- bind_rows(
  mutate(tidy_bronte, author = "Brontë Sisters"),
  mutate(tidy_hgwells, author = "H.G. Wells"),
  mutate(tidy_books, author = "Jane Austen")
) %>%
  # str_extract(word, "[a-z']+") keeps only the letter part of each token,
  # e.g. _any_ becomes any
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  # the grouping is only needed for the per-author sum; drop it before
  # reshaping so no grouping is carried into later steps
  ungroup() %>%
  select(-n) %>% # drop the n column
  spread(author, proportion) %>%
  # name the two columns to stack instead of using a positional range, so
  # the selection does not depend on the column order spread() produces
  gather(author, proportion, `Brontë Sisters`, `H.G. Wells`)

frequency
## # A tibble: 57,818 x 4
##    word        `Jane Austen` author          proportion
##    <chr>               <dbl> <chr>                <dbl>
##  1 a              0.00000919 Bronte Sisters  0.0000319 
##  2 a'most        NA          Bronte Sisters  0.0000159 
##  3 a'n't          0.00000460 Bronte Sisters NA         
##  4 aback         NA          Bronte Sisters  0.00000398
##  5 abaht         NA          Bronte Sisters  0.00000398
##  6 abandon       NA          Bronte Sisters  0.0000319 
##  7 abandoned      0.00000460 Bronte Sisters  0.0000916 
##  8 abandoning    NA          Bronte Sisters  0.00000398
##  9 abandonment   NA          Bronte Sisters  0.0000199 
## 10 abart         NA          Bronte Sisters NA         
## # ... with 57,808 more rows
# Layout of `frequency`: gather() folded the Brontë Sisters and H.G. Wells
# columns into author/proportion rows and left Jane Austen as its own
# column, so Austen can be compared against each of the other two authors.
# Expect a warning about rows with missing values being removed.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Jane Austen`,
    color = abs(`Jane Austen` - proportion)
  )
) +
  # geom_abline() draws the reference line
  geom_abline(color = "gray40", linetype = 2) +
  # geom_jitter() holds the scatter-plot settings
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  # geom_text() prints the words on the plot
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  labs(x = NULL, y = "Jane Austen")
