### CH1 The Tidy Text Format

Example text: a few lines from an Emily Dickinson poem.

Convert the text into a data frame:
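The code that builds text_df was not kept in these notes; a minimal reconstruction, based on the output below:

text <- c("Because I could not stop for Death -",
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")

library(dplyr)
text_df <- tibble(line = 1:4, text = text)

text_df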

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## # A tibble: 4 x 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop for Death -  
## 2     2 He kindly stopped for me -            
## 3     3 The Carriage held but just Ourselves -
## 4     4 and Immortality
library(tidytext)

text_df %>%
  unnest_tokens(word, text) # the first argument names the output column, the second is the input text column
## # A tibble: 20 x 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 for        
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality
library(janeaustenr) # the text of Jane Austen's novels
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(), # add new columns
         chapter = cumsum(str_detect(text, # cumsum() takes a cumulative sum; str_detect() tests for a match
         regex("^chapter [\\divxlc]", # regex for chapter headings (arabic or roman numerals)
         ignore_case = TRUE)))) %>% # ignore case
  ungroup() # drop the grouping
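A quick sanity check of the chapter regex (a sketch; it matches lines starting with "chapter" followed by a digit or roman numeral):

str_detect(c("CHAPTER 1", "Chapter IV", "the chapter ended"),
           regex("^chapter [\\divxlc]", ignore_case = TRUE))
## [1]  TRUE  TRUE FALSE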

original_books
## # A tibble: 73,422 x 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ... with 73,412 more rows
# tokenize the text into one word per row
library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 x 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ... with 725,045 more rows
# remove common English stop words
data(stop_words)

tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining, by = "word"
# count how often each word appears
tidy_books %>%
  count(word, sort = TRUE) 
## # A tibble: 13,914 x 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ... with 13,904 more rows
library(ggplot2)

tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>% # reorder() orders the words by count for plotting
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL) # drop the y-axis title

library(gutenbergr) # Project Gutenberg texts

hgwells <- gutenberg_download(c(35, 36, 5230, 159)) # Gutenberg book IDs for four H.G. Wells novels
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
tidy_hgwells %>%
  count(word, sort = TRUE)
## # A tibble: 11,830 x 2
##    word       n
##    <chr>  <int>
##  1 time     461
##  2 people   302
##  3 door     260
##  4 heard    249
##  5 black    232
##  6 stood    229
##  7 white    224
##  8 hand     218
##  9 kemp     213
## 10 eyes     210
## # ... with 11,820 more rows
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767)) # five novels by the Brontë sisters
tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
tidy_bronte %>%
  count(word, sort = TRUE)
## # A tibble: 23,303 x 2
##    word       n
##    <chr>  <int>
##  1 time    1064
##  2 miss     854
##  3 day      826
##  4 hand     767
##  5 eyes     713
##  6 don’t    666
##  7 night    648
##  8 heart    638
##  9 looked   601
## 10 door     591
## # ... with 23,293 more rows
library(tidyr)

frequency <- bind_rows(mutate(tidy_bronte, author = "Brontë Sisters"),
                       mutate(tidy_hgwells, author = "H.G. Wells"), 
                       mutate(tidy_books, author = "Jane Austen")) %>% 
  mutate(word = str_extract(word, "[a-z']+")) %>% # strip the underscores Gutenberg uses for emphasis, keeping just the word
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
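  # widen so each author becomes a column, then lengthen back so that
  # `Jane Austen` stays as its own column to compare the others against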
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer(`Brontë Sisters`:`H.G. Wells`,
               names_to = "author", values_to = "proportion")

frequency
## # A tibble: 57,252 x 4
##    word      `Jane Austen` author          proportion
##    <chr>             <dbl> <chr>                <dbl>
##  1 a            0.00000919 Brontë Sisters  0.0000587 
##  2 a            0.00000919 H.G. Wells      0.0000148 
##  3 aback       NA          Brontë Sisters  0.00000391
##  4 aback       NA          H.G. Wells      0.0000148 
##  5 abaht       NA          Brontë Sisters  0.00000391
##  6 abaht       NA          H.G. Wells     NA         
##  7 abandon     NA          Brontë Sisters  0.0000313 
##  8 abandon     NA          H.G. Wells      0.0000148 
##  9 abandoned    0.00000460 Brontë Sisters  0.0000900 
## 10 abandoned    0.00000460 H.G. Wells      0.000178  
## # ... with 57,242 more rows
library(scales)

# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, 
                      color = abs(`Jane Austen` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), 
                       low = "darkslategray4", high = "gray75") +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "Jane Austen", x = NULL)
## Warning: Removed 40857 rows containing missing values (geom_point).
## Warning: Removed 40859 rows containing missing values (geom_text).

### CH2 Sentiment Analysis

library(tidytext) # provides the sentiment lexicons

get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,891 more rows
library(janeaustenr)
library(dplyr)
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, # cumsum() takes a cumulative sum; str_detect() tests for a match
              regex("^chapter [\\divxlc]", # regex for chapter headings
              ignore_case = TRUE)))) %>%  # ignore case
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>% # keep only words tagged "joy"
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # ... with 293 more rows
# The NRC lexicon tags words as positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, or trust; here we looked up the joy words in Emma.
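To see how the NRC categories are distributed, the lexicon's tags can be counted directly (a quick sketch):

get_sentiments("nrc") %>%
  count(sentiment, sort = TRUE)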
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>% # integer division groups the text into 80-line sections
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) + # hide the legend
  facet_wrap(~book, ncol = 2, scales = "free_x") # ncol sets the number of columns; "free_x" lets each panel scale its own x-axis

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ... with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", "negative"))) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

# "miss" is tagged negative by the lexicon, but in Austen it is mostly a title for young women, so treat it as a custom stop word
custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                 lexicon = c("custom")), 
                               stop_words)
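Note that custom_stop_words is not applied below; to use it, swap it in for stop_words (a sketch):

tidy_books %>%
  anti_join(custom_stop_words, by = "word")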
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

### CH3 Analyzing Word and Document Frequency: tf-idf

library(dplyr)
library(janeaustenr)
library(tidytext)

book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

total_words <- book_words %>% 
  group_by(book) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 40,379 x 4
##    book              word      n  total
##    <fct>             <chr> <int>  <int>
##  1 Mansfield Park    the    6206 160460
##  2 Mansfield Park    to     5475 160460
##  3 Mansfield Park    and    5438 160460
##  4 Emma              to     5239 160996
##  5 Emma              the    5201 160996
##  6 Emma              and    4896 160996
##  7 Mansfield Park    of     4778 160460
##  8 Pride & Prejudice the    4331 122204
##  9 Emma              of     4291 160996
## 10 Pride & Prejudice to     4162 122204
## # ... with 40,369 more rows
library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

freq_by_rank <- book_words %>% 
  group_by(book) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total) %>%
  ungroup()
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset) # fit a linear model on the log-log scale
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.6226      -1.1125
# Zipf's law says term frequency is roughly proportional to 1/rank, i.e. a
# slope near -1 on the log-log scale; the fitted slope of -1.11 is close.
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.62, slope = -1.1, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)
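As a cross-check on bind_tf_idf(), tf-idf can be computed by hand for a single term (a sketch): tf is n/total within a book, and idf is the natural log of the number of books divided by the number of books containing the term.

book_words %>%
  filter(word == "elinor") %>%
  mutate(tf = n / total, # term frequency within the book
         idf = log(n_distinct(book_words$book) / n()), # n() = number of books containing "elinor"
         tf_idf = tf * idf)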

Look at the terms with the highest tf-idf:

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 40,379 x 6
##    book                word          n      tf   idf  tf_idf
##    <fct>               <chr>     <int>   <dbl> <dbl>   <dbl>
##  1 Sense & Sensibility elinor      623 0.00519  1.79 0.00931
##  2 Sense & Sensibility marianne    492 0.00410  1.79 0.00735
##  3 Mansfield Park      crawford    493 0.00307  1.79 0.00551
##  4 Pride & Prejudice   darcy       373 0.00305  1.79 0.00547
##  5 Persuasion          elliot      254 0.00304  1.79 0.00544
##  6 Emma                emma        786 0.00488  1.10 0.00536
##  7 Northanger Abbey    tilney      196 0.00252  1.79 0.00452
##  8 Emma                weston      389 0.00242  1.79 0.00433
##  9 Pride & Prejudice   bennet      294 0.00241  1.79 0.00431
## 10 Persuasion          wentworth   191 0.00228  1.79 0.00409
## # ... with 40,369 more rows
library(forcats)

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) + # fct_reorder() reorders the factor levels by tf-idf
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

## Example: a corpus of physics texts

library(gutenbergr)
physics <- gutenberg_download(c(37729, 14725, 13476, 30155), 
                              meta_fields = "author")#下載古騰堡的書籍
physics_words <- physics %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>% # compute tf-idf per author
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan", 
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

plot_physics %>% 
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>% 
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~author, ncol = 2, scales = "free")

library(stringr)

physics %>% 
  filter(str_detect(text, "_k_")) %>% #str_detect()含有此種文字的文章
  select(text)
## # A tibble: 7 x 1
##   text                                                                  
##   <chr>                                                                 
## 1 surface AB at the points AK_k_B. Then instead of the hemispherical    
## 2 would needs be that from all the other points K_k_B there should      
## 3 necessarily be equal to CD, because C_k_ is equal to CK, and C_g_ to  
## 4 the crystal at K_k_, all the points of the wave CO_oc_ will have      
## 5 O_o_ has reached K_k_. Which is easy to comprehend, since, of these   
## 6 CO_oc_ in the crystal, when O_o_ has arrived at K_k_, because it forms
## 7 ρ is the average density of the matter and _k_ is a constant connected
physics %>% 
  filter(str_detect(text, "RC")) %>% 
  select(text)
## # A tibble: 44 x 1
##    text                                                                  
##    <chr>                                                                 
##  1 line RC, parallel and equal to AB, to be a portion of a wave of light,
##  2 represents the partial wave coming from the point A, after the wave RC
##  3 be the propagation of the wave RC which fell on AB, and would be the  
##  4 transparent body; seeing that the wave RC, having come to the aperture
##  5 incident rays. Let there be such a ray RC falling upon the surface    
##  6 CK. Make CO perpendicular to RC, and across the angle KCO adjust OK,  
##  7 the required refraction of the ray RC. The demonstration of this is,  
##  8 explaining ordinary refraction. For the refraction of the ray RC is   
##  9 29. Now as we have found CI the refraction of the ray RC, similarly   
## 10 the ray _r_C is inclined equally with RC, the line C_d_ will          
## # ... with 34 more rows
mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn",
                               "fig", "file", "cg", "cb", "cm",
                               "ab", "_k", "_k_", "_x")) # add custom stop words (typesetting artifacts in these texts)

physics_words <- anti_join(physics_words, mystopwords, 
                           by = "word")
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()

### CH4 Relationships Between Words: n-grams and Correlations

library(dplyr)
library(tidytext)
library(janeaustenr)

austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

##4.1.1 Counting and filtering n-grams

austen_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 193,210 x 2
##    bigram      n
##    <chr>   <int>
##  1 <NA>    12242
##  2 of the   2853
##  3 to be    2670
##  4 in the   2221
##  5 it was   1691
##  6 i am     1485
##  7 she had  1405
##  8 of her   1363
##  9 to the   1315
## 10 she was  1309
## # ... with 193,200 more rows
library(tidyr)

bigrams_separated <- austen_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ") # split each bigram into its two words

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) # drop bigrams containing stop words

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE) # recount after filtering

bigram_counts
## # A tibble: 28,975 x 3
##    word1   word2         n
##    <chr>   <chr>     <int>
##  1 <NA>    <NA>      12242
##  2 sir     thomas      266
##  3 miss    crawford    196
##  4 captain wentworth   143
##  5 miss    woodhouse   143
##  6 frank   churchill   114
##  7 lady    russell     110
##  8 sir     walter      108
##  9 lady    bertram     101
## 10 miss    fairfax      98
## # ... with 28,965 more rows
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

The same approach works for trigrams:

austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 6,141 x 4
##    word1     word2     word3         n
##    <chr>     <chr>     <chr>     <int>
##  1 <NA>      <NA>      <NA>      13260
##  2 dear      miss      woodhouse    20
##  3 miss      de        bourgh       17
##  4 lady      catherine de           11
##  5 poor      miss      taylor       11
##  6 sir       walter    elliot       10
##  7 catherine de        bourgh        9
##  8 dear      sir       thomas        8
##  9 replied   miss      crawford      7
## 10 sir       william   lucas         7
## # ... with 6,131 more rows

##4.1.2 Analyzing bigrams

bigrams_filtered %>%
  filter(word2 == "street") %>%
  count(book, word1, sort = TRUE)
## # A tibble: 33 x 3
##    book                word1           n
##    <fct>               <chr>       <int>
##  1 Sense & Sensibility harley         16
##  2 Sense & Sensibility berkeley       15
##  3 Northanger Abbey    milsom         10
##  4 Northanger Abbey    pulteney       10
##  5 Mansfield Park      wimpole         9
##  6 Pride & Prejudice   gracechurch     8
##  7 Persuasion          milsom          5
##  8 Sense & Sensibility bond            4
##  9 Sense & Sensibility conduit         4
## 10 Persuasion          rivers          4
## # ... with 23 more rows
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))
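The top tf-idf bigrams per book can then be plotted just like single words; a sketch using reorder_within(), as in the physics example of CH3:

bigram_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  mutate(bigram = reorder_within(bigram, tf_idf, book)) %>%
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)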

##4.1.3 Using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 1,178 x 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   be      580
##  2 not   to      335
##  3 not   have    307
##  4 not   know    237
##  5 not   a       184
##  6 not   think   162
##  7 not   been    151
##  8 not   the     135
##  9 not   at      126
## 10 not   in      110
## # ... with 1,168 more rows
AFINN <- get_sentiments("afinn")

AFINN
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)
library(ggplot2)

not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentiment value * number of occurrences",
       y = "Words preceded by \"not\"")

negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)
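These negated words can be plotted like the "not" words above; a sketch faceted by negation word:

negated_words %>%
  mutate(contribution = n * value) %>%
  group_by(word1) %>%
  slice_max(abs(contribution), n = 12) %>%
  ungroup() %>%
  mutate(word2 = reorder_within(word2, contribution, word1)) %>%
  ggplot(aes(contribution, word2, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~word1, scales = "free") +
  labs(x = "Sentiment value * number of occurrences",
       y = "Words preceded by a negation term")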

##4.1.4 Visualizing a network of bigrams with ggraph

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame() # build an igraph object with from, to, and edge attributes
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
bigram_graph
## IGRAPH 0797184 DN-- 86 71 -- 
## + attr: name (v/c), n (e/n)
## + edges from 0797184 (vertex names):
##  [1] NA      ->NA         sir     ->thomas     miss    ->crawford  
##  [4] captain ->wentworth  miss    ->woodhouse  frank   ->churchill 
##  [7] lady    ->russell    sir     ->walter     lady    ->bertram   
## [10] miss    ->fairfax    colonel ->brandon    sir     ->john      
## [13] miss    ->bates      jane    ->fairfax    lady    ->catherine 
## [16] lady    ->middleton  miss    ->tilney     miss    ->bingley   
## [19] thousand->pounds     miss    ->dashwood   dear    ->miss      
## [22] miss    ->bennet     miss    ->morland    captain ->benwick   
## + ... omitted several edges
library(ggraph)
set.seed(2017)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

##4.1.5 Visualizing bigrams in other texts

library(dplyr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(igraph)
library(ggraph)

count_bigrams <- function(dataset) {
  dataset %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word) %>%
    count(word1, word2, sort = TRUE)
}

visualize_bigrams <- function(bigrams) {
  set.seed(2016) # fix the random seed so layouts are reproducible
  a <- grid::arrow(type = "closed", length = unit(.15, "inches")) # arrow style for directed edges
  
  bigrams %>%
    graph_from_data_frame() %>% # build the network graph
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
library(gutenbergr)
kjv <- gutenberg_download(10)
library(stringr)

kjv_bigrams <- kjv %>%
  count_bigrams()
kjv_bigrams %>%
  filter(n > 40,
         !str_detect(word1, "\\d"),
         !str_detect(word2, "\\d")) %>%
  visualize_bigrams()

##4.2 Counting and correlating pairs of words with the widyr package

##4.2.1 Counting and correlating among sections

austen_section_words <- austen_books() %>%
  filter(book == "Pride & Prejudice") %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)
library(widyr)

# count words co-occurring within sections
word_pairs <- austen_section_words %>%
  pairwise_count(word, section, sort = TRUE) # count how often each pair of words shares a section
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_pairs
## # A tibble: 796,008 x 3
##    item1     item2         n
##    <chr>     <chr>     <dbl>
##  1 darcy     elizabeth   144
##  2 elizabeth darcy       144
##  3 miss      elizabeth   110
##  4 elizabeth miss        110
##  5 elizabeth jane        106
##  6 jane      elizabeth   106
##  7 miss      darcy        92
##  8 darcy     miss         92
##  9 elizabeth bingley      91
## 10 bingley   elizabeth    91
## # ... with 795,998 more rows
word_pairs %>%
  filter(item1 == "darcy")
## # A tibble: 2,930 x 3
##    item1 item2         n
##    <chr> <chr>     <dbl>
##  1 darcy elizabeth   144
##  2 darcy miss         92
##  3 darcy bingley      86
##  4 darcy jane         46
##  5 darcy bennet       45
##  6 darcy sister       45
##  7 darcy time         41
##  8 darcy lady         38
##  9 darcy friend       37
## 10 darcy wickham      37
## # ... with 2,920 more rows
# the most frequently co-occurring pair is the two protagonists, Elizabeth and Darcy

##4.2.2 Pairwise correlation

# we need to filter for at least relatively common words first
word_cors <- austen_section_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

word_cors
## # A tibble: 154,842 x 3
##    item1     item2     correlation
##    <chr>     <chr>           <dbl>
##  1 bourgh    de              0.951
##  2 de        bourgh          0.951
##  3 pounds    thousand        0.701
##  4 thousand  pounds          0.701
##  5 william   sir             0.664
##  6 sir       william         0.664
##  7 catherine lady            0.663
##  8 lady      catherine       0.663
##  9 forster   colonel         0.622
## 10 colonel   forster         0.622
## # ... with 154,832 more rows
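pairwise_cor() computes the phi coefficient, which equals the Pearson correlation of the two words' 0/1 presence indicators across sections; a sketch to verify one pair (has_word() is an illustrative helper, not part of widyr):

has_word <- function(w) {
  # 0/1 indicator of whether word w appears in each section
  sections <- sort(unique(austen_section_words$section))
  as.integer(sections %in%
               austen_section_words$section[austen_section_words$word == w])
}
cor(has_word("pounds"), has_word("thousand")) # should be close to 0.701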
word_cors %>%
  filter(item1 == "pounds")
## # A tibble: 393 x 3
##    item1  item2     correlation
##    <chr>  <chr>           <dbl>
##  1 pounds thousand       0.701 
##  2 pounds ten            0.231 
##  3 pounds fortune        0.164 
##  4 pounds settled        0.149 
##  5 pounds wickham's      0.142 
##  6 pounds children       0.129 
##  7 pounds mother's       0.119 
##  8 pounds believed       0.0932
##  9 pounds estate         0.0890
## 10 pounds ready          0.0860
## # ... with 383 more rows
word_cors %>%
  filter(item1 %in% c("elizabeth", "pounds", "married", "pride")) %>%
  group_by(item1) %>%
  slice_max(correlation, n = 6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()

set.seed(2016)

word_cors %>%
  filter(correlation > .15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()