Load the required packages

library(gutenbergr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

Download the texts

# The Happy Prince (rows 53-352 of the downloaded text)
book_prince <- gutenberg_download(30120) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
prince <- book_prince[53:352,]

# Download Cinderella
cinderalla <- gutenberg_download(10830) %>% filter(text!="") %>% 
  distinct(gutenberg_id, text)
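
An optional caching step (a sketch, not part of the original workflow; the file name is hypothetical) avoids re-downloading from the Gutenberg mirror on every knit:

# Cache the download locally so re-knitting does not hit the mirror each time
if (!file.exists("cinderalla.rds")) saveRDS(cinderalla, "cinderalla.rds")
cinderalla <- readRDS("cinderalla.rds")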

Re-process the text so that tokenization later keeps "happy prince" together as one token

text <- paste0(prince$text, collapse = ' ') # concatenate all lines into one string

# gsub replaces every match of the regular expression with HappyPrince
text <- gsub('(H|h)appy (P|p)rince','HappyPrince',text) 

df <- tibble(text = text) # store as a tibble
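
Since stringr is already loaded, a case-insensitive variant of the same replacement would also catch forms such as "HAPPY PRINCE" (a sketch, not used downstream; text_ci is a hypothetical name):

# Case-insensitive replacement via stringr
text_ci <- str_replace_all(paste0(prince$text, collapse = ' '),
                           regex('happy prince', ignore_case = TRUE),
                           'HappyPrince')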

TF-IDF

# Combine the two books; count total words and per-word frequencies for each
Cinderalla_words <- cinderalla %>% 
  unnest_tokens(word, text) %>% 
  mutate(book='Cinderalla') 

Prince_words <- df %>%
  unnest_tokens(word, text) %>% 
  mutate(book='Prince') 

# full_join here just stacks the two tibbles (the book columns differ, so no keys match)
book_words <- full_join(Cinderalla_words,Prince_words) %>% 
  count(book,word,sort = TRUE) 
## Joining, by = c("word", "book")
total_words <-book_words %>% 
  group_by(book) %>% 
  summarize(total=sum(n))

book_words <- left_join(book_words,total_words)
## Joining, by = "book"
book_words
## # A tibble: 2,031 x 4
##    book       word      n total
##    <chr>      <chr> <int> <int>
##  1 Prince     the     299  3475
##  2 Cinderalla the     282  4149
##  3 Cinderalla and     169  4149
##  4 Cinderalla to      154  4149
##  5 Prince     and     139  3475
##  6 Cinderalla her     134  4149
##  7 Prince     he      110  3475
##  8 Cinderalla of       93  4149
##  9 Prince     of       87  3475
## 10 Cinderalla she      77  4149
## # ... with 2,021 more rows
## Plot the term-frequency (tf) distributions
ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend =FALSE) +
  xlim(NA, 0.01) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

Result: both books show a long-tailed frequency distribution.

Check whether the data follow Zipf's law

# Compute each word's rank and term frequency

freq_by_rank <- book_words %>% 
  group_by(book) %>% 
  mutate(rank = row_number(),  
         `term frequency` = n/total)

freq_by_rank
## # A tibble: 2,031 x 6
## # Groups:   book [2]
##    book       word      n total  rank `term frequency`
##    <chr>      <chr> <int> <int> <int>            <dbl>
##  1 Prince     the     299  3475     1           0.0860
##  2 Cinderalla the     282  4149     1           0.0680
##  3 Cinderalla and     169  4149     2           0.0407
##  4 Cinderalla to      154  4149     3           0.0371
##  5 Prince     and     139  3475     2           0.04  
##  6 Cinderalla her     134  4149     4           0.0323
##  7 Prince     he      110  3475     3           0.0317
##  8 Cinderalla of       93  4149     5           0.0224
##  9 Prince     of       87  3475     4           0.0250
## 10 Cinderalla she      77  4149     6           0.0186
## # ... with 2,021 more rows
# Plot term frequency against rank (log-log) to check for a continuous inverse relationship
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.5, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

# The tail of the rank range is less regular, so fit a baseline on the most linear middle section (rank 10-100)
rank_subset <- freq_by_rank %>% 
  filter(rank < 100,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.9988      -0.8893
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.9988, slope = -0.8893, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

Result: the data roughly follow Zipf's law
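
As a finer check (a sketch, not in the original analysis), the exponent can be fitted for each book separately over the same rank window; slopes close to -1 would indicate a near-perfect Zipf fit:

# Fit the power-law slope per book on the linear middle section
freq_by_rank %>% 
  filter(rank > 10, rank < 100) %>% 
  group_by(book) %>% 
  summarize(slope = coef(lm(log10(`term frequency`) ~ log10(rank)))[2])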

# Compute TF-IDF across the two books
book_words <- book_words %>%
  bind_tf_idf(word, book, n)
book_words
## # A tibble: 2,031 x 7
##    book       word      n total     tf   idf tf_idf
##    <chr>      <chr> <int> <int>  <dbl> <dbl>  <dbl>
##  1 Prince     the     299  3475 0.0860     0      0
##  2 Cinderalla the     282  4149 0.0680     0      0
##  3 Cinderalla and     169  4149 0.0407     0      0
##  4 Cinderalla to      154  4149 0.0371     0      0
##  5 Prince     and     139  3475 0.04       0      0
##  6 Cinderalla her     134  4149 0.0323     0      0
##  7 Prince     he      110  3475 0.0317     0      0
##  8 Cinderalla of       93  4149 0.0224     0      0
##  9 Prince     of       87  3475 0.0250     0      0
## 10 Cinderalla she      77  4149 0.0186     0      0
## # ... with 2,021 more rows
# Rank words by TF-IDF to find the distinctive terms
book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 2,031 x 6
##    book       word            n      tf   idf  tf_idf
##    <chr>      <chr>       <int>   <dbl> <dbl>   <dbl>
##  1 Cinderalla cinderella     61 0.0147  0.693 0.0102 
##  2 Prince     swallow        45 0.0129  0.693 0.00898
##  3 Prince     happyprince    17 0.00489 0.693 0.00339
##  4 Cinderalla sisters        18 0.00434 0.693 0.00301
##  5 Prince     statue         12 0.00345 0.693 0.00239
##  6 Cinderalla godmother      12 0.00289 0.693 0.00200
##  7 Prince     egypt          10 0.00288 0.693 0.00199
##  8 Cinderalla slipper        11 0.00265 0.693 0.00184
##  9 Prince     city            9 0.00259 0.693 0.00180
## 10 Cinderalla clothes        10 0.00241 0.693 0.00167
## # ... with 2,021 more rows
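
With only two documents, idf can take just two values, which is why every word above shares idf = 0.693: bind_tf_idf uses the natural log of (number of documents / number of documents containing the word).

# Sanity check on the idf column
log(2 / 2) # 0 for words appearing in both books
log(2 / 1) # 0.6931472 for words unique to one book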

Compare stopword removal via TF-IDF with last week's results

# Plot the most distinctive words in each book
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf
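
One caveat in the plot above: the factor-levels trick gives all words a single global ordering, so bars within each facet are not guaranteed to be sorted. A sketch of a fix using tidytext's reorder_within() (assuming a tidytext version that provides it):

# Order words within each facet independently
book_words %>%
  group_by(book) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, book)) %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()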

Result: each story's protagonist ranks among the most important words.

Comparison with removing stopwords directly

Word counts after tokenizing and removing stopwords

# The Happy Prince: one word per row, stopwords removed
tidy_prince <- df %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>% 
  mutate(linenumber = row_number()) %>% 
  mutate(book="happy prince")
## Joining, by = "word"
# Cinderella: one word per row, stopwords removed
tidy_cinderalla <- cinderalla %>%
  unnest_tokens(word, text)  %>%
  anti_join(stop_words) %>% 
  mutate(linenumber = row_number()) %>% 
  mutate(book="cinderalla")
## Joining, by = "word"
## Compare with last week's results
# Count and sort each word's occurrences in the two books
bind_rows(tidy_prince, select(tidy_cinderalla, -gutenberg_id)) %>%
  group_by(word,book) %>% 
  summarise(sum = n()) %>% 
  arrange(desc(sum)) %>%
  ungroup() %>% 
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, sum, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "count") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by sum

The protagonists are clearly important, but other high-frequency words are not necessarily the most meaningful.

n-grams

# Build bigrams for each book, then combine the results
C_bigrams <- cinderalla %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  mutate(book='cinderella')
P_bigrams <- df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  mutate(book='happy prince')
tale_bigrams <- full_join(C_bigrams,P_bigrams)
## Joining, by = c("bigram", "book")
tale_bigrams
## # A tibble: 7,622 x 3
##    gutenberg_id bigram               book      
##           <int> <chr>                <chr>     
##  1        10830 illustration hewet's cinderella
##  2        10830 hewet's household    cinderella
##  3        10830 household stories    cinderella
##  4        10830 stories for          cinderella
##  5        10830 for little           cinderella
##  6        10830 little folks         cinderella
##  7        10830 folks illustrated    cinderella
##  8        10830 illustrated w        cinderella
##  9        10830 w h                  cinderella
## 10        10830 h thwaite            cinderella
## # ... with 7,612 more rows
# Count bigram occurrences to find the most frequent pairs
tale_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 5,565 x 2
##    bigram              n
##    <chr>           <int>
##  1 of the             63
##  2 to the             61
##  3 in the             41
##  4 said the           34
##  5 and the            23
##  6 the prince         21
##  7 the swallow        21
##  8 at the             20
##  9 the happyprince    17
## 10 i am               16
## # ... with 5,555 more rows

Result: without removing stopwords, many meaningless word pairs dominate.

Remove bigrams that contain stopwords

# Split each bigram into its two words, then drop any bigram where either word is a stopword
bigrams_separated <- tale_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)


# Count the bigrams that remain after stopword removal
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts
## # A tibble: 598 x 3
##    word1        word2           n
##    <chr>        <chr>       <int>
##  1 king's       son             9
##  2 town         councillors     5
##  3 fine         gold            4
##  4 glass        slipper         4
##  5 illustration cinderella      4
##  6 swallow      swallow         4
##  7 cinder       wench           3
##  8 court        ball            3
##  9 dear         prince          3
## 10 glass        slippers        3
## # ... with 588 more rows

All the stopwords have indeed been removed.

# Re-unite the filtered words into bigrams
bigrams_united <- bigrams_filtered %>%
  select(-gutenberg_id) %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united
## # A tibble: 663 x 2
##    bigram                    book      
##    <chr>                     <chr>     
##  1 illustration hewet's      cinderella
##  2 hewet's household         cinderella
##  3 household stories         cinderella
##  4 folks illustrated         cinderella
##  5 thwaite engraved          cinderella
##  6 artists vol               cinderella
##  7 cinderella 1855           cinderella
##  8 1855 illustration         cinderella
##  9 illustration frontispiece cinderella
## 10 frontispiece cinderella   cinderella
## # ... with 653 more rows

Trigrams built the same way

## Switch to tokens of three words; drop any trigram where a word is a stopword
bind_rows(cinderalla, df) %>% # combine the raw text of both books
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 37 x 4
##    word1     word2    word3         n
##    <chr>     <chr>    <chr>     <int>
##  1 beautiful sapphire lying         2
##  2 boys      wore     scarlet       2
##  3 bright    scarlet  cloaks        2
##  4 broken    lead     heart         2
##  5 bye       dear     prince        2
##  6 carry     amber    beads         2
##  7 cataract  swallow  swallow       2
##  8 catch     gold     fish          2
##  9 clean     white    pinafores     2
## 10 coarse    red      hands         2
## # ... with 27 more rows

Compare "beautiful" and "poor" across the two books

Last week's takeaway: "poor" and "beautiful" are frequent in both books, but they describe different things and carry different meanings.

# What each book describes as "beautiful"
bigrams_filtered %>%
  filter(word1 == "beautiful") %>%
  count(book,word2, sort = TRUE) %>% 
  arrange(book)
## # A tibble: 11 x 3
##    book         word2        n
##    <chr>        <chr>    <int>
##  1 cinderella   pair         2
##  2 cinderella   allowed      1
##  3 cinderella   creature     1
##  4 cinderella   lady         1
##  5 cinderella   princess     1
##  6 cinderella   stranger     1
##  7 happy prince houses       2
##  8 happy prince girl         1
##  9 happy prince jewels       1
## 10 happy prince reed         1
## 11 happy prince sapphire     1

In Cinderella, "beautiful" almost always describes people, while in The Happy Prince it mostly describes things.

# What each book describes as "poor"
bigrams_filtered %>%
  filter(word1 == "poor") %>%
  count(book,word2, sort = TRUE) %>% 
  arrange(book)
## # A tibble: 5 x 3
##   book         word2          n
##   <chr>        <chr>      <int>
## 1 cinderella   girl           2
## 2 cinderella   cinderella     1
## 3 cinderella   lady           1
## 4 happy prince house          2
## 5 happy prince prince         1

In both books, "poor" is almost always used for people.

# TF-IDF on bigrams for each book (the pipeline ends in ggplot, so bigram_tf_idf stores the plot)
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(book) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf
bigram_tf_idf 

Use negation words to flip the sentiment of the word that follows

# Look at every word that follows a negation word
negation_words <- c("not", "no", "never", "without")

n_bigrams <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  count(word1, word2, sort = TRUE)

n_bigrams
## # A tibble: 57 x 3
##    word1 word2       n
##    <chr> <chr>   <int>
##  1 no    longer      3
##  2 not   only        3
##  3 never seen        2
##  4 no    better      2
##  5 no    one         2
##  6 no    sooner      2
##  7 not   allowed     2
##  8 not   bring       2
##  9 not   know        2
## 10 not   to          2
## # ... with 47 more rows
# Keep only the pairs where the word after the negation carries sentiment,
# using the "bing" lexicon
BING <- get_sentiments("bing")

n_s_bigrams <- n_bigrams %>%
  inner_join(BING, by = c(word2 = "word")) %>%
  count(word1, word2, sort = TRUE) # re-counting rows, so nn is 1 for every pair
  
n_s_bigrams
## # A tibble: 9 x 3
##   word1 word2         nn
##   <chr> <chr>      <int>
## 1 no    better         1
## 2 no    mystery        1
## 3 no    uneasiness     1
## 4 not   approve        1
## 5 not   fail           1
## 6 not   lie            1
## 7 not   lose           1
## 8 not   solid          1
## 9 not   sorry          1

The pairs drop from 57 to 9.
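
A complementary sketch (following the approach in the tidytext book, not part of the original analysis): weighting each negated word by its AFINN score times its count shows which negated words distort the sentiment totals most. This assumes the AFINN lexicon is available through get_sentiments("afinn"); in older tidytext releases the score column is named score rather than value.

AFINN <- get_sentiments("afinn") # may prompt to download the lexicon

n_bigrams %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  mutate(contribution = n * value) %>% # occurrences times the word's score
  arrange(desc(abs(contribution)))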

# If the word before a sentiment word is a negation word, flip that sentiment

# First find all bigrams whose second word carries sentiment
sentiment_bigrams <- bigrams_separated %>%
  inner_join(BING, by = c(word2 = "word")) %>%
  count(word1, word2, sort = TRUE) 

# Positive words in the bing lexicon
total_positive_sentiment <- BING %>% filter(sentiment == "positive")

# Label the sentiment: positive = 1, negative = -1
sentiment_bigrams <- sentiment_bigrams %>% 
  mutate(sentiment=ifelse(word2 %in% total_positive_sentiment$word,1,-1))

# Flip the sentiment of bigrams whose first word is a negation word
inverse_bigrams <- sentiment_bigrams %>%
  mutate(sentiment=ifelse(word1 %in% negation_words, -1*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))

inverse_bigrams
## # A tibble: 464 x 5
##    word1 word2         n sentiment sentiment_tag
##    <chr> <chr>     <int>     <dbl> <chr>        
##  1 the   poor          9        -1 negative     
##  2 the   great         8         1 positive     
##  3 the   beautiful     6         1 positive     
##  4 a     good          5         1 positive     
##  5 a     great         5         1 positive     
##  6 fine  gold          4         1 positive     
##  7 a     beautiful     3         1 positive     
##  8 a     lovely        3         1 positive     
##  9 i     love          3         1 positive     
## 10 so    beautiful     3         1 positive     
## # ... with 454 more rows

Compare the sentiment totals before and after flipping

sentiment_bigrams <- sentiment_bigrams %>%
  mutate(tag = "origin")

inverse_bigrams <- inverse_bigrams %>%
  mutate(tag = "inverse")

compare_inverse <- bind_rows(sentiment_bigrams, inverse_bigrams) %>%
  group_by(tag,sentiment) %>% 
  summarise(sum(sentiment)) 

compare_inverse 
## # A tibble: 4 x 3
## # Groups:   tag [?]
##   tag     sentiment `sum(sentiment)`
##   <chr>       <dbl>            <dbl>
## 1 inverse        -1             -174
## 2 inverse         1              290
## 3 origin         -1             -177
## 4 origin          1              287
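
Note that sum(sentiment) above weights every distinct bigram equally, no matter how often it occurs. A sketch that weights each bigram by its count n instead (an assumption about the intended metric, not the original code):

# Net sentiment weighted by occurrence counts
bind_rows(sentiment_bigrams, inverse_bigrams) %>%
  group_by(tag) %>%
  summarise(net_sentiment = sum(sentiment * n))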

Word relationships

Relationships among all words from both books combined

# Keep bigrams that occur more than once
bigram_graph <- bigram_counts %>%
  filter(n > 1) %>%
  graph_from_data_frame()

set.seed(2019)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

set.seed(2019)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Word networks for each book separately

Build each book's bigrams

# The Happy Prince: bigrams with stopwords removed
P_bigrams <- df %>%  unnest_tokens(bigram, text, token = "ngrams", n = 2) 

P_bigrams_separated <- P_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

P_bigrams_filtered <- P_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# Count the bigrams after stopword removal
P_bigram_counts <- P_bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)


# Cinderella: bigrams with stopwords removed
C_bigrams <- cinderalla %>%  unnest_tokens(bigram, text, token = "ngrams", n = 2)

C_bigrams_separated <- C_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

C_bigrams_filtered <- C_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# Count the bigrams after stopword removal
C_bigram_counts <- C_bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

Word network for The Happy Prince

# Keep bigrams that occur more than once
P_bigram_graph <- P_bigram_counts %>%
  filter(n > 1) %>%
  graph_from_data_frame()


set.seed(2019)
ggraph(P_bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

Word network for Cinderella

# Keep bigrams that occur more than once
C_bigram_graph <- C_bigram_counts %>%
  filter(n > 1) %>%
  graph_from_data_frame()


set.seed(2019)
ggraph(C_bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

Takeaway: word-to-word connections in The Happy Prince are weak to begin with, and unlike Cinderella there is no protagonist-centered hub; perhaps because it is a short story, it cannot develop the clearer structure that a longer tale does.
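
To put rough numbers on this impression, a sketch (not part of the original write-up) compares basic igraph statistics of the two networks; fewer components and a higher maximum degree would mark the more hub-like structure of Cinderella's graph:

# Basic connectivity statistics for the two bigram graphs
data.frame(
  book       = c("happy prince", "cinderella"),
  nodes      = c(vcount(P_bigram_graph), vcount(C_bigram_graph)),
  edges      = c(ecount(P_bigram_graph), ecount(C_bigram_graph)),
  components = c(count_components(P_bigram_graph), count_components(C_bigram_graph)),
  max_degree = c(max(degree(P_bigram_graph)), max(degree(C_bigram_graph)))
)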