Install the required packages

packages <- c("dplyr", "tidytext", "gutenbergr", "ggplot2", "tidyr", "igraph", "ggraph")
existing <- as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(gutenbergr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(ggplot2)
library(tidyr)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

Download the books

scarlet <- gutenberg_download(244) %>%
  filter(text!="") %>% 
  distinct(gutenberg_id, text) %>% 
  mutate(book = "A Study in Scarlet", linenumber = row_number())
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
hound <- gutenberg_download(2852) %>% 
  filter(text!="") %>% 
  distinct(gutenberg_id, text) %>% 
  mutate(book = "The Hound of the Baskervilles", linenumber = row_number())

sign <- gutenberg_download(2097) %>% 
  filter(text!="") %>% 
  distinct(gutenberg_id, text) %>% 
  mutate(book = "The Sign of the Four", linenumber = row_number())

valley <- gutenberg_download(3289) %>% 
  filter(text!="") %>% 
  distinct(gutenberg_id, text) %>% 
  mutate(book = "The Valley of Fear", linenumber = row_number())

Tokenization

tokens_scar <- scarlet %>% unnest_tokens(word, text)
tokens_hound <- hound %>% unnest_tokens(word, text)
tokens_sign <- sign %>% unnest_tokens(word, text)
tokens_valley <- valley %>% unnest_tokens(word, text)
tokens_all <- bind_rows(tokens_scar, tokens_hound, tokens_sign, tokens_valley)

Term Frequency (tf)

Count word occurrences per book

book_words <- tokens_all %>%
  count(book, word, sort = TRUE)

total_words <- book_words %>% 
  group_by(book) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 22,631 x 4
##    book                          word      n total
##    <chr>                         <chr> <int> <int>
##  1 The Hound of the Baskervilles the    3331 59481
##  2 The Valley of Fear            the    3266 57955
##  3 A Study in Scarlet            the    2567 44202
##  4 The Sign of the Four          the    2341 43535
##  5 The Hound of the Baskervilles and    1628 59481
##  6 The Hound of the Baskervilles of     1594 59481
##  7 The Hound of the Baskervilles i      1468 59481
##  8 The Valley of Fear            and    1444 57955
##  9 The Valley of Fear            of     1441 57955
## 10 The Hound of the Baskervilles to     1408 59481
## # … with 22,621 more rows

“the” is the most frequent word in all four books. The Hound of the Baskervilles also has the largest total word count, which is likely why its other common words (“and”, “of”, “i”, “to”) appear so many times as well.
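
To factor out book length, one can compare relative frequencies rather than raw counts; a minimal sketch reusing the book_words tibble from above:

# n/total puts the four books on the same scale regardless of their length.
book_words %>%
  filter(word %in% c("and", "of", "i", "to")) %>%
  mutate(freq = n / total) %>%
  arrange(word, desc(freq))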

Distribution of term frequency (n/total)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE, bins = 30) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## Warning: Removed 573 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).

The histograms show that word usage in A Study in Scarlet, The Hound of the Baskervilles, and The Valley of Fear is less concentrated, with a large gap between the lowest and the second-lowest occurrence rates.

Term frequency and rank

freq_by_rank <- book_words %>% 
  group_by(book) %>% 
  mutate(rank = row_number(), `term frequency` = n/total)

freq_by_rank %>% arrange(rank)
## # A tibble: 22,631 x 6
## # Groups:   book [4]
##    book                          word      n total  rank `term frequency`
##    <chr>                         <chr> <int> <int> <int>            <dbl>
##  1 The Hound of the Baskervilles the    3331 59481     1           0.0560
##  2 The Valley of Fear            the    3266 57955     1           0.0564
##  3 A Study in Scarlet            the    2567 44202     1           0.0581
##  4 The Sign of the Four          the    2341 43535     1           0.0538
##  5 The Hound of the Baskervilles and    1628 59481     2           0.0274
##  6 The Valley of Fear            and    1444 57955     2           0.0249
##  7 A Study in Scarlet            and    1369 44202     2           0.0310
##  8 The Sign of the Four          i      1216 43535     2           0.0279
##  9 The Hound of the Baskervilles of     1594 59481     3           0.0268
## 10 The Valley of Fear            of     1441 57955     3           0.0249
## # … with 22,621 more rows

In all four books the top-ranked words are function words such as “the”, “and”, “of”, “i”, “a”, and “to”, and in each book the term frequency of “the” exceeds 5%.

Term frequency vs. rank on a log-log scale

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

All four books show the same downward trend in term frequency against rank, consistent with Zipf's law (frequency roughly inversely proportional to rank).

Estimate the slope and intercept

rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.6597      -1.1090
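
The fitted slope of about -1.1 is close to the -1 that Zipf's law predicts. As a quick check that the four books really share this trend, the same regression could be fit per book; a minimal sketch using the rank_subset tibble:

# Per-book log-log regressions; all four slopes should come out close to -1.
sapply(split(rank_subset, rank_subset$book),
       function(d) coef(lm(log10(`term frequency`) ~ log10(rank), data = d)))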

Add the fitted line

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.6597, slope = -1.109, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

Term Frequency - Inverse Document Frequency (tf-idf)

Compute tf, idf, and tf-idf with bind_tf_idf()

book_words <- book_words %>%
  bind_tf_idf(word, book, n)
book_words
## # A tibble: 22,631 x 7
##    book                          word      n total     tf   idf tf_idf
##    <chr>                         <chr> <int> <int>  <dbl> <dbl>  <dbl>
##  1 The Hound of the Baskervilles the    3331 59481 0.0560     0      0
##  2 The Valley of Fear            the    3266 57955 0.0564     0      0
##  3 A Study in Scarlet            the    2567 44202 0.0581     0      0
##  4 The Sign of the Four          the    2341 43535 0.0538     0      0
##  5 The Hound of the Baskervilles and    1628 59481 0.0274     0      0
##  6 The Hound of the Baskervilles of     1594 59481 0.0268     0      0
##  7 The Hound of the Baskervilles i      1468 59481 0.0247     0      0
##  8 The Valley of Fear            and    1444 57955 0.0249     0      0
##  9 The Valley of Fear            of     1441 57955 0.0249     0      0
## 10 The Hound of the Baskervilles to     1408 59481 0.0237     0      0
## # … with 22,621 more rows

The tf values here match the n/total ratios computed earlier.
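
As a quick sanity check (a sketch), the tf column from bind_tf_idf() should equal n/total exactly:

# The maximum absolute difference should be zero (up to floating-point error).
book_words %>%
  summarise(max_diff = max(abs(tf - n / total)))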

Sort by tf-idf

book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 22,631 x 6
##    book                          word            n      tf   idf  tf_idf
##    <chr>                         <chr>       <int>   <dbl> <dbl>   <dbl>
##  1 The Hound of the Baskervilles moor          163 0.00274 1.39  0.00380
##  2 The Hound of the Baskervilles henry         146 0.00245 1.39  0.00340
##  3 The Hound of the Baskervilles baskerville   111 0.00187 1.39  0.00259
##  4 The Valley of Fear            douglas       105 0.00181 1.39  0.00251
##  5 The Valley of Fear            mcmurdo       193 0.00333 0.693 0.00231
##  6 The Valley of Fear            mcginty        90 0.00155 1.39  0.00215
##  7 The Sign of the Four          sholto         66 0.00152 1.39  0.00210
##  8 The Hound of the Baskervilles mortimer       87 0.00146 1.39  0.00203
##  9 The Hound of the Baskervilles stapleton      85 0.00143 1.39  0.00198
## 10 The Sign of the Four          morstan        60 0.00138 1.39  0.00191
## # … with 22,621 more rows

Many character names appear here, including “henry”, “baskerville”, “douglas”, “mcmurdo”, “mcginty”, and so on.

Note that the idf of “mcmurdo” differs from the others, which tells us it appears in more than one book: besides The Valley of Fear, it also shows up in The Sign of the Four.
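
This can be verified directly by listing the books in which the word occurs; a minimal sketch using tokens_all:

# idf = ln(4 / number of books containing the word), so an idf of about
# 0.693 = ln(4/2) means the word occurs in exactly two of the four books.
tokens_all %>%
  distinct(book, word) %>%
  filter(word == "mcmurdo")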

Plot the top fifteen words per book

book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

The top words in each book are essentially all character names, with one exception: the top word in The Hound of the Baskervilles is “moor”, a word the other novels do not use at all while this one uses it heavily, which gives it a very high tf-idf score.
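
A quick confirmation (a sketch): filter book_words for “moor”.

# Only books that actually use a word get a row; a single row with
# idf = ln(4/1), about 1.39, confirms that "moor" appears in just one book.
book_words %>%
  filter(word == "moor")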

n-gram

Bigram tokenization

holmes <- bind_rows(scarlet, hound, sign, valley)

holmes_bigrams <- holmes %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% 
  select(-gutenberg_id) %>% 
  filter(!is.na(bigram))

holmes_bigrams
## # A tibble: 187,275 x 3
##    book               linenumber bigram                
##    <chr>                   <int> <chr>                 
##  1 A Study in Scarlet          1 a study               
##  2 A Study in Scarlet          1 study in              
##  3 A Study in Scarlet          1 in scarlet            
##  4 A Study in Scarlet          2 by a                  
##  5 A Study in Scarlet          2 a conan               
##  6 A Study in Scarlet          2 conan doyle           
##  7 A Study in Scarlet          4 original transcriber's
##  8 A Study in Scarlet          4 transcriber's note    
##  9 A Study in Scarlet          4 note this             
## 10 A Study in Scarlet          4 this etext            
## # … with 187,265 more rows

Line 3 of the text contains only “[1]”, which produces an NA when tokenized into bigrams; since we filter the NAs out, linenumber 3 does not appear in the final result.
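
To inspect the rows that get dropped, one could keep the NA bigrams instead of filtering them; a minimal sketch:

# Lines too short to form a bigram (a single token such as "[1]") come back as NA.
holmes %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(is.na(bigram)) %>%
  head()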

Bigram counts

holmes_bigrams %>% 
  count(bigram, sort = TRUE)
## # A tibble: 82,408 x 2
##    bigram       n
##    <chr>    <int>
##  1 of the    1364
##  2 in the     874
##  3 it was     537
##  4 to the     500
##  5 it is      439
##  6 at the     405
##  7 i have     395
##  8 and the    369
##  9 that i     350
## 10 upon the   342
## # … with 82,398 more rows

Most of the top bigrams are built from stop words, which makes them hard to interpret.

Separate the bigrams and drop those containing stop words

bigrams_separated <- holmes_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts
## # A tibble: 11,501 x 3
##    word1       word2        n
##    <chr>       <chr>    <int>
##  1 sir         henry      129
##  2 sherlock    holmes     116
##  3 dr          mortimer    69
##  4 sir         charles     64
##  5 dr          watson      43
##  6 jefferson   hope        33
##  7 miss        morstan     31
##  8 baker       street      28
##  9 baskerville hall        27
## 10 john        ferrier     23
## # … with 11,491 more rows

Many of these involve titles (“sir”, “dr”, “miss”) or place names.
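
Those titles can be pulled out explicitly; a minimal sketch restricted to “sir”, “dr”, and “miss”:

# Which names follow the most common titles?
bigrams_filtered %>%
  filter(word1 %in% c("sir", "dr", "miss")) %>%
  count(word1, word2, sort = TRUE)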

Recombine the separated words

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united
## # A tibble: 13,945 x 3
##    book               linenumber bigram                 
##    <chr>                   <int> <chr>                  
##  1 A Study in Scarlet          2 conan doyle            
##  2 A Study in Scarlet          4 original transcriber's 
##  3 A Study in Scarlet          4 transcriber's note     
##  4 A Study in Scarlet          4 prepared directly      
##  5 A Study in Scarlet          5 1887 edition           
##  6 A Study in Scarlet          6 including typographical
##  7 A Study in Scarlet          8 text include           
##  8 A Study in Scarlet          8 include adding         
##  9 A Study in Scarlet          8 underscore character   
## 10 A Study in Scarlet          9 square braces          
## # … with 13,935 more rows

Trigram tokenization, with stop words removed

holmes %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>% 
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 2,804 x 4
##    word1      word2     word3           n
##    <chr>      <chr>     <chr>       <int>
##  1 sir        henry     baskerville    17
##  2 sir        charles   baskerville    11
##  3 salt       lake      city            9
##  4 sir        charles's death           8
##  5 halliday's private   hotel           5
##  6 detective  police    force           4
##  7 dr         james     mortimer        4
##  8 lodge      341       vermissa        4
##  9 3          lauriston gardens         3
## 10 dr         mortimer  looked          3
## # … with 2,794 more rows

Again, many of these are character names.

Analyzing bigrams

Count the street names (words preceding “street”)

bigrams_filtered %>%
  filter(word2 == "street") %>%
  count(book, word1, sort = TRUE)
## # A tibble: 25 x 3
##    book                          word1         n
##    <chr>                         <chr>     <int>
##  1 The Hound of the Baskervilles baker        11
##  2 The Sign of the Four          baker        10
##  3 The Hound of the Baskervilles regent        8
##  4 A Study in Scarlet            baker         6
##  5 The Hound of the Baskervilles oxford        3
##  6 A Study in Scarlet            duncan        2
##  7 The Valley of Fear            sheridan      2
##  8 A Study in Scarlet            busy          1
##  9 A Study in Scarlet            henrietta     1
## 10 A Study in Scarlet            ragged        1
## # … with 15 more rows

Baker Street, the home of Holmes and Watson, appears many times.
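
The same check works in the other direction, e.g. which words follow “baker” (a sketch):

# Nearly all of these should be "baker street".
bigrams_filtered %>%
  filter(word1 == "baker") %>%
  count(book, word2, sort = TRUE)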

tf-idf on the bigrams

bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 11,922 x 6
##    book                         bigram              n      tf   idf  tf_idf
##    <chr>                        <chr>           <int>   <dbl> <dbl>   <dbl>
##  1 The Hound of the Baskervill… sir henry         129 0.0317  1.39  0.0439 
##  2 The Hound of the Baskervill… dr mortimer        69 0.0169  1.39  0.0235 
##  3 The Sign of the Four         miss morstan       31 0.0101  1.39  0.0140 
##  4 The Hound of the Baskervill… sir charles        63 0.0155  0.693 0.0107 
##  5 A Study in Scarlet           john ferrier       23 0.00735 1.39  0.0102 
##  6 The Hound of the Baskervill… baskerville ha…    27 0.00663 1.39  0.00919
##  7 The Valley of Fear           white mason        23 0.00627 1.39  0.00869
##  8 The Sign of the Four         athelney jones     18 0.00585 1.39  0.00811
##  9 The Sign of the Four         thaddeus sholto    18 0.00585 1.39  0.00811
## 10 The Hound of the Baskervill… sir charles's      23 0.00565 1.39  0.00783
## # … with 11,912 more rows

Plot the results

bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(book) %>% 
  top_n(10) %>% 
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

Advanced sentiment analysis on the bigram results

Filter the bigrams whose first word is “not”

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 368 x 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   a        58
##  2 not   to       41
##  3 not   be       40
##  4 not   have     37
##  5 not   know     29
##  6 not   the      29
##  7 not   been     25
##  8 not   for      19
##  9 not   so       17
## 10 not   only     14
## # … with 358 more rows

Use the AFINN lexicon to score and count the words that follow “not”

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn") , by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE)

not_words
## # A tibble: 70 x 3
##    word2   score     n
##    <chr>   <int> <int>
##  1 help        2     9
##  2 wish        1     4
##  3 fear       -2     3
##  4 leave      -1     3
##  5 trust       1     3
##  6 ashamed    -2     2
##  7 doubt      -1     2
##  8 easy        1     2
##  9 escape     -1     2
## 10 hurt       -2     2
## # … with 60 more rows

Plot “score” multiplied by “n” (each word's total contribution)

not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

Tally the words following each of the negation terms “not”, “no”, “never”, and “without” separately across the four novels, and take the top ten for each

negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE)

negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  group_by(word1) %>% 
  top_n(10) %>%
  ungroup() %>% 
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by negation term") +
  ylab("Sentiment score * number of occurrences") +
  facet_wrap(~word1, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by contribution

These results show that the words following “not” and “no” are mostly positive, while “never” is mostly followed by negative words. On their own, though, they do not tell us how much negation affects the sentiment analysis, so we compare the sentiment scores from the earlier analysis with scores adjusted for negation words such as “not”.
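
The adjustment used below works as follows: the unigram analysis already counted +score for a negated word, so adding -2 * score flips its net contribution to -score. A minimal worked example with a hypothetical AFINN score of +3:

# Hypothetical negated word with AFINN score +3 (e.g. a word following "not").
afinn_score <- 3
unigram_contribution <- afinn_score        # what the original AFINN analysis counted
adjustment <- -2 * afinn_score             # the correction applied in not_words_afinn below
unigram_contribution + adjustment          # net contribution becomes -3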

Sentiment scores from the AFINN lexicon alone vs. scores adjusted for negation

AFINN sentiment scores

all_afinn <- tokens_all %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(book, index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(status = "Original")
## Joining, by = "word"
not_words_afinn <- bigrams_separated %>%
  filter(word1 == "not" | word1 == "no" | word1 == "never" | word1 == "without") %>%
  inner_join(get_sentiments("afinn") , by = c(word2 = "word")) %>% 
  group_by(book, index = linenumber %/% 80) %>% 
  summarise(sentiment = (sum(score) * -2)) %>% 
  mutate(status = "Adjusted")

adjust_afinn <- bind_rows(all_afinn, not_words_afinn) %>% 
  group_by(book, index) %>% 
  summarise(sentiment = sum(sentiment)) %>% 
  mutate(status = "Adjustment")

ggplot(all_afinn, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(title = "AFINN", x = NULL, y = NULL)

Sentiment scores adjusted for the negation words “not”, “no”, “never”, and “without”

ggplot(adjust_afinn, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(title = "AFINN(Adjusted)", x = NULL, y = NULL)

The adjustment itself (the difference from the original scores)

ggplot(not_words_afinn, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(title = "Adjustment", x = NULL, y = NULL)

compare_afinn <- bind_rows(all_afinn, adjust_afinn, not_words_afinn)
ggplot(compare_afinn, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_grid(status~book, scales = "free_x") +
  labs(title = "Compare", x = NULL, y = NULL)

ggraph

Convert to a form ggraph accepts (an igraph object)

bigram_graph <- bigram_counts %>%
  filter(n > 10) %>%
  graph_from_data_frame()

bigram_graph
## IGRAPH f5182d0 DN-- 63 39 -- 
## + attr: name (v/c), n (e/n)
## + edges from f5182d0 (vertex names):
##  [1] sir        ->henry       sherlock   ->holmes     
##  [3] dr         ->mortimer    sir        ->charles    
##  [5] dr         ->watson      jefferson  ->hope       
##  [7] miss       ->morstan     baker      ->street     
##  [9] baskerville->hall        john       ->ferrier    
## [11] sir        ->charles's   white      ->mason      
## [13] wedding    ->ring        henry      ->baskerville
## [15] manor      ->house       athelney   ->jones      
## + ... omitted several edges

Draw the bigram network

set.seed(2017)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  labs(x = NULL, y = NULL)

Polish the bigram network

set.seed(2016)

a <- grid::arrow(type = "closed", length = unit(.1, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.05, 'inches')) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()