學習重點:雙字元字詞分析、情緒分析、詞頻分析

 

以魔戒三部曲為練習

先安裝需要的packages,並匯入

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales','gutenbergr','jiebaR')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(dplyr)
library(stringr)
require(tidytext)
require(jiebaR)
library(wordcloud2)
library(gutenbergr)
require(data.table)
require(ggplot2)
require(reshape2)
require(wordcloud)
require(tidyr)
require(readr)
require(scales)

#設定資料夾
#設定資料夾
setwd('C:/Users/XA/Documents/R')
getwd()

 

準備3情緒字典庫

get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")

 

匯入文本

# 魔戒索引
lortindex <-read.csv("lort_index.csv", header = TRUE, sep = ",")

lort1 <- read_delim("./the lord of rings 1.txt", col_names = "text", delim = "\n",quote = " ") %>%
  filter(text!="")

lort2 <- read_delim("./the lord of rings 2.txt", col_names = "text", delim = "\n",quote = " ") %>%
  filter(text!="")

lort3 <- read_delim("./the lord of rings 3.txt", col_names = "text", delim = "\n",quote = " ") %>%
  filter(text!="")

#將文本以書本和章節區分
lort1_parts <- lort1 %>%
  mutate(linenumber = row_number(),
         book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
         chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]", 
                                                 ignore_case = TRUE))))
lort2_parts <- lort2 %>%
  mutate(linenumber = row_number(),
         book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
         chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]", 
                                                 ignore_case = TRUE))))
lort3_parts <- lort3 %>%
  mutate(linenumber = row_number(),
         book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
         chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]", 
                                                 ignore_case = TRUE))))

lort_alltext <- bind_rows(lort1, lort2, lort3 )%>%
  mutate(book = cumsum(str_detect(text, regex("^<BOOK [A-Z]*>"))),
  chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]",
                                            ignore_case = TRUE))))
lort_alltext
## # A tibble: 35,789 x 3
##    text                                                        book chapter
##    <chr>                                                      <int>   <int>
##  1 "THE FELLOWSHIP OF THE RING "                                  0       0
##  2 "J.R.R.ToIkien "                                               0       0
##  3 "<BOOK I> "                                                    1       0
##  4 "Chapter 1 . A Long-expected Party "                           1       1
##  5 "When Mr. Bilbo Baggins of Bag End announced that he woul~     1       1
##  6 "celebrating his eleventy -first birthday with a party of~     1       1
##  7 "magnificence, there was much talk and excitement in Hobb~     1       1
##  8 "Bilbo was very rich and very peculiar, and had been the ~     1       1
##  9 "Shire for sixty years, ever since his remarkable disappe~     1       1
## 10 "unexpected return. The riches he had brought back from h~     1       1
## # ... with 35,779 more rows

 

匯入詞庫

#匯入停止文字
data(stop_words)
add_stop <- bind_rows(stop_words,
                      data_frame(word = c("ring", "lord", "rings", "j.r.r.toikien"), 
                                 lexicon = c("custom")))

#匯入人名 
#主角群
data_names_protagonists <- c("frodo", "baggins","samwise","sam", "meriadoc", "merry",
                             "peregrin","pippin", "pip", "gandalf","aragorn","strider",
                             "legolas", "gimli","boromir","denethor", "faramir", 
                             "galadriel", "celeborn", "elrond","arwen",
                             "bilbo","theoden", "treebeard")
#邪惡方
data_name_antagonists <- c("sauron", "ringwraiths", "nazgul", "nazgyl", "saruman",
                           "grima", "orcses","orcs","angmar" ,"goblins","trolls",
                           "barrow-wights", "gollum", "shelob", "balrog", "morgoth")

 

文本斷句

#word 分析
word_lort1 <- lort1_parts %>%
  unnest_tokens(word, text) %>%
  anti_join(add_stop)%>%
  group_by(word)

word_lort2 <- lort2_parts %>%
  unnest_tokens(word, text) %>%
  anti_join(add_stop)%>%
  group_by(word)

word_lort3 <- lort3_parts %>%
  unnest_tokens(word, text) %>%
  anti_join(add_stop)%>%
  group_by(word)

#三部合併 計算詞頻
lort_total <- bind_rows(lort1_parts %>%
                          mutate(book ="lort 1"),
                        lort2_parts %>%
                          mutate(book ="lort 2"),
                        lort3_parts %>%
                          mutate(book = "lort 3"))%>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE) %>% #以書本區分, 並計算各文字出現次數
  ungroup()

lort_total_words <- lort_total %>% 
  group_by(book) %>% 
  summarize(total = sum(n)) #計算每本書的字元數

lort_total <- left_join(lort_total, lort_total_words)#保留lort_total的資料,將lort_total_words合併進
lort_total
## # A tibble: 29,345 x 4
##    book   word      n  total
##    <chr>  <chr> <int>  <int>
##  1 lort 1 the   10998 179384
##  2 lort 2 the    8892 154929
##  3 lort 3 the    7563 128837
##  4 lort 1 and    7177 179384
##  5 lort 3 and    5943 128837
##  6 lort 2 and    5910 154929
##  7 lort 1 of     4674 179384
##  8 lort 2 of     3968 154929
##  9 lort 1 to     3740 179384
## 10 lort 1 a      3596 179384
## # ... with 29,335 more rows

   

詞頻分析

#三部曲詞頻繪圖, 篩選n>150的詞 ,並繪製長條圖
ggplot_word <- lort_total %>%
  anti_join(add_stop)%>%
  filter(!word %in% data_names_protagonists) %>% #篩掉人名
  filter(!word %in% data_name_antagonists) %>% #篩掉人名
  filter(n>150) %>%
  mutate(word = reorder(word, n)) %>% #根據n重排word排序
  ggplot(aes(word, n), fill = book) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_y") +
  labs(y = "Word Frequency",
       x = NULL) +
  geom_text(aes(label=n))+
  coord_flip()
ggplot_word

可以看到,“dark”在三部曲中都是數一數二的高頻率;而“light”,在前兩部還有200次以上的出現頻率,但第三部卻不在。另外可以看到出現150次以上的詞,三部曲逐漸遞減,可以判斷,故事也由第一部的介紹各種族、支線繁多,逐漸變化為第三部的單一主線為主,支線甚少。

 

透過統計字詞,以Zipf’s law檢視

#先計算字的頻率
freq_by_rank <- lort_total %>%
  group_by(book) %>%
  mutate(rank = row_number(), 
         prop = n/total) %>%
  ungroup()
freq_by_rank
## # A tibble: 29,345 x 6
##    book   word      n  total  rank   prop
##    <chr>  <chr> <int>  <int> <int>  <dbl>
##  1 lort 1 the   10998 179384     1 0.0613
##  2 lort 2 the    8892 154929     1 0.0574
##  3 lort 3 the    7563 128837     1 0.0587
##  4 lort 1 and    7177 179384     2 0.0400
##  5 lort 3 and    5943 128837     2 0.0461
##  6 lort 2 and    5910 154929     2 0.0381
##  7 lort 1 of     4674 179384     3 0.0261
##  8 lort 2 of     3968 154929     3 0.0256
##  9 lort 1 to     3740 179384     4 0.0208
## 10 lort 1 a      3596 179384     5 0.0200
## # ... with 29,335 more rows
#Zipf's law
ggplot_zipf <- freq_by_rank %>%
  mutate(word = reorder(word, prop))%>%
  filter(prop > 0.01)%>%
  ggplot(aes(word, prop, fill = book)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_y")+
  geom_text(aes(label=percent(prop)))+ #以百分比顯示各word的頻率
  coord_flip()
ggplot_zipf

#繪製曲線圖
ggplot_log <- freq_by_rank %>%
  ggplot(aes(rank, prop, color = book)) + 
  geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

ggplot_log

三部曲出現頻率最高的前三分別是the、and、of,並且分別佔the(6.13%/5.74%/5.87%)、
and(4.00%/3.81%/4.61%)、of(2.61%/2.56%/2.79%)。
而the/and大概是1.53/1.50/2.10,整體而言,與Zipf’s Law大致吻合。

 

再來看bind_tf_idf

#bind_tf_idf
book_idf <- lort_total %>%
  bind_tf_idf(word, book, n) %>%
  select(-total) %>%
  arrange(desc(tf_idf))

ggplot_idf <- book_idf %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(10) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 1, scales = "free") +
  coord_flip()
ggplot_idf

tf_idf是透過減少常用單詞的權重,來分析評估文檔的重要單詞,這樣可以更容易掌握各文本的key word。
  • 以第一部來看,ferry(渡船)、balin(巴林)、pony(小馬)都名列其中,這也與第一部的重要場景大致符合。
  • 第二部則集中在treebeard(樹鬚)、ents(樹人)、theoden(希優頓)、eomer(伊歐墨)、faramir(法拉墨)、半獸人名(ugl、grishnbkh)上。

       

    三情緒庫分析全文

afinn_all <- lort_alltext %>% 
  unnest_tokens(word, text) %>%
  anti_join(add_stop)%>%
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = chapter) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN")

bing_nrc_all <- bind_rows(lort_alltext %>% 
                            unnest_tokens(word, text) %>%
                            anti_join(add_stop)%>%
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          lort_alltext %>% 
                            unnest_tokens(word, text) %>%
                            anti_join(add_stop)%>%
                            inner_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = chapter, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

bind_rows(afinn_all, 
          bing_nrc_all) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  xlab("chapter")+
  facet_wrap(~method, ncol = 1, scales = "free_y")+
  geom_vline(xintercept = as.numeric(c(23,44)), color="black")

可以看到魔戒整部大多數都是處於負面的情緒,只有最後一章是三情緒庫都顯示正向的。

   

以afinn分析正向/負向最高頻率的字元

afinn_lort_all <- bind_rows( word_lort1 %>% 
                           inner_join(get_sentiments("afinn")) %>% 
                           mutate(method = "afinn to lort1"), 
                         word_lort2 %>%
                           inner_join(get_sentiments("afinn")) %>% 
                           mutate(method = "afinn to lort2"), 
                         word_lort3 %>%
                           inner_join(get_sentiments("afinn")) %>% 
                           mutate(method = "afinn to lort3")) 

afinn_word_counts_lort <- afinn_lort_all %>%
  filter(word!= "merry") %>% #把merry(人名)去除
  count(word, score, method, sort = TRUE) %>%
  ungroup()

afinn_word_counts_lort %>%
  mutate(cont = n*score) %>%
  arrange(desc(abs(cont))) %>%
  head(50) %>%
  mutate(word = reorder(word,cont))%>%
  ggplot(aes(word, n*score , fill = n*score>0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, scales = "free_y") +
  labs(y = "Lord of rings",
       x = NULL) +
  coord_flip()

afinn_word_counts_lort
## # A tibble: 2,190 x 4
##    word  score method             n
##    <chr> <int> <chr>          <int>
##  1 fire     -2 afinn to lort1   152
##  2 cried    -2 afinn to lort1   147
##  3 hope      2 afinn to lort1   143
##  4 grey     -1 afinn to lort1   134
##  5 fear     -2 afinn to lort1   126
##  6 hope      2 afinn to lort3   120
##  7 cried    -2 afinn to lort3   119
##  8 grey     -1 afinn to lort2   119
##  9 fear     -2 afinn to lort3   116
## 10 dead     -3 afinn to lort3   113
## # ... with 2,180 more rows
值得一題的是“hope”、“fair”三部曲都有出現,但中間一部的數量有明顯的下降;而負面詞則是第一部的“fire”、第二部的“evil”、第三部的“dead”為最高。

 

文字雲

txt_wordcloud <- lort_alltext %>%
  unnest_tokens(word, text) %>%
  anti_join(add_stop)%>%
  inner_join(get_sentiments("bing")) %>% 
  count(word, sentiment, sort = TRUE) %>%
  filter(word!= "merry") %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words =200)

txt_wordcloud

     

雙字元分析

lort_parts_double <- lort_alltext %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)%>%
  count(bigram, book, sort = TRUE)

lort_parts_double_sept <- lort_parts_double %>%
  separate(bigram, c("word1", "word2"), sep = " ")

#否定詞庫
avoid <- c("not", "no", "never", "without")

#AFINN分析否定詞
afinn_not_words <- lort_parts_double_sept %>%
  filter(word1 %in% avoid) %>%
   inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE)

afinn_not_words
## # A tibble: 255 x 4
##    word1 word2   score     n
##    <chr> <chr>   <int> <int>
##  1 no    doubt      -1     6
##  2 no    good        3     6
##  3 no    no         -1     6
##  4 not   like        2     6
##  5 not   love        3     6
##  6 not   wish        1     6
##  7 no    comfort     2     5
##  8 no    escape     -1     5
##  9 no    evil       -3     5
## 10 no    hope        2     5
## # ... with 245 more rows
#情緒長條圖
ggplot_afinn_not_words <- afinn_not_words %>%
  group_by(word1) %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  top_n(10)%>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score < 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word1, ncol = 2, scales = "free_y") +
  xlab("Words preceded by \"avoid\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()
ggplot_afinn_not_words

我們以“not”, “no”, “never”, “without”,為否定詞,並結合AFINN詞庫進行分析,

可以看到大部分還是以負面辭彙為主,像是“no welcome”、“not love”等。而“hope”在“no hope”、“not hope”、“without hope”都有看到,更顯現魔戒全文的負面情緒。

 

最後進行人名頻率分析

# 人名搜尋 good
name_search_prot <- lort_alltext %>%
  unnest_tokens(word, text) %>%
  filter(word %in% data_names_protagonists) %>%
  mutate(camp = "GOOD") %>%
  count(word, camp, sort = TRUE) #計算各文字出現次數
name_search_prot
## # A tibble: 23 x 3
##    word    camp      n
##    <chr>   <chr> <int>
##  1 frodo   GOOD   1830
##  2 sam     GOOD   1210
##  3 gandalf GOOD   1085
##  4 aragorn GOOD    683
##  5 pippin  GOOD    632
##  6 merry   GOOD    558
##  7 gimli   GOOD    373
##  8 legolas GOOD    328
##  9 bilbo   GOOD    273
## 10 boromir GOOD    262
## # ... with 13 more rows
# 人名搜尋 bad
name_search_anta <- lort_alltext %>%
  unnest_tokens(word, text) %>%
  filter(word %in% data_name_antagonists) %>%
  mutate(camp = "BAD") %>%
  count(word, camp, sort = TRUE) #計算各文字出現次數
name_search_anta 
## # A tibble: 12 x 3
##    word        camp      n
##    <chr>       <chr> <int>
##  1 gollum      BAD     379
##  2 saruman     BAD     257
##  3 sauron      BAD     123
##  4 nazgyl      BAD      50
##  5 shelob      BAD      29
##  6 trolls      BAD      21
##  7 balrog      BAD      13
##  8 ringwraiths BAD      10
##  9 angmar      BAD       5
## 10 goblins     BAD       5
## 11 orcses      BAD       2
## 12 morgoth     BAD       1
#人名長條圖
ggplot_name_good <- bind_rows(name_search_prot, name_search_anta) %>%
  mutate(word = reorder(word, n))%>%
  filter(n > 100) %>%
  ggplot(aes(x = word, y = n, color = camp)) +
  geom_bar(stat = "identity") +
  xlab("Name") +
  ylab("Number")+
  coord_flip()
ggplot_name_good

人名部分,我們分成主角陣營及邪惡陣營,可以看到“frodo”不愧是主角,全文出現達1800次,其次是“sam”,畢竟有“frodo”的場景,sam的名字應該就會出現,而“gandalf”身為重要配角也位列前三。反派方面,則是“gollum”為最高,其次是“saruman”和“sauron”。

 

#特搜 aragorn、sauron"和"ring"
three_vip <- lort_alltext %>%
  unnest_tokens(word, text) %>%
  filter(word == c("aragorn","sauron", "ring")) %>%
  count(word, chapter, sort = TRUE) #以章節區分, 並計算三名詞出現次數

ggplot_vip <- three_vip %>%
  ggplot(aes(x = chapter, y = n, color = word)) +
  geom_line(size=1) +
  geom_vline(xintercept = as.numeric(c(23,44)), color="black", size= 0.5)+
  xlab("Word") +
  ylab("Number")
ggplot_vip

最後,特搜一下“aragorn”、“sauron”、“ring”,可以看到“ring”在第一部大量出現,可能與說明魔戒的力量、九戒等有關;而“aragorn”在一部後半開始出現,第二部前半達到高峰出現,而第三部有兩段高點;“sauron”雖然身為魔君,但全文存在感實在不高。