組員:
B054020042 郭宗翰 B064020014 鄭子婷
M084020023 陳靖中 M084020046 葉君良
N074220002 陳柏翔 N074220022 黃姿榕
M084810010 吳曼瑄

主題:封神演義

載入套件

# 分析文集為《封神演義》-作者:陳仲琳
# 載入packages
require(dplyr)
## Warning: As of rlang 0.4.0, dplyr must be at least version 0.8.0.
## * dplyr 0.7.6 is too old for rlang 0.4.5.
## * Please update dplyr to the latest version.
## * Updating packages on Windows requires precautions:
##   <https://github.com/jennybc/what-they-forgot/issues/62>
require(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.3
require(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.5.3
## Warning: package 'jiebaRD' was built under R version 3.5.3
library(stringr)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.5.3
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(scales)
library(jiebaR)
library(readr)
library(devtools)
## Warning: package 'devtools' was built under R version 3.5.3
## Warning: package 'usethis' was built under R version 3.5.3
library(stringi)
library(pbapply)
## Warning: package 'pbapply' was built under R version 3.5.3
library(Rcpp)
library(RcppProgress)
## Warning: package 'RcppProgress' was built under R version 3.5.3
library(cidian)
# library(ropencc)

#設定讀取資料路徑
ROOT.DIR<- 'C:/Users/Sean/Documents/20200318_bookclub_1/w4_hw/'

古騰堡網站下載《封神演義》繁體文集

library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.5.3
Fengshen <- gutenberg_download(23910) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Warning: `list_len()` is deprecated as of rlang 0.2.0.
## Please use `new_list()` instead.
## This warning is displayed once per session.
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
## Warning: `env_bind_fns()` is deprecated as of rlang 0.3.0.
## Please use `env_bind_active()` instead.
## This warning is displayed once per session.
## Warning: `new_overscope()` is deprecated as of rlang 0.2.0.
## Please use `new_data_mask()` instead.
## This warning is displayed once per session.
## Warning: `overscope_eval_next()` is deprecated as of rlang 0.2.0.
## Please use `eval_tidy()` with a data mask instead.
## This warning is displayed once per session.
## Warning: `overscope_clean()` is deprecated as of rlang 0.2.0.
## This warning is displayed once per session.
Fengshen
## # A tibble: 3,011 x 2
##    gutenberg_id text                                                      
##           <int> <chr>                                                     
##  1        23910 第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香                                  
##  2        23910 古風一首:                                                
##  3        23910   混沌初分盤古先,太極兩儀四象懸。子天丑地人寅出,避除獸患有巢賢。~
##  4        23910   燧人取火免鮮食,伏羲畫卦陰陽前。神農治世嚐百草,軒轅禮樂婚姻聯。~
##  5        23910   少昊五帝民物阜,禹王治水洪波蠲。承平享國至四百,桀王無道乾坤顛,~
##  6        23910   日縱妹喜荒酒色,成湯造亳洗腥羶,放桀南巢拯暴虐,雲霓如願後蘇全。~
##  7        23910   三十一世傳殷紂,商家脈絡如斷弦:紊亂朝綱絕倫紀,殺妻誅子信讒言,~
##  8        23910   穢污宮闈寵妲己,蠆盆炮烙忠貞冤,鹿臺聚斂萬姓苦,愁聲怨氣應障天,~
##  9        23910   直諫剖心盡焚炙,孕婦刳剔朝涉殲,崇信姦回棄朝政,屏逐師保性何偏,~
## 10        23910   郊社不修宗廟廢,奇技淫巧盡心研,昵此罪人乃罔畏,沉酗肆虐如鸇鳶。~
## # ... with 3,001 more rows

使用正規表示式,將句子區分章節並斷出共1~100章回

Fengshen <- Fengshen %>% mutate(chapter = cumsum(str_detect(Fengshen$text, regex("第.*回(\u00a0|$)"))))
# 文集已經完成斷句了
head(Fengshen, 20)
## # A tibble: 20 x 3
##    gutenberg_id text                                               chapter
##           <int> <chr>                                                <int>
##  1        23910 第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香                                 1
##  2        23910 古風一首:                                               1
##  3        23910   混沌初分盤古先,太極兩儀四象懸。子天丑地人寅出,避除獸患有巢賢。~       1
##  4        23910   燧人取火免鮮食,伏羲畫卦陰陽前。神農治世嚐百草,軒轅禮樂婚姻聯。~       1
##  5        23910   少昊五帝民物阜,禹王治水洪波蠲。承平享國至四百,桀王無道乾坤顛,~       1
##  6        23910   日縱妹喜荒酒色,成湯造亳洗腥羶,放桀南巢拯暴虐,雲霓如願後蘇全。~       1
##  7        23910   三十一世傳殷紂,商家脈絡如斷弦:紊亂朝綱絕倫紀,殺妻誅子信讒言,~       1
##  8        23910   穢污宮闈寵妲己,蠆盆炮烙忠貞冤,鹿臺聚斂萬姓苦,愁聲怨氣應障天,~       1
##  9        23910   直諫剖心盡焚炙,孕婦刳剔朝涉殲,崇信姦回棄朝政,屏逐師保性何偏,~       1
## 10        23910   郊社不修宗廟廢,奇技淫巧盡心研,昵此罪人乃罔畏,沉酗肆虐如鸇鳶。~       1
## 11        23910   西伯朝商囚羑里,微子抱器走風湮。皇天震怒降災毒,若涉大海無淵邊。~       1
## 12        23910   天下荒荒萬民怨,子牙出世人中仙,終日垂絲釣人主,飛熊入夢獵岐田,~       1
## 13        23910   共載歸周輔朝政,三分有二日相沿。文考末集大勳沒,武王善述日乾乾。~       1
## 14        23910   孟津大會八百國,取彼凶殘伐罪愆。甲子昧爽會牧野,前徒倒戈反回旋。~       1
## 15        23910   若崩厥角齊稽首,血流漂杵脂如泉。戒衣甫著天下定,更於成湯增光妍。~       1
## 16        23910   牧馬華山示偃武,開我周家八百年。太白旗懸獨夫死,戰亡將士幽魂潛。~       1
## 17        23910   天挺人賢號尚父,封神壇上列花箋,大小英靈尊位次,商周演義古今傳。~       1
## 18        23910   成湯乃黃帝之後也,姓子氏。初,帝嚳次妃簡狄祈於高禖,有玄鳥之祥,        遂生契。契事唐~       1
## 19        23910   太戊  仲丁  外壬  河亶甲 祖乙  祖辛         1
## 20        23910   沃甲  祖丁  南庚  陽甲  盤庚  小辛         1

文集斷詞

設定斷詞function與停用字stop_word

jieba_tokenizer <- worker(user="Fengshen.traditional.dict",
                          stop_word = file.path (ROOT.DIR , "stop_words.txt"))
Fengshen_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
tokens <- Fengshen %>% unnest_tokens(word, text, token=Fengshen_tokenizer)
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame':    223870 obs. of  3 variables:
##  $ gutenberg_id: int  23910 23910 23910 23910 23910 23910 23910 23910 23910 23910 ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : chr  "第一回" "紂王" "女媧" "宮" ...
head(tokens, 20) # 全文斷詞結果
## # A tibble: 20 x 3
##    gutenberg_id chapter word  
##           <int>   <int> <chr> 
##  1        23910       1 第一回
##  2        23910       1 紂王  
##  3        23910       1 女媧  
##  4        23910       1 宮    
##  5        23910       1 進香  
##  6        23910       1 古風  
##  7        23910       1 一首  
##  8        23910       1 混沌  
##  9        23910       1 初分  
## 10        23910       1 盤古  
## 11        23910       1 先    
## 12        23910       1 太極  
## 13        23910       1 兩儀  
## 14        23910       1 四象  
## 15        23910       1 懸    
## 16        23910       1 子天  
## 17        23910       1 丑    
## 18        23910       1 人寅出
## 19        23910       1 避除  
## 20        23910       1 獸

重點一: 文集中角色人物-哪吒的描述

為了找出哪吒武器,將原本沒有區分出來的斷詞加入

new_user_word(jieba_tokenizer, c("兩根火尖槍", "乾坤圈", "混天綾", "九龍神火罩","陰陽劍"))
## [1] TRUE
chi_tokenizer <- function (t) {
  lapply(t, function(x){
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens) > 1]
    return(tokens)
  })
}
# 斷詞
tokens <- Fengshen %>%
  unnest_tokens(word, text, token = chi_tokenizer)
tokens
## # A tibble: 160,376 x 3
##    gutenberg_id chapter word  
##           <int>   <int> <chr> 
##  1        23910       1 第一回
##  2        23910       1 紂王  
##  3        23910       1 女媧  
##  4        23910       1 進香  
##  5        23910       1 古風  
##  6        23910       1 一首  
##  7        23910       1 混沌  
##  8        23910       1 初分  
##  9        23910       1 盤古  
## 10        23910       1 太極  
## # ... with 160,366 more rows

將哪吒武器的等同詞保留成變數
《封神演義》中武器是:兩根火尖槍、風火輪、乾坤圈、混天綾、金磚、九龍神火罩、陰陽劍,共八件兵器(因為封神中哪吒是三頭八臂)
但是原本封神演義斷詞字典中,對於哪吒武器並沒有特別區別出來

weapon_alias = c("金磚","風火輪","乾坤圈","混天綾","兩根火尖槍","九龍神火罩","陰陽劍")

計算哪吒武器在整本書中出現的頻率及比例,可以發現哪吒使用風火輪次數最多

weapon_count <-  tokens %>% 
  filter(nchar(.$word)>1) %>%                     # 保留非空白資料
  mutate(total = nrow(tokens)) %>%                # 加入一個total (資料總列數)   
  group_by(word) %>% 
  filter(word %in% weapon_alias) %>%              # 保留詞
  summarise(count = n(), total = mean(total)) %>% 
  mutate(proportion = count / total) %>%          # 加入比例欄位
  arrange(desc(count))                            # 根據count欄位,由大至小排列
head(weapon_count, 20)
## # A tibble: 6 x 4
##   word       count  total proportion
##   <chr>      <int>  <dbl>      <dbl>
## 1 風火輪        93 160376 0.000580  
## 2 乾坤圈        53 160376 0.000330  
## 3 金磚          13 160376 0.0000811 
## 4 混天綾        12 160376 0.0000748 
## 5 九龍神火罩    10 160376 0.0000624 
## 6 陰陽劍         1 160376 0.00000624

計算哪吒武器在各章節出現的總數,推斷哪吒出場抄傢伙最多的章節

weapon_chapter <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% weapon_alias) %>%
  group_by(chapter) %>%  
  summarise(count = n(), type = "words")%>%
  arrange(desc(count)) 
weapon_chapter
## # A tibble: 50 x 3
##    chapter count type 
##      <int> <int> <chr>
##  1      13    14 words
##  2      34    13 words
##  3      74    10 words
##  4      79    10 words
##  5      14     9 words
##  6      12     8 words
##  7      76     7 words
##  8      36     5 words
##  9      64     5 words
## 10      85     5 words
## # ... with 40 more rows

可以看出第13章及第34章總量最多
第十三章講哪吒年輕時連續闖禍,殺害龍王親人、石磯娘娘的弟子,把自己的父母師父都拖下水,一陣大亂鬥之後,
眼見四海龍王要來取他父母的性命,哪吒只好自殺,留下一縷幽魂,結束這場鬧劇。
第三十四章講成年的哪吒奉太乙真人之命,大亂鬥打敗韓榮、余化,救出黃飛虎、黃滾父子,並打通汜水關讓武成王黃飛虎一家,
可以奔走到西岐投靠周武王,增加周王朝的軍事力量。
從這兩章可以看到哪吒性格從讓父母頭痛、只會闖禍鬧事的流氓,蛻變為成熟並懂得運用自身的力量成就建國大業的神仙。

計算哪吒武器出現最多的章節統計圖

weapon_plot <-  tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% weapon_alias) %>%
  group_by(chapter) %>%  
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y=count)) +
  geom_col() + 
  ggtitle("各章節的哪吒武器出現總數") + 
  xlab("章節") + 
  ylab("哪吒武器數量") +
  theme(text = element_text(family = "Heiti TC Light"))
weapon_plot
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

重點二: 情緒分析

準備LIWC字典(utf8)

p<-read_file(file.path (ROOT.DIR , "liwc_positive.txt")) 
n<-read_file(file.path (ROOT.DIR , "liwc_negative.txt"))
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch, 20)
##        word sentiment
## 1      一流  positive
## 2  下定決心  positive
## 3  不拘小節  positive
## 4    不費力  positive
## 5      不錯  positive
## 6      主動  positive
## 7      乾杯  positive
## 8      乾淨  positive
## 9    了不起  positive
## 10     享受  positive
## 11     仁心  positive
## 12     仁愛  positive
## 13     仁慈  positive
## 14     仁義  positive
## 15     仁術  positive
## 16     仔細  positive
## 17     付出  positive
## 18     伴侶  positive
## 19     伶俐  positive
## 20     作品  positive

LIWC字典中有多少正面單詞和負面單詞

LIWC_ch %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <fct>     <int>
## 1 positive    664
## 2 negative   1047

以LIWC字典判斷文集中的word屬於正面字還是負面字 先將tokens斷詞後的文集過濾詞彙,只有一個字則不列入計算(也過濾掉空格)

all_fengshen_words <- tokens %>% filter(nchar(.$word)>1)

計算所有各詞彙的出現總數

all_word_count <- all_fengshen_words %>% group_by(word) %>% summarise(sum = n()) %>% arrange(desc(sum))

計算所有字在文集中出現的總數

word_count <- all_word_count %>%
  select(word,sum) %>% 
  group_by(word) %>% 
  summarise(sum = sum(sum)) %>%
  filter(sum>3)
print(word_count)
## # A tibble: 7,339 x 2
##    word         sum
##    <chr>      <int>
##  1 一一          13
##  2 一二          10
##  3 一二日         6
##  4 一人          61
##  5 一刀          56
##  6 一丈           4
##  7 一下          18
##  8 一千           4
##  9 一千五百年     6
## 10 一口          44
## # ... with 7,329 more rows

與LIWC情緒字典join,文集中的字出現在LIWC字典中是屬於positive還是negative

liwch_ch_word_counts<-word_count %>% inner_join(LIWC_ch)
print(liwch_ch_word_counts)
## # A tibble: 187 x 3
##    word    sum sentiment
##    <chr> <int> <fct>    
##  1 八卦     62 negative 
##  2 大叫    195 negative 
##  3 大笑     46 positive 
##  4 大膽     16 positive 
##  5 不仁     11 negative 
##  6 不平     10 negative 
##  7 不好     68 negative 
##  8 不安     32 negative 
##  9 不利      6 negative 
## 10 不足     33 negative 
## # ... with 177 more rows
liwch_ch_word_counts %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <fct>     <int>
## 1 positive     73
## 2 negative    114

繪圖出以LIWC字典統計的文集情緒字數,觀察兩種情緒值的差異

liwch_ch_word_counts %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = 'Heiti TC Light'))+ #mac要設定字體才能顯示
  coord_flip()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

重點三: 情緒分析(續)

準備NTUSD字典(utf8)

pn<-read_file(file.path (ROOT.DIR , "ntusd_positive.txt"))
nn<-read_file(file.path (ROOT.DIR , "ntusd_negative.txt"))
ps<-strsplit(pn, "[\n]")[[1]]
positive_ntusd<-strsplit(ps, "[\r]")
ns<-strsplit(nn, "[\n]")[[1]]
negative_ntusd<-strsplit(ns, "[\r]")
# 用unlist拆分list後重構矩陣然後轉換為dataframe
positive_ntusd<-data.frame(matrix(unlist(positive_ntusd), nrow=2812, ncol=1, byrow=F),sentiments="positive", stringsAsFactors=FALSE)
colnames(positive_ntusd)<-c("word", "sentiment")
negative_ntusd<-data.frame(matrix(unlist(negative_ntusd), nrow=8276, ncol=1, byrow=F), sentiments="negative", stringsAsFactors = FALSE)
colnames(negative_ntusd)<-c("word","sentiment")
NTUSD_ch<-rbind(positive_ntusd, negative_ntusd)
head(NTUSD_ch, 20)
##          word sentiment
## 1    一帆風順  positive
## 2  一帆風順的  positive
## 3        一流  positive
## 4        一致  positive
## 5      一致的  positive
## 6      了不起  positive
## 7    了不起的  positive
## 8        了解  positive
## 9        人性  positive
## 10     人性的  positive
## 11   人格高尚  positive
## 12 人格高尚的  positive
## 13       人情  positive
## 14     人情味  positive
## 15       入神  positive
## 16     入神的  positive
## 17       入迷  positive
## 18     入迷的  positive
## 19       上好  positive
## 20     上好的  positive

NTUSD字典中有多少正面單詞和負面單詞

NTUSD_ch %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   8276
## 2 positive   2812

以NTUSD字典判斷文集中的word屬於正面字還是負面字
先將tokens斷詞後的文本過濾詞彙,只有一個字則不列入計算(也過濾掉空格)

all_fengshen_words_ntusd <- tokens %>% filter(nchar(.$word)>1)

計算所有各詞彙的出現總數

all_word_count_ntusd <- all_fengshen_words_ntusd %>% group_by(word) %>% summarise(sum = n()) %>% arrange(desc(sum))

計算所有字在文集中出現的總數

word_count_ntusd <- all_word_count_ntusd %>%
  select(word,sum) %>% 
  group_by(word) %>% 
  summarise(sum = sum(sum)) %>%
  filter(sum>3)
print(word_count_ntusd)
## # A tibble: 7,339 x 2
##    word         sum
##    <chr>      <int>
##  1 一一          13
##  2 一二          10
##  3 一二日         6
##  4 一人          61
##  5 一刀          56
##  6 一丈           4
##  7 一下          18
##  8 一千           4
##  9 一千五百年     6
## 10 一口          44
## # ... with 7,329 more rows

與NTUSD情緒字典join,文集中的字出現在NTUSD字典中是屬於positive還是negative

ntusd_ch_word_counts<-word_count_ntusd %>% inner_join(NTUSD_ch)

繪圖出以NTUSD字典統計的文集情緒字數,觀察兩種情緒值的差異

ntusd_ch_word_counts %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = 'Heiti TC Light'))+
  coord_flip()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

重點四: 文字雲

計算詞彙的出現次數,如果詞彙只有一個字則不列入計算

tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))
head(tokens_count, 20)
## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 紂王     913
##  2 哪吒     896
##  3 子牙     803
##  4 楊戩     730
##  5 武王     588
##  6 今日     564
##  7 黃飛虎   449
##  8 妲己     424
##  9 諸侯     412
## 10 土行孫   406
## 11 陛下     399
## 12 聞太師   389
## 13 弟子     366
## 14 朝歌     363
## 15 將軍     356
## 16 元帥     342
## 17 太師     328
## 18 天子     319
## 19 人馬     311
## 20 道人     310

文字雲分布詞彙 (全文集)

tokens_count %>% wordcloud2()

繪圖出《封神演義》各章節長度,以語句數來計算

fengshen_plot <- bind_rows(Fengshen %>% group_by(chapter) %>% 
                             summarise(count = n(), type="sentences"), tokens %>% 
                             group_by(chapter) %>% 
                             summarise(count = n(), type="words")) %>% 
  group_by(type) %>% 
  ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) + 
  geom_line() + 
  ggtitle("各章節的句子總數") + xlab("章節") + ylab("句子數量")+
  theme(text = element_text(family = 'Heiti TC Light')) 
print(fengshen_plot)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

計算前五十回和後五十回的詞彙在全文中出現比率的差異

frequency1 <- tokens %>% mutate(part = ifelse(chapter<=50, "First50", "Last50")) %>%
  filter(nchar(.$word)>1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "[^一二三四五六七八九十]+"))
attach(frequency1)
frequency2 <- frequency1 %>% 
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>% 
  gather(part, proportion, `Last50`)
# 匯出圖表
ggplot(frequency2, aes(x = proportion, y = `First50`, color = abs(`First50` - proportion))) +
  geom_abline(color = "gray50", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5,family = 'Heiti TC Light') +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "Firs50", x = "Last50")

重點五: 比較前五十回和後五十回最常出現的前100個詞彙的差異

often_words <- tokens %>% mutate(part = ifelse(chapter<=50, "First50", "Last50")) %>% filter(nchar(.$word)>1)
first_50<-as_tibble(head(often_words, 80223))
last_50<-as_tibble(tail(often_words, 86764))

計算詞彙的出現次數,如果詞彙只有一個字則不列入計算

first50_count <- first_50 %>% filter(nchar(.$word)>1) %>% group_by(word) %>% summarise(sum = n()) %>% filter(sum>10) %>% arrange(desc(sum))
last50_count <- last_50 %>% filter(nchar(.$word)>1) %>% group_by(word) %>% summarise(sum = n()) %>% filter(sum>10) %>% arrange(desc(sum))
print(first50_count);print(last50_count)
## # A tibble: 1,207 x 2
##    word     sum
##    <chr>  <int>
##  1 紂王     570
##  2 哪吒     451
##  3 聞太師   379
##  4 妲己     356
##  5 太師     328
##  6 子牙     280
##  7 陛下     274
##  8 黃飛虎   247
##  9 今日     240
## 10 文王     236
## # ... with 1,197 more rows
## # A tibble: 1,351 x 2
##    word     sum
##    <chr>  <int>
##  1 楊戩     644
##  2 子牙     550
##  3 哪吒     476
##  4 武王     473
##  5 土行孫   406
##  6 紂王     353
##  7 今日     351
##  8 元帥     322
##  9 諸侯     293
## 10 弟子     280
## # ... with 1,341 more rows

視覺化長條圖

first50_bar <- first50_count %>% filter(sum>100) %>% mutate(word=reorder(word, sum)) %>% 
  ggplot(aes(word, sum),nrow=30 ) +
  geom_col() + 
  xlab(NULL) +
  theme(text = element_text(family = 'Heiti TC Light'),
        axis.text.y = element_text(size = 8))+ 
  coord_flip()
last50_bar <- last50_count %>% 
  filter(sum>100) %>% mutate(word=reorder(word, sum)) %>% 
  ggplot(aes(word, sum), nrow=30 ) + 
  geom_col() + 
  xlab(NULL)  +
  theme(text = element_text(size =14,family = 'Heiti TC Light'),
        axis.text.y = element_text(size = 5))+ 
  coord_flip()
# 將兩張ggplot圖合併為一張圖
library(ggpubr)
ggarrange(first50_bar, last50_bar, widths = c(3,4),heights =10)

#library(plyr)
#library(Rmisc)
#multiplot(first50_bar, last50_bar, cols=2)

重點六:道教詞彙整理

# 下載 "封神演義" ,並將text欄位為空的行清除,以及將重複的語句清除
fengshen_text <- gutenberg_download(23910) %>% 
  filter(text != "") %>% distinct(gutenberg_id, text)

章節

fengshen_text <- fengshen_text %>% 
  mutate(chapter = cumsum(str_detect(fengshen_text$text, regex("第.*回(\u00A0|$)"))))

斷詞

jieba_tokenizer <- worker(user = "Fengshen.traditional.dict", stop_word =file.path (ROOT.DIR , "stop_words.txt"))

fengshen_tokenizer <- function(t){
  lapply(t, function(x){
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
tokens <- fengshen_text %>% unnest_tokens(word, text, token = fengshen_tokenizer)
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame':    223870 obs. of  3 variables:
##  $ gutenberg_id: int  23910 23910 23910 23910 23910 23910 23910 23910 23910 23910 ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : chr  "第一回" "紂王" "女媧" "宮" ...
#計算每一章節出現字數大於1的詞彙出現的總次數
tokens_count_chapter <- tokens %>% 
  filter(nchar(.$word)>1) %>% 
  group_by(chapter,word) %>% 
  dplyr::summarise(sum = n()) %>%
  arrange(chapter) 
tokens_count_chapter <- top_n(tokens_count_chapter,3) #只選出前三名的字彙
## Selecting by sum
tokens_count_chapter
## # A tibble: 315 x 3
## # Groups:   chapter [100]
##    chapter word     sum
##      <int> <chr>  <int>
##  1       1 女媧      11
##  2       1 天子      11
##  3       1 紂王      19
##  4       1 諸侯      11
##  5       2 陛下      23
##  6       2 諸侯      24
##  7       2 蘇護      45
##  8       3 崇黑虎    24
##  9       3 鄭倫      24
## 10       3 蘇護      29
## # ... with 305 more rows

結果可以發現第一章回出現最多的是女媧、天子、紂王、諸侯,推斷可能第一回在講女媧和紂王的故事 到 https://zh.wikisource.org/zh-hant/%E5%B0%81%E7%A5%9E%E6%BC%94%E7%BE%A9 可以看到第一回標題是:紂王女媧宮進香

繪圖

#繪製散布圖X軸是章節,Y軸是詞彙出現的次數
ggplot(tokens_count_chapter, aes(x = chapter, y = sum,  color = abs(sum))) +  
  geom_jitter(alpha = 0.1, size = 5, width = 0.5, height = 0.5) +  
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +  
  scale_y_log10()+
  ggtitle("各章節的出現前三多次數的字") + 
  ylab(label="出現次數")+  
  xlab(label="回數") +
  theme(text = element_text(family = "Heiti TC Light"))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

結果可以看出第一回是由女媧作為故事的開頭,最後則是以武王作為故事的結束,

也可以發現各個角色大概出現的章回在哪

封神演義內容包含了許多道教的元素 分析出現最多的道教詞彙

#匯入道教詞語辭典
#fileEncoding ="UTF-8":使用R讀取文字檔時, 有時會遇到資料匯入有錯誤訊息或中文亂碼問題
taoism <- read.table(file.path (ROOT.DIR , "taoism.txt"),sep = "\n" ,header = FALSE, fileEncoding ="UTF-8")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec
## = dec, : 輸入連結 'C:/Users/Sean/Documents/20200318_bookclub_1/w4_hw//
## taoism.txt' 中的輸入不正確
taoism_count <- tokens %>% 
  filter(nchar(.$word) > 1) %>% 
  mutate(total = nrow(tokens)) %>% 
  group_by(word) %>% 
  filter(word %in% taoism$V1) %>% 
  summarise(count = n(), total = mean(total)) %>%
  mutate(proportion = count / total) %>% 
  arrange(desc(count))
head(taoism_count, 20)
## # A tibble: 20 x 4
##    word     count  total proportion
##    <chr>    <int>  <dbl>      <dbl>
##  1 廣成子     200 223870 0.000893  
##  2 道術        87 223870 0.000389  
##  3 八卦        62 223870 0.000277  
##  4 丹藥        49 223870 0.000219  
##  5 九宮        10 223870 0.0000447 
##  6 道服        10 223870 0.0000447 
##  7 甲子         9 223870 0.0000402 
##  8 二十八宿     5 223870 0.0000223 
##  9 煉丹         5 223870 0.0000223 
## 10 丁卯         3 223870 0.0000134 
## 11 丙子         3 223870 0.0000134 
## 12 符籙         3 223870 0.0000134 
## 13 丹田         2 223870 0.00000893
## 14 后土         2 223870 0.00000893
## 15 坎離         2 223870 0.00000893
## 16 步罡踏斗     2 223870 0.00000893
## 17 飛昇         2 223870 0.00000893
## 18 交梨火棗     1 223870 0.00000447
## 19 金鼎         1 223870 0.00000447
## 20 重樓         1 223870 0.00000447
結果可以發現出現最多的道教詞彙為“廣成子”,廣成子為中國道教傳說的神仙
推斷封神演義內除了有許多歷史人物外,廣成子是代表道教的重要角色

計算道教詞彙在各章節出現的總數,推斷那些章節可能在講道教的故事

taoism_chapter <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% taoism$V1) %>%
  group_by(chapter) %>%  
  summarise(count = n(), type = "words")%>%
  arrange(desc(count)) 
taoism_chapter
## # A tibble: 76 x 3
##    chapter count type 
##      <int> <int> <chr>
##  1      72    57 words
##  2      65    30 words
##  3      64    21 words
##  4      77    19 words
##  5      46    17 words
##  6      63    14 words
##  7      75    14 words
##  8      47    13 words
##  9      81    13 words
## 10      50    12 words
## # ... with 66 more rows

結果可以看出在72回是講到最多道教相關詞彙的章節

#計算道教詞彙出現在各章節的長條圖
taoism_plot = tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% taoism$V1) %>%
  group_by(chapter) %>%  
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y=count)) +
  geom_col() + 
  ggtitle("各章節的道教詞彙出現總數") + 
  xlab("章節") + 
  ylab("道教詞彙數量")+
  theme(text = element_text(family = "Heiti TC Light"))
taoism_plot
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

結果可以看出在75回前後有講到大量的道教相關詞彙