第九組_讀書會討論

組員:
B054020042 郭宗翰 B064020014 鄭子婷
M084020023 陳靖中 M084020046 葉君良
N074220002 陳柏翔 N074220022 黃姿榕
M084810010 吳曼瑄

主題：封神演義

載入套件

# 分析文集為《封神演義》-作者:陳仲琳
# 載入packages
require(dplyr)

## Warning: As of rlang 0.4.0, dplyr must be at least version 0.8.0.
## * dplyr 0.7.6 is too old for rlang 0.4.5.
## * Please update dplyr to the latest version.
## * Updating packages on Windows requires precautions:
##   <https://github.com/jennybc/what-they-forgot/issues/62>

require(tidytext)

## Warning: package 'tidytext' was built under R version 3.5.3

require(jiebaR)

## Warning: package 'jiebaR' was built under R version 3.5.3

## Warning: package 'jiebaRD' was built under R version 3.5.3

library(stringr)
library(wordcloud2)

## Warning: package 'wordcloud2' was built under R version 3.5.3

library(ggplot2)
library(tidyr)

## Warning: package 'tidyr' was built under R version 3.5.3

library(scales)
library(jiebaR)
library(readr)
library(devtools)

## Warning: package 'devtools' was built under R version 3.5.3

## Warning: package 'usethis' was built under R version 3.5.3

library(stringi)
library(pbapply)

## Warning: package 'pbapply' was built under R version 3.5.3

library(Rcpp)
library(RcppProgress)

## Warning: package 'RcppProgress' was built under R version 3.5.3

library(cidian)
# library(ropencc)

#設定讀取資料路徑
ROOT.DIR<- 'C:/Users/Sean/Documents/20200318_bookclub_1/w4_hw/'

古騰堡網站下載《封神演義》繁體文集

library(gutenbergr)

## Warning: package 'gutenbergr' was built under R version 3.5.3

Fengshen <- gutenberg_download(23910) %>% filter(text!="") %>% distinct(gutenberg_id, text)

## Warning: `list_len()` is deprecated as of rlang 0.2.0.
## Please use `new_list()` instead.
## This warning is displayed once per session.

## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.

## Warning: `env_bind_fns()` is deprecated as of rlang 0.3.0.
## Please use `env_bind_active()` instead.
## This warning is displayed once per session.

## Warning: `new_overscope()` is deprecated as of rlang 0.2.0.
## Please use `new_data_mask()` instead.
## This warning is displayed once per session.

## Warning: `overscope_eval_next()` is deprecated as of rlang 0.2.0.
## Please use `eval_tidy()` with a data mask instead.
## This warning is displayed once per session.

## Warning: `overscope_clean()` is deprecated as of rlang 0.2.0.
## This warning is displayed once per session.

Fengshen

## # A tibble: 3,011 x 2
##    gutenberg_id text                                                      
##           <int> <chr>                                                     
##  1        23910 第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香                                  
##  2        23910 古風一首：                                                
##  3        23910 　　混沌初分盤古先，太極兩儀四象懸。子天丑地人寅出，避除獸患有巢賢。~
##  4        23910 　　燧人取火免鮮食，伏羲畫卦陰陽前。神農治世嚐百草，軒轅禮樂婚姻聯。~
##  5        23910 　　少昊五帝民物阜，禹王治水洪波蠲。承平享國至四百，桀王無道乾坤顛，~
##  6        23910 　　日縱妹喜荒酒色，成湯造亳洗腥羶，放桀南巢拯暴虐，雲霓如願後蘇全。~
##  7        23910 　　三十一世傳殷紂，商家脈絡如斷弦：紊亂朝綱絕倫紀，殺妻誅子信讒言，~
##  8        23910 　　穢污宮闈寵妲己，蠆盆炮烙忠貞冤，鹿臺聚斂萬姓苦，愁聲怨氣應障天，~
##  9        23910 　　直諫剖心盡焚炙，孕婦刳剔朝涉殲，崇信姦回棄朝政，屏逐師保性何偏，~
## 10        23910 　　郊社不修宗廟廢，奇技淫巧盡心研，昵此罪人乃罔畏，沉酗肆虐如鸇鳶。~
## # ... with 3,001 more rows

使用正規表示式，將句子區分章節並斷出共1~100章回

Fengshen <- Fengshen %>% mutate(chapter = cumsum(str_detect(Fengshen$text, regex("第.*回(\u00a0|$)"))))
# 文集已經完成斷句了
head(Fengshen, 20)

## # A tibble: 20 x 3
##    gutenberg_id text                                               chapter
##           <int> <chr>                                                <int>
##  1        23910 第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香                                 1
##  2        23910 古風一首：                                               1
##  3        23910 　　混沌初分盤古先，太極兩儀四象懸。子天丑地人寅出，避除獸患有巢賢。~       1
##  4        23910 　　燧人取火免鮮食，伏羲畫卦陰陽前。神農治世嚐百草，軒轅禮樂婚姻聯。~       1
##  5        23910 　　少昊五帝民物阜，禹王治水洪波蠲。承平享國至四百，桀王無道乾坤顛，~       1
##  6        23910 　　日縱妹喜荒酒色，成湯造亳洗腥羶，放桀南巢拯暴虐，雲霓如願後蘇全。~       1
##  7        23910 　　三十一世傳殷紂，商家脈絡如斷弦：紊亂朝綱絕倫紀，殺妻誅子信讒言，~       1
##  8        23910 　　穢污宮闈寵妲己，蠆盆炮烙忠貞冤，鹿臺聚斂萬姓苦，愁聲怨氣應障天，~       1
##  9        23910 　　直諫剖心盡焚炙，孕婦刳剔朝涉殲，崇信姦回棄朝政，屏逐師保性何偏，~       1
## 10        23910 　　郊社不修宗廟廢，奇技淫巧盡心研，昵此罪人乃罔畏，沉酗肆虐如鸇鳶。~       1
## 11        23910 　　西伯朝商囚羑里，微子抱器走風湮。皇天震怒降災毒，若涉大海無淵邊。~       1
## 12        23910 　　天下荒荒萬民怨，子牙出世人中仙，終日垂絲釣人主，飛熊入夢獵岐田，~       1
## 13        23910 　　共載歸周輔朝政，三分有二日相沿。文考末集大勳沒，武王善述日乾乾。~       1
## 14        23910 　　孟津大會八百國，取彼凶殘伐罪愆。甲子昧爽會牧野，前徒倒戈反回旋。~       1
## 15        23910 　　若崩厥角齊稽首，血流漂杵脂如泉。戒衣甫著天下定，更於成湯增光妍。~       1
## 16        23910 　　牧馬華山示偃武，開我周家八百年。太白旗懸獨夫死，戰亡將士幽魂潛。~       1
## 17        23910 　　天挺人賢號尚父，封神壇上列花箋，大小英靈尊位次，商周演義古今傳。~       1
## 18        23910 　　成湯乃黃帝之後也，姓子氏。初，帝嚳次妃簡狄祈於高禖，有玄鳥之祥，        遂生契。契事唐~       1
## 19        23910 　　太戊　　仲丁　　外壬　　河亶甲　祖乙　　祖辛         1
## 20        23910 　　沃甲　　祖丁　　南庚　　陽甲　　盤庚　　小辛         1

文集斷詞

設定斷詞function與停用字stop_word

jieba_tokenizer <- worker(user="Fengshen.traditional.dict",
                          stop_word = file.path (ROOT.DIR , "stop_words.txt"))
Fengshen_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
tokens <- Fengshen %>% unnest_tokens(word, text, token=Fengshen_tokenizer)
str(tokens)

## Classes 'tbl_df', 'tbl' and 'data.frame':    223870 obs. of  3 variables:
##  $ gutenberg_id: int  23910 23910 23910 23910 23910 23910 23910 23910 23910 23910 ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : chr  "第一回" "紂王" "女媧" "宮" ...

head(tokens, 20) # 全文斷詞結果

## # A tibble: 20 x 3
##    gutenberg_id chapter word  
##           <int>   <int> <chr> 
##  1        23910       1 第一回
##  2        23910       1 紂王  
##  3        23910       1 女媧  
##  4        23910       1 宮    
##  5        23910       1 進香  
##  6        23910       1 古風  
##  7        23910       1 一首  
##  8        23910       1 混沌  
##  9        23910       1 初分  
## 10        23910       1 盤古  
## 11        23910       1 先    
## 12        23910       1 太極  
## 13        23910       1 兩儀  
## 14        23910       1 四象  
## 15        23910       1 懸    
## 16        23910       1 子天  
## 17        23910       1 丑    
## 18        23910       1 人寅出
## 19        23910       1 避除  
## 20        23910       1 獸

重點一: 文集中角色人物-哪吒的描述

為了找出哪吒武器，將原本沒有區分出來的斷詞加入

new_user_word(jieba_tokenizer, c("兩根火尖槍", "乾坤圈", "混天綾", "九龍神火罩","陰陽劍"))

## [1] TRUE

chi_tokenizer <- function (t) {
  lapply(t, function(x){
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens) > 1]
    return(tokens)
  })
}
# 斷詞
tokens <- Fengshen %>%
  unnest_tokens(word, text, token = chi_tokenizer)
tokens

## # A tibble: 160,376 x 3
##    gutenberg_id chapter word  
##           <int>   <int> <chr> 
##  1        23910       1 第一回
##  2        23910       1 紂王  
##  3        23910       1 女媧  
##  4        23910       1 進香  
##  5        23910       1 古風  
##  6        23910       1 一首  
##  7        23910       1 混沌  
##  8        23910       1 初分  
##  9        23910       1 盤古  
## 10        23910       1 太極  
## # ... with 160,366 more rows

將哪吒武器的等同詞保留成變數
《封神演義》中武器是：兩根火尖槍、風火輪、乾坤圈、混天綾、金磚、九龍神火罩、陰陽劍，共八件兵器（因為封神中哪吒是三頭八臂)
但是原本封神演義斷詞字典中，對於哪吒武器並沒有特別區別出來

weapon_alias = c("金磚","風火輪","乾坤圈","混天綾","兩根火尖槍","九龍神火罩","陰陽劍")

計算哪吒武器在整本書中出現的頻率及比例，可以發現哪吒使用風火輪次數最多

weapon_count <-  tokens %>% 
  filter(nchar(.$word)>1) %>%                     # 保留非空白資料
  mutate(total = nrow(tokens)) %>%                # 加入一個total (資料總列數)   
  group_by(word) %>% 
  filter(word %in% weapon_alias) %>%              # 保留詞
  summarise(count = n(), total = mean(total)) %>% 
  mutate(proportion = count / total) %>%          # 加入比例欄位
  arrange(desc(count))                            # 根據count欄位，由大至小排列
head(weapon_count, 20)

## # A tibble: 6 x 4
##   word       count  total proportion
##   <chr>      <int>  <dbl>      <dbl>
## 1 風火輪        93 160376 0.000580  
## 2 乾坤圈        53 160376 0.000330  
## 3 金磚          13 160376 0.0000811 
## 4 混天綾        12 160376 0.0000748 
## 5 九龍神火罩    10 160376 0.0000624 
## 6 陰陽劍         1 160376 0.00000624

計算哪吒武器在各章節出現的總數，推斷哪吒出場抄傢伙最多的章節

weapon_chapter <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% weapon_alias) %>%
  group_by(chapter) %>%  
  summarise(count = n(), type = "words")%>%
  arrange(desc(count)) 
weapon_chapter

## # A tibble: 50 x 3
##    chapter count type 
##      <int> <int> <chr>
##  1      13    14 words
##  2      34    13 words
##  3      74    10 words
##  4      79    10 words
##  5      14     9 words
##  6      12     8 words
##  7      76     7 words
##  8      36     5 words
##  9      64     5 words
## 10      85     5 words
## # ... with 40 more rows

可以看出第13章及第34章總量最多
第十三章講哪吒年輕時連續闖禍，殺害龍王親人、石磯娘娘的弟子，把自己的父母師父都拖下水，一陣大亂鬥之後，
眼見四海龍王要來取他父母的性命，哪吒只好自殺，留下一縷幽魂，結束這場鬧劇。
第三十四章講成年的哪吒奉太乙真人之命，大亂鬥打敗韓榮、余化，救出黃飛虎、黃滾父子，並打通汜水關讓武成王黃飛虎一家，
可以奔走到西岐投靠周武王，增加周王朝的軍事力量。
從這兩章可以看到哪吒性格從讓父母頭痛、只會闖禍鬧事的流氓，蛻變為成熟並懂得運用自身的力量成就建國大業的神仙。

計算哪吒武器出現最多的章節統計圖

weapon_plot <-  tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% weapon_alias) %>%
  group_by(chapter) %>%  
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y=count)) +
  geom_col() + 
  ggtitle("各章節的哪吒武器出現總數") + 
  xlab("章節") + 
  ylab("哪吒武器數量") +
  theme(text = element_text(family = "Heiti TC Light"))
weapon_plot

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

重點二: 情緒分析

準備LIWC字典(utf8)

p<-read_file(file.path (ROOT.DIR , "liwc_positive.txt")) 
n<-read_file(file.path (ROOT.DIR , "liwc_negative.txt"))
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch, 20)

##        word sentiment
## 1      一流  positive
## 2  下定決心  positive
## 3  不拘小節  positive
## 4    不費力  positive
## 5      不錯  positive
## 6      主動  positive
## 7      乾杯  positive
## 8      乾淨  positive
## 9    了不起  positive
## 10     享受  positive
## 11     仁心  positive
## 12     仁愛  positive
## 13     仁慈  positive
## 14     仁義  positive
## 15     仁術  positive
## 16     仔細  positive
## 17     付出  positive
## 18     伴侶  positive
## 19     伶俐  positive
## 20     作品  positive

LIWC字典中有多少正面單詞和負面單詞

LIWC_ch %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)

## # A tibble: 2 x 2
##   sentiment     n
##   <fct>     <int>
## 1 positive    664
## 2 negative   1047

以LIWC字典判斷文集中的word屬於正面字還是負面字先將tokens斷詞後的文集過濾詞彙，只有一個字則不列入計算(也過濾掉空格)

all_fengshen_words <- tokens %>% filter(nchar(.$word)>1)

計算所有各詞彙的出現總數

all_word_count <- all_fengshen_words %>% group_by(word) %>% summarise(sum = n()) %>% arrange(desc(sum))

計算所有字在文集中出現的總數

word_count <- all_word_count %>%
  select(word,sum) %>% 
  group_by(word) %>% 
  summarise(sum = sum(sum)) %>%
  filter(sum>3)
print(word_count)

## # A tibble: 7,339 x 2
##    word         sum
##    <chr>      <int>
##  1 一一          13
##  2 一二          10
##  3 一二日         6
##  4 一人          61
##  5 一刀          56
##  6 一丈           4
##  7 一下          18
##  8 一千           4
##  9 一千五百年     6
## 10 一口          44
## # ... with 7,329 more rows

與LIWC情緒字典join，文集中的字出現在LIWC字典中是屬於positive還是negative

liwch_ch_word_counts<-word_count %>% inner_join(LIWC_ch)
print(liwch_ch_word_counts)

## # A tibble: 187 x 3
##    word    sum sentiment
##    <chr> <int> <fct>    
##  1 八卦     62 negative 
##  2 大叫    195 negative 
##  3 大笑     46 positive 
##  4 大膽     16 positive 
##  5 不仁     11 negative 
##  6 不平     10 negative 
##  7 不好     68 negative 
##  8 不安     32 negative 
##  9 不利      6 negative 
## 10 不足     33 negative 
## # ... with 177 more rows

liwch_ch_word_counts %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)

## # A tibble: 2 x 2
##   sentiment     n
##   <fct>     <int>
## 1 positive     73
## 2 negative    114

繪圖出以LIWC字典統計的文集情緒字數，觀察兩種情緒值的差異

liwch_ch_word_counts %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = 'Heiti TC Light'))+ #mac要設定字體才能顯示
  coord_flip()

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

重點三: 情緒分析(續)

準備NTUSD字典(utf8)

pn<-read_file(file.path (ROOT.DIR , "ntusd_positive.txt"))
nn<-read_file(file.path (ROOT.DIR , "ntusd_negative.txt"))
ps<-strsplit(pn, "[\n]")[[1]]
positive_ntusd<-strsplit(ps, "[\r]")
ns<-strsplit(nn, "[\n]")[[1]]
negative_ntusd<-strsplit(ns, "[\r]")
# 用unlist拆分list後重構矩陣然後轉換為dataframe
positive_ntusd<-data.frame(matrix(unlist(positive_ntusd), nrow=2812, ncol=1, byrow=F),sentiments="positive", stringsAsFactors=FALSE)
colnames(positive_ntusd)<-c("word", "sentiment")
negative_ntusd<-data.frame(matrix(unlist(negative_ntusd), nrow=8276, ncol=1, byrow=F), sentiments="negative", stringsAsFactors = FALSE)
colnames(negative_ntusd)<-c("word","sentiment")
NTUSD_ch<-rbind(positive_ntusd, negative_ntusd)
head(NTUSD_ch, 20)

##          word sentiment
## 1    一帆風順  positive
## 2  一帆風順的  positive
## 3        一流  positive
## 4        一致  positive
## 5      一致的  positive
## 6      了不起  positive
## 7    了不起的  positive
## 8        了解  positive
## 9        人性  positive
## 10     人性的  positive
## 11   人格高尚  positive
## 12 人格高尚的  positive
## 13       人情  positive
## 14     人情味  positive
## 15       入神  positive
## 16     入神的  positive
## 17       入迷  positive
## 18     入迷的  positive
## 19       上好  positive
## 20     上好的  positive

NTUSD字典中有多少正面單詞和負面單詞

NTUSD_ch %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)

## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   8276
## 2 positive   2812

以NTUSD字典判斷文集中的word屬於正面字還是負面字
先將tokens斷詞後的文本過濾詞彙，只有一個字則不列入計算(也過濾掉空格)

all_fengshen_words_ntusd <- tokens %>% filter(nchar(.$word)>1)

計算所有各詞彙的出現總數

all_word_count_ntusd <- all_fengshen_words_ntusd %>% group_by(word) %>% summarise(sum = n()) %>% arrange(desc(sum))

計算所有字在文集中出現的總數

word_count_ntusd <- all_word_count_ntusd %>%
  select(word,sum) %>% 
  group_by(word) %>% 
  summarise(sum = sum(sum)) %>%
  filter(sum>3)
print(word_count_ntusd)

## # A tibble: 7,339 x 2
##    word         sum
##    <chr>      <int>
##  1 一一          13
##  2 一二          10
##  3 一二日         6
##  4 一人          61
##  5 一刀          56
##  6 一丈           4
##  7 一下          18
##  8 一千           4
##  9 一千五百年     6
## 10 一口          44
## # ... with 7,329 more rows

與NTUSD情緒字典join，文集中的字出現在NTUSD字典中是屬於positive還是negative

ntusd_ch_word_counts<-word_count_ntusd %>% inner_join(NTUSD_ch)

繪圖出以NTUSD字典統計的文集情緒字數，觀察兩種情緒值的差異

ntusd_ch_word_counts %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = 'Heiti TC Light'))+
  coord_flip()

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

重點四: 文字雲

計算詞彙的出現次數，如果詞彙只有一個字則不列入計算

tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))
head(tokens_count, 20)

## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 紂王     913
##  2 哪吒     896
##  3 子牙     803
##  4 楊戩     730
##  5 武王     588
##  6 今日     564
##  7 黃飛虎   449
##  8 妲己     424
##  9 諸侯     412
## 10 土行孫   406
## 11 陛下     399
## 12 聞太師   389
## 13 弟子     366
## 14 朝歌     363
## 15 將軍     356
## 16 元帥     342
## 17 太師     328
## 18 天子     319
## 19 人馬     311
## 20 道人     310

文字雲分布詞彙 (全文集)

tokens_count %>% wordcloud2()

繪圖出《封神演義》各章節長度，以語句數來計算

fengshen_plot <- bind_rows(Fengshen %>% group_by(chapter) %>% 
                             summarise(count = n(), type="sentences"), tokens %>% 
                             group_by(chapter) %>% 
                             summarise(count = n(), type="words")) %>% 
  group_by(type) %>% 
  ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) + 
  geom_line() + 
  ggtitle("各章節的句子總數") + xlab("章節") + ylab("句子數量")+
  theme(text = element_text(family = 'Heiti TC Light')) 
print(fengshen_plot)

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

計算前五十回和後五十回的詞彙在全文中出現比率的差異

frequency1 <- tokens %>% mutate(part = ifelse(chapter<=50, "First50", "Last50")) %>%
  filter(nchar(.$word)>1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "[^一二三四五六七八九十]+"))
attach(frequency1)
frequency2 <- frequency1 %>% 
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>% 
  gather(part, proportion, `Last50`)
# 匯出圖表
ggplot(frequency2, aes(x = proportion, y = `First50`, color = abs(`First50` - proportion))) +
  geom_abline(color = "gray50", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5,family = 'Heiti TC Light') +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "Firs50", x = "Last50")

重點五: 比較前五十回和後五十回最常出現的前100個詞彙的差異

often_words <- tokens %>% mutate(part = ifelse(chapter<=50, "First50", "Last50")) %>% filter(nchar(.$word)>1)
first_50<-as_tibble(head(often_words, 80223))
last_50<-as_tibble(tail(often_words, 86764))

計算詞彙的出現次數，如果詞彙只有一個字則不列入計算

first50_count <- first_50 %>% filter(nchar(.$word)>1) %>% group_by(word) %>% summarise(sum = n()) %>% filter(sum>10) %>% arrange(desc(sum))
last50_count <- last_50 %>% filter(nchar(.$word)>1) %>% group_by(word) %>% summarise(sum = n()) %>% filter(sum>10) %>% arrange(desc(sum))
print(first50_count);print(last50_count)

## # A tibble: 1,207 x 2
##    word     sum
##    <chr>  <int>
##  1 紂王     570
##  2 哪吒     451
##  3 聞太師   379
##  4 妲己     356
##  5 太師     328
##  6 子牙     280
##  7 陛下     274
##  8 黃飛虎   247
##  9 今日     240
## 10 文王     236
## # ... with 1,197 more rows

## # A tibble: 1,351 x 2
##    word     sum
##    <chr>  <int>
##  1 楊戩     644
##  2 子牙     550
##  3 哪吒     476
##  4 武王     473
##  5 土行孫   406
##  6 紂王     353
##  7 今日     351
##  8 元帥     322
##  9 諸侯     293
## 10 弟子     280
## # ... with 1,341 more rows

視覺化長條圖

first50_bar <- first50_count %>% filter(sum>100) %>% mutate(word=reorder(word, sum)) %>% 
  ggplot(aes(word, sum),nrow=30 ) +
  geom_col() + 
  xlab(NULL) +
  theme(text = element_text(family = 'Heiti TC Light'),
        axis.text.y = element_text(size = 8))+ 
  coord_flip()
last50_bar <- last50_count %>% 
  filter(sum>100) %>% mutate(word=reorder(word, sum)) %>% 
  ggplot(aes(word, sum), nrow=30 ) + 
  geom_col() + 
  xlab(NULL)  +
  theme(text = element_text(size =14,family = 'Heiti TC Light'),
        axis.text.y = element_text(size = 5))+ 
  coord_flip()
# 將兩張ggplot圖合併為一張圖
library(ggpubr)
ggarrange(first50_bar, last50_bar, widths = c(3,4),heights =10)

#library(plyr)
#library(Rmisc)
#multiplot(first50_bar, last50_bar, cols=2)

重點六：道教詞彙整理

# 下載 "封神演義" ，並將text欄位為空的行清除，以及將重複的語句清除
fengshen_text <- gutenberg_download(23910) %>% 
  filter(text != "") %>% distinct(gutenberg_id, text)

章節

fengshen_text <- fengshen_text %>% 
  mutate(chapter = cumsum(str_detect(fengshen_text$text, regex("第.*回(\u00A0|$)"))))

斷詞

jieba_tokenizer <- worker(user = "Fengshen.traditional.dict", stop_word =file.path (ROOT.DIR , "stop_words.txt"))

fengshen_tokenizer <- function(t){
  lapply(t, function(x){
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

tokens <- fengshen_text %>% unnest_tokens(word, text, token = fengshen_tokenizer)
str(tokens)

## Classes 'tbl_df', 'tbl' and 'data.frame':    223870 obs. of  3 variables:
##  $ gutenberg_id: int  23910 23910 23910 23910 23910 23910 23910 23910 23910 23910 ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : chr  "第一回" "紂王" "女媧" "宮" ...

#計算每一章節出現字數大於1的詞彙出現的總次數
tokens_count_chapter <- tokens %>% 
  filter(nchar(.$word)>1) %>% 
  group_by(chapter,word) %>% 
  dplyr::summarise(sum = n()) %>%
  arrange(chapter) 
tokens_count_chapter <- top_n(tokens_count_chapter,3) #只選出前三名的字彙

## Selecting by sum

tokens_count_chapter

## # A tibble: 315 x 3
## # Groups:   chapter [100]
##    chapter word     sum
##      <int> <chr>  <int>
##  1       1 女媧      11
##  2       1 天子      11
##  3       1 紂王      19
##  4       1 諸侯      11
##  5       2 陛下      23
##  6       2 諸侯      24
##  7       2 蘇護      45
##  8       3 崇黑虎    24
##  9       3 鄭倫      24
## 10       3 蘇護      29
## # ... with 305 more rows

結果可以發現第一章回出現最多的是女媧、天子、紂王、諸侯，推斷可能第一回在講女媧和紂王的故事到 https://zh.wikisource.org/zh-hant/%E5%B0%81%E7%A5%9E%E6%BC%94%E7%BE%A9 可以看到第一回標題是：紂王女媧宮進香

繪圖

#繪製散布圖X軸是章節，Y軸是詞彙出現的次數
ggplot(tokens_count_chapter, aes(x = chapter, y = sum,  color = abs(sum))) +  
  geom_jitter(alpha = 0.1, size = 5, width = 0.5, height = 0.5) +  
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +  
  scale_y_log10()+
  ggtitle("各章節的出現前三多次數的字") + 
  ylab(label="出現次數")+  
  xlab(label="回數") +
  theme(text = element_text(family = "Heiti TC Light"))

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

結果可以看出第一回是由女媧作為故事的開頭，最後則是以武王作為故事的結束，

也可以發現各個角色大概出現的章回在哪

封神演義內容包含了許多道教的元素分析出現最多的道教詞彙

#匯入道教詞語辭典
#fileEncoding ="UTF-8"：使用R讀取文字檔時, 有時會遇到資料匯入有錯誤訊息或中文亂碼問題
taoism <- read.table(file.path (ROOT.DIR , "taoism.txt"),sep = "\n" ,header = FALSE, fileEncoding ="UTF-8")

## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec
## = dec, : 輸入連結 'C:/Users/Sean/Documents/20200318_bookclub_1/w4_hw//
## taoism.txt' 中的輸入不正確

taoism_count <- tokens %>% 
  filter(nchar(.$word) > 1) %>% 
  mutate(total = nrow(tokens)) %>% 
  group_by(word) %>% 
  filter(word %in% taoism$V1) %>% 
  summarise(count = n(), total = mean(total)) %>%
  mutate(proportion = count / total) %>% 
  arrange(desc(count))
head(taoism_count, 20)

## # A tibble: 20 x 4
##    word     count  total proportion
##    <chr>    <int>  <dbl>      <dbl>
##  1 廣成子     200 223870 0.000893  
##  2 道術        87 223870 0.000389  
##  3 八卦        62 223870 0.000277  
##  4 丹藥        49 223870 0.000219  
##  5 九宮        10 223870 0.0000447 
##  6 道服        10 223870 0.0000447 
##  7 甲子         9 223870 0.0000402 
##  8 二十八宿     5 223870 0.0000223 
##  9 煉丹         5 223870 0.0000223 
## 10 丁卯         3 223870 0.0000134 
## 11 丙子         3 223870 0.0000134 
## 12 符籙         3 223870 0.0000134 
## 13 丹田         2 223870 0.00000893
## 14 后土         2 223870 0.00000893
## 15 坎離         2 223870 0.00000893
## 16 步罡踏斗     2 223870 0.00000893
## 17 飛昇         2 223870 0.00000893
## 18 交梨火棗     1 223870 0.00000447
## 19 金鼎         1 223870 0.00000447
## 20 重樓         1 223870 0.00000447

結果可以發現出現最多的道教詞彙為“廣成子”，廣成子為中國道教傳說的神仙

推斷封神演義內除了有許多歷史人物外，廣成子是代表道教的重要角色

計算道教詞彙在各章節出現的總數，推斷那些章節可能在講道教的故事

taoism_chapter <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% taoism$V1) %>%
  group_by(chapter) %>%  
  summarise(count = n(), type = "words")%>%
  arrange(desc(count)) 
taoism_chapter

## # A tibble: 76 x 3
##    chapter count type 
##      <int> <int> <chr>
##  1      72    57 words
##  2      65    30 words
##  3      64    21 words
##  4      77    19 words
##  5      46    17 words
##  6      63    14 words
##  7      75    14 words
##  8      47    13 words
##  9      81    13 words
## 10      50    12 words
## # ... with 66 more rows

結果可以看出在72回是講到最多道教相關詞彙的章節

#計算道教詞彙出現在各章節的長條圖
taoism_plot = tokens %>% 
  filter(nchar(.$word)>1) %>%
  filter(word %in% taoism$V1) %>%
  group_by(chapter) %>%  
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y=count)) +
  geom_col() + 
  ggtitle("各章節的道教詞彙出現總數") + 
  xlab("章節") + 
  ylab("道教詞彙數量")+
  theme(text = element_text(family = "Heiti TC Light"))
taoism_plot

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

第九組_讀書會討論

斷句斷詞與情緒分析

2020-03-29 11:29:37

主題：封神演義

文集斷詞

重點一: 文集中角色人物-哪吒的描述

計算哪吒武器出現最多的章節統計圖

重點二: 情緒分析

計算所有字在文集中出現的總數

重點三: 情緒分析(續)

繪圖出以NTUSD字典統計的文集情緒字數，觀察兩種情緒值的差異

重點四: 文字雲

文字雲分布詞彙 (全文集)

繪圖出《封神演義》各章節長度，以語句數來計算

重點五: 比較前五十回和後五十回最常出現的前100個詞彙的差異

視覺化長條圖

重點六：道教詞彙整理

繪圖

結果可以看出第一回是由女媧作為故事的開頭，最後則是以武王作為故事的結束，

也可以發現各個角色大概出現的章回在哪

結果可以發現出現最多的道教詞彙為“廣成子”，廣成子為中國道教傳說的神仙

推斷封神演義內除了有許多歷史人物外，廣成子是代表道教的重要角色

計算道教詞彙在各章節出現的總數，推斷那些章節可能在講道教的故事

結果可以看出在72回是講到最多道教相關詞彙的章節

結果可以看出在75回前後有講到大量的道教相關詞彙