系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""

安裝需要的packages

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 3.4.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.4.4
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 3.4.4
library(stringr)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(tidyr)
library(scales)
## Warning: package 'scales' was built under R version 3.4.4
require(caTools)
## Loading required package: caTools
require(knitr)
## Loading required package: knitr

書籍下載

# 下載 "三國演義" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除

red_org <- gutenberg_download(23950) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Warning: package 'bindrcpp' was built under R version 3.4.4
doc = paste0(red_org$text,collapse = "") #將text欄位的全部文字合併
docVector = unlist(strsplit(doc,"[。.]"), use.names=FALSE) #以全形或半形句號斷句
red = data.frame(gutenberg_id = "23950" , text = docVector,stringsAsFactors = FALSE) #gutenberg_id換成自己的書本id

三國演義大綱

history_tmp=read.csv("三國演義歷史.csv",header=FALSE,stringsAsFactors=FALSE,fileEncoding="big5")
my_str_split<-function(x,i) lapply(strsplit(x,split="~"),function(y) y[i]) %>% unlist %>% as.numeric()
#history_tmp$V1[9]="長坂坡"
history_tmp2=history_tmp %>% mutate(start=my_str_split(V2,1),end=my_str_split(V2,2)) %>% select(-V2);
colnames(history_tmp2)[1]="大綱"
history_tmp2 %>% kable
大綱 start end
破黃巾,殺閹宦 1 2
收呂布,除董卓 3 9
曹操發跡 10 14
斬呂布 15 19
密謀誅曹 20 24
關羽附曹,孫權承兄 25 29
掃清袁氏,統一北方 30 33
三顧茅廬,孔明出山 34 40
長?坡 41 42
赤壁之戰 43 50
三氣周瑜 51 57
戰馬超 58 59
劉備入川 60 65
戰合肥,立魏嗣 66 69
取漢中 70 73
荊州失,關公死 74 77
魏篡漢,蜀稱帝 78 80
夷陵大敗,劉備託孤 81 86
七擒孟獲 87 90
六出祁山,孔明病逝 91 105
九伐中原 106 115
破西蜀 116 118
司馬篡魏,三分歸晉 119 120

章節與關鍵字尋找

  • 三國演義中每章的開始會有“第XXX回:”為標題
  • 關鍵字字句尋找
  • 劉備 :【劉備】、【玄德】
  • 關羽 :【雲長】、【關羽】、【關公】
  • 張飛 :【張飛】、【翼德】、【張翼德】
  • 諸葛亮:【諸葛亮】、【孔明】
  • 曹操 :【曹操】、【阿瞞】、【孟德】
  • 魏國 :【魏兵】
  • 蜀國 :【蜀兵】
  • 吳國 :【吳兵】
# 根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
red <- red %>% 
  mutate(chapter = cumsum(str_detect(text, regex("第.*回:"))),
         chapter_ind = (str_detect(text, regex("第.*回:"))),
         liu_bei=(str_detect(text, regex("劉備|玄德"))),
         cao_cao=(str_detect(text, regex("曹操|阿瞞|孟德"))),
         guan_yu=(str_detect(text, regex("雲長|關公|關羽"))),
         yi_de=(str_detect(text, regex("張翼德|翼德|張飛"))),
         kong_ming=(str_detect(text, regex("孔明|諸葛亮"))),
         wei_guo=str_detect(text, regex("魏兵")),
         shu_guo=str_detect(text, regex("蜀兵")),
         wu_guo=str_detect(text, regex("吳兵"))
         ) %>% 
    mutate(chapter_content=ifelse(chapter_ind,gsub(".*第.*回:(.*)(  ).*", "\\1", text),""))
red_chapter=red %>% filter(chapter_ind) %>% select(chapter,chapter_content);
#   mutate(chapter_content=ifelse(chapter_ind,gsub("(  ).*","", text),""))
# red_chapter=red %>% filter(chapter_ind) %>% select(chapter,chapter_content);
  • 計算三國演義每一回,各國兵句子出現頻率
  • 由該圖能簡單看出三國的創立和交戰火熱情況
  • 將最高點的章節名稱放入
# 根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
red_group <- red %>% select(-gutenberg_id,-text,-chapter_content,-chapter_ind) %>% select(chapter,wei_guo,shu_guo,wu_guo) %>% 
  group_by(chapter) %>% summarise_all(mean)

red_group2 <- red_group %>% gather(word,prop,-chapter) %>% group_by(word)%>% left_join(red_chapter,by="chapter") %>% 
  mutate(chapter_show=ifelse(prop==max(prop),chapter_content,""))

red_group2 %>% ggplot(aes(x = chapter, y=(prop), fill="type", color=factor(word))) +
  geom_line(size=1) + 
  scale_colour_discrete(name = "人物",breaks=c("wei_guo", "shu_guo", "wu_guo"),labels =c("魏兵","蜀兵","吳兵"))+ylab("各回字句出現頻率")+xlab("章節")+
  geom_text(aes(label = chapter_show), vjust = 0.02,hjust=1,fontface="bold")

  • 同樣計算三國演義每一回,人物在句子出現頻率
  • 由該圖能看出作者描述各人物的情況
  • 劉備各回的描述較多,並且部分最高的其中一回 0.4 的句子都有劉備參與 (人物飽滿 )
  • 描述關羽的回數相比於劉備較少,但在最高回有 0.45 的句子都有關羽參與 (人物飽滿 )
  • 孔明的描述主要在七擒孟獲和與司馬懿交戰才大量提升
  • 張飛描述相對其他人較少
  • 將最高點的章節名稱放入
# 根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
red_group <- red %>% select(-gutenberg_id,-text,-chapter_content,-chapter_ind) %>% select(-wei_guo,-shu_guo,-wu_guo) %>% 
  group_by(chapter) %>% summarise_all(mean)

red_group2 <- red_group %>% gather(word,prop,-chapter) %>% group_by(word)%>% left_join(red_chapter,by="chapter") %>% 
  mutate(chapter_show=ifelse(prop>=max(prop),chapter_content,""))

red_group2$chapter_show[red_group2$word=="yi_de"]=""
red_group2$chapter_show[red_group2$word=="cao_cao"]=""

red_group2 %>% ggplot(aes(x = chapter, y=(prop), fill="type", color=factor(word))) +
  geom_line(size=1) + 
  scale_colour_discrete(name = "人物",breaks=c("liu_bei", "guan_yu", "yi_de","kong_ming","cao_cao"),labels =c("劉備","關羽","張飛","諸葛亮","曹操"))+ylab("各回字句出現頻率")+xlab("章節")+
  geom_text(aes(label = chapter_show), vjust = 0.02,hjust=0.5,fontface="bold",size=5)

中文斷詞

  • 將三國演義詞典【san_guo.scel_2019-03-16_17_23_08_trad.dict】納入建立 R jieba worker
  • 進行斷詞
# 使用三國演義專有名詞字典
jieba_tokenizer <- worker(user = 'san_guo.scel_2019-03-16_17_23_08_trad.dict', stop_word = "stop_words.txt")
# 設定斷詞function
red_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

tokens <- red %>% unnest_tokens(word, text, token=red_tokenizer)

計算詞頻

  • 只保留字數大於 1 與詞頻大於 10 的字
  • 列出詞頻前 20 大的字
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(tokens_count, 20) %>% kable
word sum
玄德 1779
孔明 1644
曹操 922
將軍 725
丞相 533
關公 501
雲長 430
荊州 408
張飛 364
引兵 359
呂布 355
商議 341
軍士 321
魏延 321
主公 319
大喜 310
孫權 309
趙雲 309
左右 292
軍馬 289

文字雲

畫出詞頻的文字雲

tokens_count2=tokens_count %>% mutate(sum=(sum)^1 %>% as.integer())
tokens_count2%>% wordcloud2()

各章節長度,以語句數來計算

plot <- 
  bind_rows(
    red %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="sentences"),
    tokens %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="words")) %>% 
  group_by(type)%>%
  ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
  geom_line() + 
  ggtitle("各章節的句子總數") + 
  xlab("章節") + 
  ylab("句子數量") #+ 
  #theme(text = element_text(family = "Heiti TC Light"))
plot

書籍撰寫前 80 回於後 40 回字頻差異

frequency <- tokens %>% mutate(part = ifelse(chapter<=80, "First 80", "Last 40")) %>%
  filter(nchar(.$word)>1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(part, word)%>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>% 
  gather(part, proportion, `Last 40`) 
frequency[is.na(frequency)]=min(frequency %>% select(`First 80`,proportion))
ggplot(frequency, aes(x = proportion, y = `First 80`, color = abs(`First 80` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  #geom_jitter(alpha = 0.01, size = 2.5, width = 0.01, height = 0.01) +
  #geom_point() +
  geom_text(aes(label = word), vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "First 80", x = "Last 40")
## Warning: Removed 32048 rows containing missing values (geom_text).


額外補充

使用搜狗詞彙庫
搜狗輸入法是中國主流的拼音輸入法,在中國的市佔率高達50%。
並且,其官網提供了眾多專有詞彙的詞彙庫供使用者免費下載。
詳情可以參考一下連結:
https://pinyin.sogou.com/dict/

# 安裝packages
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# 載入library
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(devtools)

# 解碼scel用
#install_github("qinwf/cidian")
library(cidian)
## Loading required package: stringi
## Warning: package 'stringi' was built under R version 3.4.4
## Loading required package: pbapply
## Warning: package 'pbapply' was built under R version 3.4.4
# 簡體轉繁體套件
#install_github("qinwf/ropencc")
library(ropencc)
# 解碼scel檔案
decode_scel(scel = "./san_guo.scel",cpp = TRUE)
## output file: ./san_guo.scel_2019-03-18_23_35_35.dict
# 讀取解碼後生成的詞庫檔案
scan(file="./san_guo.scel_2019-03-16_17_23_08.dict",
      what=character(),nlines=50,sep='\n',
      encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./san_guo.scel_2019-03-16_17_23_08.dict", what =
## character(), : 輸入連結 './san_guo.scel_2019-03-16_17_23_08.dict' 中的輸入
## 不正確
## [1] "阿斗"
dict <- read_file("./san_guo.scel_2019-03-16_17_23_08.dict")
# 將簡體詞庫轉為繁體
cc <- converter(S2TW)
dict_trad <- cc[dict]
write_file(dict_trad, "./san_guo.scel_2019-03-16_17_23_08_trad.dict")
# 讀取轉換成繁體後的詞庫檔案
scan(file="./san_guo.scel_2019-03-16_17_23_08_trad.dict",
      what=character(),nlines=50,sep='\n',
      encoding='utf-8',fileEncoding='utf-8')
##  [1] "阿斗當皇帝軟弱無能 n" "阿斗的江山白送 n"     "阿會喃 n"            
##  [4] "阿陽 n"               "哀牢 n"               "艾縣 n"              
##  [7] "安北將軍 n"           "安城 n"               "安次 n"              
## [10] "安德 n"               "安定 n"               "安定郡 n"            
## [13] "安東將軍 n"           "安豐 n"               "安故 n"              
## [16] "安廣 n"               "安國 n"               "安漢 n"              
## [19] "安樂 n"               "安樂公 n"             "安陵 n"              
## [22] "安陸 n"               "安彌 n"               "安南將軍 n"          
## [25] "安平 n"               "安平國 n"             "安丘 n"              
## [28] "安世 n"               "安市 n"               "安熹 n"              
## [31] "安西將軍 n"           "安陽 n"               "安夷 n"              
## [34] "安邑 n"               "安遠將軍 n"           "安眾 n"              
## [37] "奧汀多賴把 n"         "鰲頭兩刃斧 n"         "媼圍 n"              
## [40] "巴郡 n"               "霸陵 n"               "八路諸侯 n"          
## [43] "灞水 n"               "八校尉兵 n"           "拔用 n"              
## [46] "霸者之威 n"           "白帝城託孤 n"         "白鶴 n"              
## [49] "白虹 n"               "白虎銀牙 n"