Abstract
Combining the jiebaR and tidytext packages to process a Chinese novel from Project Gutenberg: Romance of the Three Kingdoms (三國演義).
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
[1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
# 下載 "三國演義" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
three <- gutenberg_download(23950) %>% filter(text!="") %>% distinct(gutenberg_id, text)
doc = paste0(three$text, collapse = "") # concatenate the text column into one string
docVector = unlist(strsplit(doc, "[。.?!]"), use.names = FALSE) # split into sentences on full- or half-width terminal punctuation
three = data.frame(gutenberg_id = "23950", text = docVector) # replace gutenberg_id with your own book's id
three$text <- as.character(three$text)
# convert the factor to character before as.integer(), otherwise the result is the factor level index (1) rather than 23950
three$gutenberg_id <- as.integer(as.character(three$gutenberg_id))
View(three)
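The split pattern above treats both the full-width 。 and the half-width sentence-ending marks as boundaries. A quick illustration on a sample string (ours, for demonstration):
# both "." and "。" act as sentence boundaries here
unlist(strsplit("話說天下大勢.分久必合。合久必分", "[。.?!]"))
# expected: "話說天下大勢" "分久必合" "合久必分"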
Inspecting the data, we can see that every chapter of the novel opens with a title of the form 第X回 ("Chapter X"),
e.g. 第一回:宴桃園豪傑三結義,斬黃巾英雄首立功
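Before applying this rule, we can sanity-check the regular expression on two sample strings (chosen by us for illustration):
# the heading pattern should match a chapter title but not an ordinary sentence
str_detect(c("第一回:宴桃園豪傑三結義,斬黃巾英雄首立功", "白髮漁樵江渚上"),
           regex("^第.*回.*"))
# expected: TRUE FALSE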
# Using the rule identified above, apply a regular expression to assign each sentence to its chapter
three <- three %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回.*"))))
str(three)
'data.frame': 29830 obs. of 3 variables:
$ gutenberg_id: int 23950 23950 23950 23950 23950 23950 23950 23950 23950 23950 ...
$ text : chr "第一回:宴桃園豪傑三結義,斬黃巾英雄首立功 詞曰: 滾滾長江東逝水,浪花淘盡英雄" "是非成敗轉頭空:青山依舊在,幾度夕陽紅" "白髮漁樵江渚上,慣看秋月春風" "一壺濁酒喜相逢:古今多少事,都付笑談中" ...
$ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
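The cumsum() trick works because str_detect() yields TRUE (i.e. 1) exactly at each chapter heading, so the running total equals the number of headings seen so far, which is the current chapter number. A toy example:
cumsum(c(TRUE, FALSE, FALSE, TRUE, FALSE))
# [1] 1 1 1 2 2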
# Install additional packages
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Load libraries
library(readr)
library(devtools)
# for decoding .scel (Sogou cell dictionary) files
install_github("qinwf/cidian")
Skipping install of 'cidian' from a github remote, the SHA1 (834f0bd0) has not changed since last install.
Use `force = TRUE` to force installation
library(cidian)
# package for converting Simplified Chinese to Traditional Chinese
install_github("qinwf/ropencc")
Skipping install of 'ropencc' from a github remote, the SHA1 (a5deb1fb) has not changed since last install.
Use `force = TRUE` to force installation
library(ropencc)
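As a quick illustration of ropencc (following the bracket interface shown in its README; the sample string is ours):
cc_demo <- converter(S2TW)  # Simplified Chinese -> Taiwan-standard Traditional
cc_demo["三国演义"]          # expected: "三國演義"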
# Decode the .scel file into a plain-text dictionary
decode_scel(scel = "three.scel",cpp = TRUE)
output file: three.scel_2019-03-17_13_23_16.dict
# Preview the decoded dictionary (one entry per line: a word plus the POS tag n; the timestamped filename below comes from an earlier decoding run)
scan(file="three.scel_2019-03-15_19_50_08.dict",
what=character(),nlines=50,sep='\n',
encoding='utf-8',fileEncoding='utf-8')
Read 50 items
[1] "阿斗当皇帝软弱无能 n" "阿斗的江山白送 n" "阿会喃 n" "阿阳 n"
[5] "哀牢 n" "艾县 n" "安北将军 n" "安城 n"
[9] "安次 n" "安德 n" "安定 n" "安定郡 n"
[13] "安东将军 n" "安丰 n" "安故 n" "安广 n"
[17] "安国 n" "安汉 n" "安乐 n" "安乐公 n"
[21] "安陵 n" "安陆 n" "安弥 n" "安南将军 n"
[25] "安平 n" "安平国 n" "安丘 n" "安世 n"
[29] "安市 n" "安熹 n" "安西将军 n" "安阳 n"
[33] "安夷 n" "安邑 n" "安远将军 n" "安众 n"
[37] "奥汀多赖把 n" "鳌头两刃斧 n" "媪围 n" "巴郡 n"
[41] "霸陵 n" "八路诸侯 n" "灞水 n" "八校尉兵 n"
[45] "拔用 n" "霸者之威 n" "白帝城托孤 n" "白鹤 n"
[49] "白虹 n" "白虎银牙 n"
dict <- read_file("three.scel_2019-03-15_19_50_08.dict")
# Convert the Simplified Chinese dictionary to Traditional Chinese
cc <- converter(S2TW)
dict_trad <- cc[dict]
write_file(dict_trad, "three.traditional.dict")
# Read the dictionary after conversion to Traditional Chinese
scan(file="three.traditional.dict",
what=character(),nlines=50,sep='\n',
encoding='utf-8',fileEncoding='utf-8')
Read 50 items
[1] "阿斗當皇帝軟弱無能 n" "阿斗的江山白送 n" "阿會喃 n" "阿陽 n"
[5] "哀牢 n" "艾縣 n" "安北將軍 n" "安城 n"
[9] "安次 n" "安德 n" "安定 n" "安定郡 n"
[13] "安東將軍 n" "安豐 n" "安故 n" "安廣 n"
[17] "安國 n" "安漢 n" "安樂 n" "安樂公 n"
[21] "安陵 n" "安陸 n" "安彌 n" "安南將軍 n"
[25] "安平 n" "安平國 n" "安丘 n" "安世 n"
[29] "安市 n" "安熹 n" "安西將軍 n" "安陽 n"
[33] "安夷 n" "安邑 n" "安遠將軍 n" "安眾 n"
[37] "奧汀多賴把 n" "鰲頭兩刃斧 n" "媼圍 n" "巴郡 n"
[41] "霸陵 n" "八路諸侯 n" "灞水 n" "八校尉兵 n"
[45] "拔用 n" "霸者之威 n" "白帝城託孤 n" "白鶴 n"
[49] "白虹 n" "白虎銀牙 n"
# Use the novel-specific proper-noun dictionary, together with a local stop-word list (stop_words.txt)
jieba_tokenizer <- worker(user="three.traditional.dict", stop_word = "stop_words.txt")
# Define the tokenizer function to pass to unnest_tokens
three_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
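Before tokenizing the whole corpus, it is worth spot-checking the worker on a single sentence (the sentence is our own example; the exact segmentation depends on the dictionary):
segment("玄德與關公、張飛於桃園結義", jieba_tokenizer)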
three_tokens <- three %>% unnest_tokens(word, text, token=three_tokenizer)
str(three_tokens)
'data.frame': 236150 obs. of 3 variables:
$ gutenberg_id: int 23950 23950 23950 23950 23950 23950 23950 23950 23950 23950 ...
$ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
$ word : chr "第一回" "宴桃園豪傑三結義" "斬黃巾英雄首立功" "詞曰" ...
head(three_tokens, 20)
# Count word frequencies; single-character words are excluded, and only words appearing more than 10 times are kept
three_tokens_count <- three_tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 10) %>%
  arrange(desc(sum))
# Print the 20 most common words
head(three_tokens_count, 20)
three_tokens_count %>% wordcloud2()
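As a static alternative to the interactive word cloud, the same counts can be drawn as a bar chart (a sketch reusing the columns defined above):
three_tokens_count %>%
  top_n(20, sum) %>%
  ggplot(aes(x = reorder(word, sum), y = sum)) +
  geom_col() +
  coord_flip() +
  xlab("詞彙") +
  ylab("出現次數") +
  theme(text = element_text(family = "Heiti TC Light"))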
The text refers to the same person by several names (e.g. 玄德 and 劉備 both denote Liu Bei), so to count a character's appearances correctly, the different names for one person are tallied together here.
tokens_shiuan_de <- three_tokens %>%
  filter(word %in% c("玄德", "劉備")) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "玄德")
tokens_kung_ming <- three_tokens %>%
  filter(word %in% c("孔明", "諸葛亮")) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "孔明")
tokens_tsau_tsau <- three_tokens %>%
  filter(word %in% c("曹操", "孟德")) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "曹操")
tokens_guan_gung <- three_tokens %>%
  filter(word %in% c("關公", "雲長")) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "關公")
tokens_jang_fei <- three_tokens %>%
  filter(word %in% c("張飛", "翼德")) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "張飛")
First, the three most frequently appearing characters.
major_name_compare_plot <-
bind_rows(tokens_shiuan_de, tokens_kung_ming, tokens_tsau_tsau) %>%
ggplot(aes(x = chapter, y=count, fill=word)) +
geom_col(show.legend = F) +
facet_wrap(~word, ncol = 1) +
ggtitle("「曹操」v.s.「孔明」v.s.「玄德」") +
xlab("章節") +
ylab("出現次數") +
theme(text = element_text(family = "Heiti TC Light"))
major_name_compare_plot
Next, the famous three sworn brothers of the Peach Garden Oath.
lgj_name_compare_plot <-
bind_rows(tokens_shiuan_de, tokens_guan_gung, tokens_jang_fei) %>%
ggplot(aes(x = chapter, y=count, fill=word)) +
geom_col(show.legend = F) +
facet_wrap(~word, ncol = 1) +
ggtitle("「關公」v.s.「玄德」v.s.「張飛」") +
xlab("章節") +
ylab("出現次數") +
theme(text = element_text(family = "Heiti TC Light"))
lgj_name_compare_plot
# Count the number of sentences and the number of words in each chapter
ch_sentences_three <- three %>%
  group_by(chapter) %>%
  summarise(count = n(), type = "sentences")
ch_word_three <- three_tokens %>%
  group_by(chapter) %>%
  summarise(count = n(), type = "words")
three_length_plot <-
  bind_rows(ch_sentences_three, ch_word_three) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line() +
  geom_vline(xintercept = 71, col = 'red', size = 0.2) +
  ggtitle("各章節的句子和詞彙總數") +
  xlab("章節") +
  ylab("數量") +
  theme(text = element_text(family = "Heiti TC Light"))
three_length_plot
The Three Kingdoms period (AD 184 to AD 280) spans 97 years in total. The first 103 chapters of the novel recount the first 51 of those years,
while the remaining 46 years are wrapped up in a mere 17 chapters. We therefore compare how word usage differs between the two parts,
computing each word's share of all words in the first 103 chapters versus the last 17.
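The spread()/gather() reshaping at the end of the pipeline below can be opaque; here is a toy illustration with made-up data (hypothetical, for exposition only):
toy <- data.frame(part = c("A", "A", "B"), word = c("x", "y", "x"),
                  proportion = c(0.6, 0.4, 1.0), stringsAsFactors = FALSE)
toy %>% spread(part, proportion) %>% gather(part, proportion, B)
#   word   A part proportion
# 1    x 0.6    B          1
# 2    y 0.4    B         NA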
frequency <- three_tokens %>%
  mutate(part = ifelse(chapter <= 103, "First 103", "Last 17")) %>%
  filter(nchar(word) > 1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>% # keep only the non-alphanumeric part of each token (pure-ASCII tokens become NA)
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>% # keep the prefix before the first Chinese numeral
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(part, proportion) %>%
  gather(part, proportion, `Last 17`)
Plot the comparison.
ggplot(frequency, aes(x = proportion, y = `First 103`, color = abs(`First 103` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2, width = 0.3, height = 0.3, na.rm = T) +
geom_text(aes(label = word), check_overlap = T, family = "Heiti TC Light", na.rm = T) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 103", x = "Last 17")