Abstract
Combining the jiebaR and tidytext packages to process Chinese text data (三國演義, Romance of the Three Kingdoms).

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): the
## operating system reports that the locale "zh_TW.UTF-8" cannot be set
## [1] ""
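The empty string returned above means the locale request failed. On Windows the POSIX-style name "zh_TW.UTF-8" is typically not recognized; a hedged workaround, assuming a Windows host (the "cht" alias is a Windows locale convention, not part of the original setup):

# Windows fallback (assumption: Windows host; "cht" is the native alias for Traditional Chinese)
# Sys.setlocale(category = "LC_ALL", locale = "cht")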
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "caTools", "knitr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
library(tidytext)
library(jiebaR)
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
library(caTools)
library(knitr)
# Download 三國演義 (Project Gutenberg ID 23950), drop rows whose text field is empty, and remove duplicated lines
red_org <- gutenberg_download(23950) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
doc = paste0(red_org$text, collapse = "") # concatenate all text fields into one string
docVector = unlist(strsplit(doc, "[。.]"), use.names = FALSE) # split into sentences on full-width or half-width periods
red = data.frame(gutenberg_id = "23950", text = docVector, stringsAsFactors = FALSE) # use the Gutenberg ID of the downloaded book
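As a quick sanity check, the character class in the split matches either period form (the sample sentence is illustrative, not taken from the corpus):

# Illustrative input only; both the full-width and half-width period split the string
strsplit("話說天下大勢,分久必合.合久必分。", "[。.]")
## [[1]]
## [1] "話說天下大勢,分久必合" "合久必分"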
history_tmp = read.csv("三國演義歷史.csv", header = FALSE, stringsAsFactors = FALSE, fileEncoding = "big5")
my_str_split <- function(x, i) lapply(strsplit(x, split = "~"), function(y) y[i]) %>% unlist %>% as.numeric() # take the i-th end of a "start~end" range as a number
history_tmp$V1[9] = "長坂坡" # repair the entry garbled during Big5 decoding
history_tmp2 = history_tmp %>% mutate(start = my_str_split(V2, 1), end = my_str_split(V2, 2)) %>% select(-V2)
colnames(history_tmp2)[1]="大綱"
history_tmp2 %>% kable
| 大綱 | start | end |
|---|---|---|
| 破黃巾,殺閹宦 | 1 | 2 |
| 收呂布,除董卓 | 3 | 9 |
| 曹操發跡 | 10 | 14 |
| 斬呂布 | 15 | 19 |
| 密謀誅曹 | 20 | 24 |
| 關羽附曹,孫權承兄 | 25 | 29 |
| 掃清袁氏,統一北方 | 30 | 33 |
| 三顧茅廬,孔明出山 | 34 | 40 |
| 長坂坡 | 41 | 42 |
| 赤壁之戰 | 43 | 50 |
| 三氣周瑜 | 51 | 57 |
| 戰馬超 | 58 | 59 |
| 劉備入川 | 60 | 65 |
| 戰合肥,立魏嗣 | 66 | 69 |
| 取漢中 | 70 | 73 |
| 荊州失,關公死 | 74 | 77 |
| 魏篡漢,蜀稱帝 | 78 | 80 |
| 夷陵大敗,劉備託孤 | 81 | 86 |
| 七擒孟獲 | 87 | 90 |
| 六出祁山,孔明病逝 | 91 | 105 |
| 九伐中原 | 106 | 115 |
| 破西蜀 | 116 | 118 |
| 司馬篡魏,三分歸晉 | 119 | 120 |
# Using regular expressions, tag each sentence with its chapter number and flag mentions of the main characters and armies
red <- red %>%
mutate(chapter = cumsum(str_detect(text, regex("第.*回:"))),
chapter_ind = (str_detect(text, regex("第.*回:"))),
liu_bei=(str_detect(text, regex("劉備|玄德"))),
cao_cao=(str_detect(text, regex("曹操|阿瞞|孟德"))),
guan_yu=(str_detect(text, regex("雲長|關公|關羽"))),
yi_de=(str_detect(text, regex("張翼德|翼德|張飛"))),
kong_ming=(str_detect(text, regex("孔明|諸葛亮"))),
wei_guo=str_detect(text, regex("魏兵")),
shu_guo=str_detect(text, regex("蜀兵")),
wu_guo=str_detect(text, regex("吳兵"))
) %>%
mutate(chapter_content=ifelse(chapter_ind,gsub(".*第.*回:(.*)( ).*", "\\1", text),""))
red_chapter=red %>% filter(chapter_ind) %>% select(chapter,chapter_content);
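A quick check that the chapter-heading pattern fires on a heading-style line and not on body text (both strings are illustrative, not from the corpus):

# Illustrative strings; the pattern requires "第…回:" to appear in the line
str_detect("第1回: 宴桃園豪傑三結義", regex("第.*回:"))
## [1] TRUE
str_detect("玄德幼時與鄉中小兒戲於樹下", regex("第.*回:"))
## [1] FALSE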
# Per chapter, compute the share of sentences mentioning 魏兵, 蜀兵, and 吳兵
red_group <- red %>% select(chapter, wei_guo, shu_guo, wu_guo) %>%
group_by(chapter) %>% summarise_all(mean)
red_group2 <- red_group %>% gather(word, prop, -chapter) %>% group_by(word) %>% left_join(red_chapter, by = "chapter") %>%
  mutate(chapter_show = ifelse(prop == max(prop), chapter_content, "")) # label only each series' peak chapter
red_group2 %>% ggplot(aes(x = chapter, y = prop, color = factor(word))) +
  geom_line(size = 1) +
  scale_colour_discrete(name = "軍隊", breaks = c("wei_guo", "shu_guo", "wu_guo"), labels = c("魏兵", "蜀兵", "吳兵")) + ylab("各回字句出現頻率") + xlab("章節") +
geom_text(aes(label = chapter_show), vjust = 0.02,hjust=1,fontface="bold")
# Repeat the per-chapter rates for the main characters (劉備, 關羽, 張飛, 諸葛亮, 曹操)
red_group <- red %>% select(chapter, liu_bei, cao_cao, guan_yu, yi_de, kong_ming) %>%
group_by(chapter) %>% summarise_all(mean)
red_group2 <- red_group %>% gather(word,prop,-chapter) %>% group_by(word)%>% left_join(red_chapter,by="chapter") %>%
mutate(chapter_show=ifelse(prop>=max(prop),chapter_content,""))
red_group2$chapter_show[red_group2$word=="yi_de"] = "" # suppress peak labels for 張飛
red_group2$chapter_show[red_group2$word=="cao_cao"] = "" # and 曹操, to reduce label clutter
red_group2 %>% ggplot(aes(x = chapter, y = prop, color = factor(word))) +
geom_line(size=1) +
scale_colour_discrete(name = "人物",breaks=c("liu_bei", "guan_yu", "yi_de","kong_ming","cao_cao"),labels =c("劉備","關羽","張飛","諸葛亮","曹操"))+ylab("各回字句出現頻率")+xlab("章節")+
geom_text(aes(label = chapter_show), vjust = 0.02,hjust=0.5,fontface="bold",size=5)
# Use the 三國演義 proper-noun dictionary (built from the Sogou .scel file below)
jieba_tokenizer <- worker(user = 'san_guo.scel_2019-03-16_17_23_08_trad.dict', stop_word = "stop_words.txt")
# Define the tokenizer function
red_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
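Before running the tokenizer over the whole corpus, it can be exercised on a single sentence (illustrative input; the exact segmentation depends on the user dictionary and stop words loaded above, so no output is shown):

red_tokenizer("玄德與孔明商議進兵之策")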
tokens <- red %>% unnest_tokens(word, text, token=red_tokenizer)
# Count token frequencies; single-character tokens are excluded
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# Show the 20 most frequent tokens
head(tokens_count, 20) %>% kable
| word | sum |
|---|---|
| 玄德 | 1779 |
| 孔明 | 1644 |
| 曹操 | 922 |
| 將軍 | 725 |
| 丞相 | 533 |
| 關公 | 501 |
| 雲長 | 430 |
| 荊州 | 408 |
| 張飛 | 364 |
| 引兵 | 359 |
| 呂布 | 355 |
| 商議 | 341 |
| 軍士 | 321 |
| 魏延 | 321 |
| 主公 | 319 |
| 大喜 | 310 |
| 孫權 | 309 |
| 趙雲 | 309 |
| 左右 | 292 |
| 軍馬 | 289 |
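The same table can be produced with the dplyr::count() shorthand; a minimal equivalent sketch:

tokens %>%
  filter(nchar(word) > 1) %>%
  count(word, sort = TRUE) %>% # count() is shorthand for group_by() + summarise(n = n()) + arrange
  rename(sum = n) %>%
  filter(sum > 10)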
Draw a word cloud of the term frequencies.

tokens_count2 = tokens_count %>% mutate(sum = (sum)^1 %>% as.integer()) # the exponent is a knob for damping skewed counts (e.g. ^0.5); ^1 keeps the raw counts
tokens_count2%>% wordcloud2()
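wordcloud2() also accepts a size multiplier, which helps when a handful of tokens dominates the counts; a hedged variant (the value 0.6 is an arbitrary illustration, not from the original analysis):

# tokens_count2 %>% wordcloud2(size = 0.6)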
plot <-
bind_rows(
red %>%
group_by(chapter) %>%
summarise(count = n(), type="sentences"),
tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")) %>%
  ggplot(aes(x = chapter, y = count, color = factor(type))) +
  geom_line() +
  ggtitle("各章節的句子與詞彙總數") +
  xlab("章節") +
  ylab("數量") #+
#theme(text = element_text(family = "Heiti TC Light"))
plot
frequency <- tokens %>% mutate(part = ifelse(chapter <= 80, "First 80", "Last 40")) %>%
  filter(nchar(word) > 1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>% # keep only the non-alphanumeric part of each token
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>% # keep the prefix before the first Chinese numeral (tokens starting with one become NA)
count(part, word)%>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last 40`)
frequency[is.na(frequency)] = min(frequency %>% select(`First 80`, proportion)) # fill NAs with the overall minimum so they still appear on the log scale
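The spread()/gather() round trip above first puts the two halves of the book side by side, then stacks `Last 40` back into long form so it can serve as one plot axis; a toy illustration with hypothetical values:

# Hypothetical values, for illustrating the reshape only
toy <- data.frame(part = c("First 80", "Last 40", "Last 40"),
                  word = c("玄德", "玄德", "孔明"),
                  proportion = c(0.02, 0.01, 0.03),
                  stringsAsFactors = FALSE)
toy %>% spread(part, proportion) %>% gather(part, proportion, `Last 40`)
# result: one row per word, with a `First 80` column plus a (part, proportion) pair for "Last 40"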
ggplot(frequency, aes(x = proportion, y = `First 80`, color = abs(`First 80` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
#geom_jitter(alpha = 0.01, size = 2.5, width = 0.01, height = 0.01) +
#geom_point() +
geom_text(aes(label = word), vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 80", x = "Last 40")
## Warning: Removed 32048 rows containing missing values (geom_text).
Using the Sogou vocabulary

Sogou Pinyin is the dominant pinyin input method in China, with a market share of around 50%. Its official site offers many domain-specific vocabularies for free download; see the following link for details:
https://pinyin.sogou.com/dict/
# Install packages
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Load libraries
library(readr)
library(devtools)
# For decoding .scel files
#install_github("qinwf/cidian")
library(cidian)
# Simplified-to-Traditional conversion package
#install_github("qinwf/ropencc")
library(ropencc)
# Decode the .scel file
decode_scel(scel = "./san_guo.scel",cpp = TRUE)
## output file: ./san_guo.scel_2019-03-18_23_35_35.dict
# Inspect the dictionary file produced by decoding (the timestamp in the filename reflects the decode time)
scan(file="./san_guo.scel_2019-03-16_17_23_08.dict",
what=character(),nlines=50,sep='\n',
encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./san_guo.scel_2019-03-16_17_23_08.dict", what =
## character(), : invalid input found on input connection
## './san_guo.scel_2019-03-16_17_23_08.dict'
## [1] "阿斗"
dict <- read_file("./san_guo.scel_2019-03-16_17_23_08.dict")
# Convert the simplified dictionary to Traditional Chinese
cc <- converter(S2TW)
dict_trad <- cc[dict]
write_file(dict_trad, "./san_guo.scel_2019-03-16_17_23_08_trad.dict")
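A one-line check of the converter on a sample simplified string (illustrative input, not from the dictionary):

cc["三国志通俗演义"] # expected to return "三國志通俗演義" (S2TW maps 国→國, 义→義)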
# Inspect the dictionary file after conversion to Traditional Chinese
scan(file="./san_guo.scel_2019-03-16_17_23_08_trad.dict",
what=character(),nlines=50,sep='\n',
encoding='utf-8',fileEncoding='utf-8')
## [1] "阿斗當皇帝軟弱無能 n" "阿斗的江山白送 n" "阿會喃 n"
## [4] "阿陽 n" "哀牢 n" "艾縣 n"
## [7] "安北將軍 n" "安城 n" "安次 n"
## [10] "安德 n" "安定 n" "安定郡 n"
## [13] "安東將軍 n" "安豐 n" "安故 n"
## [16] "安廣 n" "安國 n" "安漢 n"
## [19] "安樂 n" "安樂公 n" "安陵 n"
## [22] "安陸 n" "安彌 n" "安南將軍 n"
## [25] "安平 n" "安平國 n" "安丘 n"
## [28] "安世 n" "安市 n" "安熹 n"
## [31] "安西將軍 n" "安陽 n" "安夷 n"
## [34] "安邑 n" "安遠將軍 n" "安眾 n"
## [37] "奧汀多賴把 n" "鰲頭兩刃斧 n" "媼圍 n"
## [40] "巴郡 n" "霸陵 n" "八路諸侯 n"
## [43] "灞水 n" "八校尉兵 n" "拔用 n"
## [46] "霸者之威 n" "白帝城託孤 n" "白鶴 n"
## [49] "白虹 n" "白虎銀牙 n"