## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
## reports request to set locale to "zh_TW.UTF-8" cannot be honored
## [1] ""
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: tidytext
## Loading required package: jiebaR
## Loading required package: jiebaRD
## Loading required package: gutenbergr
## Loading required package: stringr
## Loading required package: wordcloud2
## Loading required package: ggplot2
## Loading required package: tidyr
## Loading required package: scales
# 下載 "三國演義" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
three <- gutenbergr::gutenberg_download(23950, mirror = "http://mirrors.xmission.com/gutenberg/") %>% filter(text!="") %>% distinct(gutenberg_id, text)
# 將text欄位的全部文字合併
doc = paste0(three$text,collapse = "")
# 以全形或半形句號斷句
docVector = unlist(strsplit(doc,"[。.?!]"), use.names=FALSE)
three = data.frame(gutenberg_id = "23950" , text = docVector)
three$text <- as.character(three$text)
three$gutenberg_id <- as.integer(three$gutenberg_id)
# tail(three, 100)# 根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
three <- three %>%
mutate(chapter = cumsum(str_detect(three$text, regex(".*第.{1,3}回.*"))))
str(three)## 'data.frame': 29830 obs. of 3 variables:
## $ gutenberg_id: int 23950 23950 23950 23950 23950 23950 23950 23950 23950 23950 ...
## $ text : chr "第一回:宴桃園豪傑三結義,斬黃巾英雄首立功 詞曰: 滾滾長江東逝水,浪花淘盡英雄" "是非成敗轉頭空:青山依舊在,幾度夕陽紅" "白髮漁樵江渚上,慣看秋月春風" "一壺濁酒喜相逢:古今多少事,都付笑談中" ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
# 安裝packages
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
## Loading required package: usethis
## # tmcn Version: 0.2-13
dict <- read_file("./three.text")
# 簡體轉為繁體
dict_trad <-toTrad(dict)
write_file(dict_trad, "three.traditional.dict")
# 讀取轉換成繁體後的詞庫檔案
scan(file="D:/R_1092/three.traditional.dict",
what=character(),nlines=50,sep='\n',
encoding='utf-8',fileEncoding='utf-8')## [1] "阿斗當皇帝軟弱無能" "阿斗的江山白送" "阿會喃"
## [4] "阿陽" "哀牢" "艾縣"
## [7] "安北將軍" "安城" "安次"
## [10] "安德" "安定" "安定郡"
## [13] "安東將軍" "安豐" "安故"
## [16] "安廣" "安國" "安漢"
## [19] "安樂" "安樂公" "安陵"
## [22] "安陸" "安彌" "安南將軍"
## [25] "安平" "安平國" "安丘"
## [28] "安世" "安市" "安熹"
## [31] "安西將軍" "安陽" "安夷"
## [34] "安邑" "安遠將軍" "安眾"
## [37] "奧汀多賴把" "鰲頭兩刃斧" "媼圍"
## [40] "巴郡" "霸陵" "八路諸侯"
## [43] "灞水" "八校尉兵" "拔用"
## [46] "霸者之威" "白帝城托孤" "白鶴"
## [49] "白虹" "白虎銀牙"
# 使用三國演義專有名詞字典
jieba_tokenizer <- worker(user="three.traditional.dict", stop_word = "stop_word.txt")
new_user_word(jieba_tokenizer, c("桃園豪傑三結義","長江"))## [1] TRUE
tokens = segment(three$text, jieba_tokenizer)
# 設定斷詞function
three_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}## 'data.frame': 293709 obs. of 3 variables:
## $ gutenberg_id: int 23950 23950 23950 23950 23950 23950 23950 23950 23950 23950 ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "宴" "桃園豪傑三結義" "斬黃巾英雄首立功" ...
## gutenberg_id chapter word
## 1 23950 1 第一回
## 2 23950 1 宴
## 3 23950 1 桃園豪傑三結義
## 4 23950 1 斬黃巾英雄首立功
## 5 23950 1
## 6 23950 1
## 7 23950 1 詞曰
## 8 23950 1
## 9 23950 1
## 10 23950 1 滾滾
## 11 23950 1 長江
## 12 23950 1 東
## 13 23950 1 逝水
## 14 23950 1 浪花
## 15 23950 1 淘盡
## 16 23950 1 英雄
## 17 23950 1 是非成敗
## 18 23950 1 轉頭
## 19 23950 1 空
## 20 23950 1 青山
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
three_tokens_count <- three_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(three_tokens_count, 20)## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 玄德 1779
## 2 孔明 1644
## 3 曹操 923
## 4 將軍 708
## 5 丞相 532
## 6 關公 502
## 7 雲長 429
## 8 荊州 408
## 9 張飛 364
## 10 引兵 360
## 11 呂布 355
## 12 商議 341
## 13 軍士 321
## 14 魏延 321
## 15 主公 319
## 16 大喜 310
## 17 孫權 309
## 18 趙雲 309
## 19 左右 292
## 20 軍馬 289
tokens_shiuan_de <- three_tokens %>%
filter(.$word == "玄德" | .$word == "劉備") %>%
group_by(chapter) %>%
summarise(count = n()) %>%
mutate(word = "玄德")
tokens_kung_ming <- three_tokens %>%
filter(.$word == "孔明" | .$word == "諸葛亮") %>%
group_by(chapter) %>%
summarise(count = n()) %>%
mutate(word = "孔明")
tokens_tsau_tsau<- three_tokens %>%
filter(.$word == "曹操" | .$word == "孟德") %>%
group_by(chapter) %>%
summarise(count = n()) %>%
mutate(word = "曹操")
tokens_guan_gung<- three_tokens %>%
filter(.$word == "關公" | .$word == "雲長") %>%
group_by(chapter) %>%
summarise(count = n()) %>%
mutate(word = "關公")
tokens_jang_fei<- three_tokens %>%
filter(.$word == "張飛" | .$word == "翼德") %>%
group_by(chapter) %>%
summarise(count = n()) %>%
mutate(word = "張飛")major_name_compare_plot <-
bind_rows(tokens_shiuan_de, tokens_kung_ming, tokens_tsau_tsau, tokens_guan_gung, tokens_jang_fei) %>%
ggplot(aes(x = chapter, y=count, fill=word)) +
geom_col(show.legend = F) +
facet_wrap(~word, ncol = 1) +
ggtitle("「劉備」v.s.「孔明」v.s.「曹操」v.s.「關羽」v.s.「張飛」") +
xlab("章節") +
ylab("出現次數")
# theme(text = element_text(family = ""))
major_name_compare_plotch_sentences_three <- three %>%
group_by(chapter) %>%
summarise(count = n(), type="sentences")
ch_word_three <- three_tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")
which.max(ch_word_three$count == 3598)## [1] 15
three_length_plot <-
bind_rows(ch_sentences_three, ch_word_three) %>%
group_by(type)%>%
ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
geom_line() +
geom_vline(xintercept = 15, col='red', size = 0.2) +
ggtitle("各章節的句子和詞彙總數") +
xlab("章節") +
ylab("數量")
# theme(text = element_text())
three_length_plot在第15回時出現詞彙總數最多
frequency <- three_tokens %>% mutate(part = ifelse(chapter<=87, "First 87", "Last 33")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last 33`)
frequency## # A tibble: 40,239 x 4
## word `First 87` part proportion
## <chr> <dbl> <chr> <dbl>
## 1 丁巳 0.00000820 Last 33 NA
## 2 丁公是 0.00000820 Last 33 NA
## 3 丁夫人 0.0000164 Last 33 NA
## 4 丁立 NA Last 33 0.0000213
## 5 丁命 0.00000820 Last 33 NA
## 6 丁奉 0.000312 Last 33 0.000384
## 7 丁奉雪中奮短兵 NA Last 33 0.0000213
## 8 丁咸 NA Last 33 0.0000213
## 9 丁封 NA Last 33 0.0000426
## 10 丁軍健 NA Last 33 0.0000213
## # ... with 40,229 more rows
ggplot(frequency, aes(x = proportion, y = `First 87`, color = abs(`First 87` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2, width = 0.3, height = 0.3, na.rm = T) +
geom_text(aes(label = word), check_overlap = T, na.rm = T) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 87", x = "Last 33")