Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)require(dplyr)## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)## Loading required package: tidytext
require(jiebaR)## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)## Loading required package: gutenbergr
require(stringr)## Loading required package: stringr
require(wordcloud2)## Loading required package: wordcloud2
require(ggplot2)## Loading required package: ggplot2
require(tidyr)## Loading required package: tidyr
require(scales)## Loading required package: scales
Gutenberg free eBooks
Also, various chinese books can be found in the link below:
水滸後傳 by Chen, Chen:
# 下載 "水滸後傳" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
water <- gutenbergr::gutenberg_download(25217, mirror="http://mirrors.xmission.com/gutenberg/") %>% filter(text!="") %>% distinct(gutenberg_id, text)
water## # A tibble: 7,651 x 2
## gutenberg_id text
## <int> <chr>
## 1 25217 第一回 阮統制感舊梁山泊 張別駕激變石碣村
## 2 25217 甲馬營中香孩兒,志氣倜儻真雄姿。殿前點檢作天子,陳橋兵變回京師。~
## 3 25217 黃袍加身御海宇,五代紛爭從此止。功臣杯酒釋兵權,神武不殺古無比。可惜~
## 4 25217 時無輔弼臣,雜王雜霸治未馴。燭影斧聲千古疑,豈容再誤傷天倫。立未逾年~
## 5 25217 改號蚤,金滕誓約為故草。秦王貶黜尺布謠,德昭德芳俱橫夭。豎儒倡議欲南~
## 6 25217 遷,宗社岌岌烽火連。御蓋過河呼萬歲,南兄北弟始兩全。澶淵之役作孤注,~
## 7 25217 乾坤再造功無二。朝中不拔眼中釘,雷陽枯竹沾新淚。聖人特降赤腳仙,深仁~
## 8 25217 厚澤四十年。南街笑似黃河清,樞使夜奪崑崙天。青苗法行繫安石,鄭俠繪圖~
## 9 25217 傷國脈。天津橋上子規啼,半山堂內無籌畫。首揆幸有涑水公,市夫傭販皆融~
## 10 25217 融。軍中韓范驚破膽,金蓮送歸內翰營。元祐黨人何所負,竄逐誅夷皆准奏。~
## # ... with 7,641 more rows
water <- water %>%
mutate(chapter = cumsum(str_detect(water$text, regex("^( |\u3000)*第.*回 {1}"))))head(water, 20)## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <int>
## 1 25217 第一回 阮統制感舊梁山泊 張別駕激變石碣村 1
## 2 25217 甲馬營中香孩兒,志氣倜儻真雄姿。殿前點檢作天子,陳橋兵變回京師。~ 1
## 3 25217 黃袍加身御海宇,五代紛爭從此止。功臣杯酒釋兵權,神武不殺古無比。可惜~ 1
## 4 25217 時無輔弼臣,雜王雜霸治未馴。燭影斧聲千古疑,豈容再誤傷天倫。立未逾年~ 1
## 5 25217 改號蚤,金滕誓約為故草。秦王貶黜尺布謠,德昭德芳俱橫夭。豎儒倡議欲南~ 1
## 6 25217 遷,宗社岌岌烽火連。御蓋過河呼萬歲,南兄北弟始兩全。澶淵之役作孤注,~ 1
## 7 25217 乾坤再造功無二。朝中不拔眼中釘,雷陽枯竹沾新淚。聖人特降赤腳仙,深仁~ 1
## 8 25217 厚澤四十年。南街笑似黃河清,樞使夜奪崑崙天。青苗法行繫安石,鄭俠繪圖~ 1
## 9 25217 傷國脈。天津橋上子規啼,半山堂內無籌畫。首揆幸有涑水公,市夫傭販皆融~ 1
## 10 25217 融。軍中韓范驚破膽,金蓮送歸內翰營。元祐黨人何所負,竄逐誅夷皆准奏。~ 1
## 11 25217 日射晚霞金世界,竟成詩讖為北狩。崔君泥馬渡九哥,六宮能唱杭州歌。二聖~ 1
## 12 25217 環且丟腦後,將軍憤死呼渡河。朱仙鎮上蟣生冑,痛飲黃龍志未售。風波亭內~ 1
## 13 25217 碧血凝,甘心屈膝微臣構。天道昭昭不可移,神器重歸藝祖裔。侍奉兩宮孝莫~ 1
## 14 25217 倫,茸母生時雪窖悲。十里荷花三秋桂,立馬吳山勢崩潰。濰淮之捷出書生,~ 1
## 15 25217 干戈禍定天應悔。炙手可熱握大權,侍郎充犬吠籬邊。空談性命成何濟,謝金~ 1
## 16 25217 函首玉津園。半閉堂中鬥蟋蟀,襄陽五年圍不撤。樓台燈火葛嶺西,湖上平章~ 1
## 17 25217 宴未歇。破竹迎降水逆流,東南半壁誰能留。可憐無寸乾淨地,開花結子在棉~ 1
## 18 25217 州。<U+81EF>亭山下嘶萬馬,孤兒寡婦何為者。錢塘江上潮不來,朝臣盡立降旗下。~ 1
## 19 25217 零仃洋裡歎零仃,空扶幼主在翔興。甲子門中大星隕,趙氏塊肉浮沙汀。小樓~ 1
## 20 25217 三年在燕市,成仁就義真國士。黃冠故鄉不可期,大宋正統才絕此。六陵冬青~ 1
下載下來的書已經完成斷句
jieba_tokenizer <- worker(user="shuihuzhuan.txt", stop_word = "stop_words.txt")jieba_tokenizer <- worker(user="shuihuzhuan.txt", stop_word = "stop_words.txt")
# 設定斷詞function
water_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}tokens <- water %>% unnest_tokens(word, text, token=water_tokenizer)
str(tokens)## tibble [116,017 x 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:116017] 25217 25217 25217 25217 25217 25217 25217 25217 25217 25217 ...
## $ chapter : int [1:116017] 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr [1:116017] "第一回" "阮" "統制" "感舊" ...
head(tokens, 20)## # A tibble: 20 x 3
## gutenberg_id chapter word
## <int> <int> <chr>
## 1 25217 1 第一回
## 2 25217 1 阮
## 3 25217 1 統制
## 4 25217 1 感舊
## 5 25217 1 梁山泊
## 6 25217 1
## 7 25217 1 張
## 8 25217 1 別
## 9 25217 1 駕
## 10 25217 1 激變
## 11 25217 1 石碣村
## 12 25217 1
## 13 25217 1
## 14 25217 1 甲馬
## 15 25217 1 營中
## 16 25217 1 香
## 17 25217 1 孩兒
## 18 25217 1 志氣
## 19 25217 1 倜儻
## 20 25217 1 真
tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))head(tokens_count, 20)## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 李俊 338
## 2 燕青 298
## 3 楊林 240
## 4 樂和 230
## 5 一個 228
## 6 兩個 221
## 7 說道 219
## 8 李應 208
## 9 銀子 191
## 10 哪裡 189
## 11 不得 162
## 12 呼延 156
## 13 呼延灼 148
## 14 杜興 143
## 15 阮小七 143
## 16 只是 139
## 17 這裡 139
## 18 安道全 138
## 19 戴宗 136
## 20 穆春 121
tokens_count %>% wordcloud2()
依文字雲結果來看,水滸後傳為以李俊為主角的故事。並且從圖中可以發現“燕青”與“李俊”被提及的次數不相上下,經調查後發現,燕青於水滸候傳的故事當中扮演著智多星的角色,參與了抵抗金軍入侵的戰役,在後面的章節中燕青還在海外輔佐李俊創立基業,也為書中主角之一。同時,從文字雲中可以看到「兄弟」、「兵馬」、「領兵」、「文武」、「軍士」等字詞,進而可以推測水滸後傳的內容主要與英雄好漢或軍事起義有關。相較於紅樓夢的文字雲,從水滸後傳的字詞更能看出故事主軸。
以句子數量來計算
plot <-
bind_rows(
water %>%
group_by(chapter) %>%
summarise(count = n(), type="sentences"),
tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")) %>%
group_by(type)%>%
ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
geom_line() +
ggtitle("各章節的句子總數") +
xlab("章節") +
ylab("句子數量") +
theme(text = element_text(family = "Heiti TC Light"))
plot## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
# 將李俊的等同詞保留成變數
lijun_alias = c("李俊", "天壽星","混龍江")
# 計算李俊在整本書中出現的頻率及比例
lijun_count = tokens %>%
filter(nchar(.$word)>1) %>% # 保留非空白資料
mutate(total = nrow(tokens)) %>% # 加入一個total (資料總列數)
group_by(word) %>%
filter(word %in% lijun_alias) %>% # 保留李俊的等同詞
summarise(count = n(), total = mean(total)) %>% # 計算群組內的數量;mean 函式無意義,僅用來讓total 出現在表格
mutate(proportion = count / total) %>% # 加入比例欄位
arrange(desc(count)) # 根據count欄位,由大至小排列
head(lijun_count, 20)## # A tibble: 1 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 李俊 338 116017 0.00291
lijun_plot = tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% lijun_alias) %>%
group_by(chapter) %>%
summarise(count = n()) %>%
ggplot(aes(x = chapter, y=count)) +
geom_col() +
ggtitle("各章節的李俊出現總數") +
xlab("章節") +
ylab("李俊數量") +
theme(text = element_text(family = "Heiti TC Light"))
lijun_plot## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
於第9回時,李俊出現的次數最高。仔細去看水滸後傳的第9回故事內容,我們發現李俊在那一章節初次登場,於是作者用了大量筆墨來描寫其背景。並且敘述了李俊在征方臘后,沒有隨宋江等進京謀官,而與童威、童猛去投奔江南豪傑費保等人,在太湖消夏灣蓋造房屋打魚。而遇上了當地惡霸巴山蛇與常州知府相勾結,將大半個太湖占為己有,漁船打得的魚需繳一半。他恨巴山蛇“奪了眾百姓的飯碗”,與眾弟兄將其巡邏收魚的船隻撞翻。然而,巴山蛇與知府串通,借元宵放燈,將他與費保等捉拿下獄,但最後還是幸得樂和等救出的故事。
frequency <- tokens %>% mutate(part = ifelse(chapter<=20, "First 20", "Last 20")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last 20`)
frequency## # A tibble: 22,813 x 4
## word `First 20` part proportion
## <chr> <dbl> <chr> <dbl>
## 1 <U+6527>下馬 0.0000275 Last 20 NA
## 2 <U+7240>上 0.000248 Last 20 0.0000859
## 3 <U+7240>底 0.000193 Last 20 NA
## 4 <U+7240>沿 0.0000275 Last 20 NA
## 5 <U+7240>鋪 0.0000275 Last 20 NA
## 6 <U+7240>頭 NA Last 20 0.0000286
## 7 <U+7240>邊 0.0000275 Last 20 NA
## 8 <U+817C>顏 NA Last 20 0.0000286
## 9 <U+85C1>葬 NA Last 20 0.0000286
## 10 丁自 0.0000275 Last 20 NA
## # ... with 22,803 more rows
ggplot(frequency, aes(x = proportion, y = `First 20`, color = abs(`First 20` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 20", x = "Last 20")## Warning: Removed 17760 rows containing missing values (geom_point).
## Warning: Removed 17761 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
-燕青的座標偏右下角 → 說明他出場較晚,在後20回出現的比率較高(呼應了我們前面文字雲的敘述,因為在後期燕青都在海外輔佐李俊創立基業,故後面20回都有在寫他的故事)。
-根據前一張圖( 剛剛李俊在各章節出現的頻率之barplot ),李俊的座標位置說明由於整個水滸後傳是以他為主角,因此不管前後20回他出現的比率相當,並無太大的比重差異。