Abstract
Combining the jiebaR and tidytext packages to process Chinese novels from Project Gutenberg.
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 3.4.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.4.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.4.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 3.4.4
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.4
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(scales)
## Warning: package 'scales' was built under R version 3.4.4
# 下載 "三國演義" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
three <- gutenberg_download(23950) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Warning: package 'bindrcpp' was built under R version 3.4.4
doc = paste0(three$text, collapse = "") # concatenate all text fields into one string
docVector = unlist(strsplit(doc, "[。.]"), use.names = FALSE) # split into sentences on full-width or half-width periods
three = data.frame(gutenberg_id = "23950", text = docVector, stringsAsFactors = FALSE) # replace gutenberg_id with your own book's id
View(three)
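As a quick sanity check, the pattern "[。.]" is a character class matching either a full-width or a half-width period; a minimal sketch on a made-up string (not taken from the book):
test <- "話說天下大勢。分久必合,合久必分."
unlist(strsplit(test, "[。.]"), use.names = FALSE) # returns "話說天下大勢" and "分久必合,合久必分"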
# After sentence segmentation
head(three, 20)
## gutenberg_id
## 1 23950
## 2 23950
## 3 23950
## 4 23950
## 5 23950
## 6 23950
## 7 23950
## 8 23950
## 9 23950
## 10 23950
## 11 23950
## 12 23950
## 13 23950
## 14 23950
## 15 23950
## 16 23950
## 17 23950
## 18 23950
## 19 23950
## 20 23950
## text
## 1 第一回:宴桃園豪傑三結義,斬黃巾英雄首立功 詞曰: 滾滾長江東逝水,浪花淘盡英雄
## 2 是非成敗轉頭空:青山依舊在,幾度夕陽紅
## 3 白髮漁樵江渚上,慣看秋月春風
## 4 一壺濁酒喜相逢:古今多少事,都付笑談中
## 5 話說天下大勢,分久必合,合久必分:周末七國分爭,并入於秦
## 6 及秦滅之後,楚、漢分爭,又并入於漢
## 7 漢朝自高祖斬白蛇而起義,一統天下
## 8 後來光武中興,傳至獻帝,遂分為三國
## 9 推其致亂之由,殆始於桓、靈二帝
## 10 桓帝禁錮善類,崇信宦官
## 11 及桓帝崩,靈帝即位,大將軍竇武、太傅陳蕃,共相輔佐
## 12 時有宦官曹節等弄權,竇武、陳蕃謀誅之,作事不密,反為所害
## 13 中涓自此愈橫
## 14 建寧二年四月望日,帝御溫德殿
## 15 方陞座,殿角狂風驟起,只見一條大青蛇,從梁上飛將下來,蟠於椅上
## 16 帝驚倒,左右急救入宮,百官俱奔避
## 17 須臾,蛇不見了
## 18 忽然大雷大雨,加以冰雹,落到半夜方止,壞卻房屋無數
## 19 建寧四年二月,洛陽地震;又海水泛溢,沿海居民,盡被大浪捲入海中
## 20 光和元年,雌雞化雄
Inspecting the data, we can see that each chapter of 三國演義 opens with a title of the form「第X回:」.
The one variation is that some chapters are marked simply with「第X回」.
# Based on the rule above, we can use a regular expression to assign each sentence to its chapter
three <- three %>%
  mutate(chapter = cumsum(str_detect(text, regex("第.*回:"))))
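To see why this yields chapter numbers: str_detect() is TRUE only on title sentences, and cumsum() turns those hits into a running counter. A minimal sketch on toy sentences (invented for illustration):
x <- c("第一回:宴桃園豪傑三結義", "某句", "第二回:另一個標題", "另一句")
cumsum(str_detect(x, regex("第.*回:")))
## [1] 1 1 2 2
Every sentence up to the next title line inherits the current chapter number; any sentences before the first title would get chapter 0.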
# Use the custom dictionary of 三國演義 proper nouns
jieba_tokenizer <- worker(user="three_kingdoms.traditional.dict", stop_word = "stop_words.txt")
# Define the tokenizer function
three_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
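A quick spot check of the tokenizer; the exact segmentation depends on three_kingdoms.traditional.dict and stop_words.txt, which are assumed to sit in the working directory:
three_tokenizer("玄德與關羽、張飛結義") # e.g. list(c("玄德", "關羽", "張飛", "結義")) if 與 is a stop word; the user dictionary keeps proper nouns intact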
tokens <- three %>% unnest_tokens(word, text, token=three_tokenizer)
str(tokens)
## 'data.frame': 236502 obs. of 3 variables:
## $ gutenberg_id: chr "23950" "23950" "23950" "23950" ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "宴桃園豪傑三結義" "斬黃巾英雄首立功" "詞曰" ...
head(tokens, 20)
## gutenberg_id chapter word
## 1 23950 1 第一回
## 1.1 23950 1 宴桃園豪傑三結義
## 1.2 23950 1 斬黃巾英雄首立功
## 1.3 23950 1 詞曰
## 1.4 23950 1 滾滾
## 1.5 23950 1 長
## 1.6 23950 1 江東
## 1.7 23950 1 逝水
## 1.8 23950 1 浪花
## 1.9 23950 1 淘盡
## 1.10 23950 1 英雄
## 2 23950 1 是非成敗
## 2.1 23950 1 轉頭
## 2.2 23950 1 空
## 2.3 23950 1 青山
## 2.4 23950 1 依舊
## 2.5 23950 1 夕陽紅
## 3 23950 1 白髮
## 3.1 23950 1 漁樵
## 3.2 23950 1 江渚上
# Count how often each word appears, excluding single-character words
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 20) %>%
  arrange(desc(sum))
# Print the 20 most common words
head(tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 玄德 1779
## 2 孔明 1644
## 3 曹操 922
## 4 將軍 725
## 5 卻說 644
## 6 丞相 533
## 7 關公 501
## 8 二人 460
## 9 雲長 430
## 10 不可 425
## 11 荊州 408
## 12 張飛 364
## 13 引兵 359
## 14 呂布 355
## 15 商議 341
## 16 軍士 321
## 17 魏延 321
## 18 主公 319
## 19 大喜 310
## 20 孫權 309
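For reference, the same tally can be written more compactly with dplyr::count(), which is shorthand for the group_by()/summarise() pair above:
tokens %>%
  filter(nchar(word) > 1) %>%
  count(word, sort = TRUE) %>% # occurrences per word, sorted descending
  rename(sum = n) %>%
  filter(sum > 20)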
tokens_count %>% wordcloud2()
plot <-
  bind_rows(
    three %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "sentences"),
    tokens %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "words")) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line() +
  ggtitle("各章節的句子與詞彙總數") +
  xlab("章節") +
  ylab("數量") #+
  #theme(text = element_text(family = "Heiti TC Light"))
plot
# Download 三國志 and apply the same cleaning steps
true <- gutenberg_download(25606) %>% filter(text != "") %>% distinct(gutenberg_id, text)
#View(true)
doc <- paste0(true$text, collapse = "") # concatenate all text fields into one string
docVector = unlist(strsplit(doc, "[。.]"), use.names = FALSE) # split into sentences on full-width or half-width periods
true <- data.frame(gutenberg_id = "25606", text = docVector, stringsAsFactors = FALSE) # replace gutenberg_id with your own book's id
# Take the three most frequent words and collect every token that contains each of them
top20 <- head(tokens_count, 20)
top3 <- head(top20, 3)
top1_tokens <- subset(tokens, str_detect(word, top3$word[1]))
top2_tokens <- subset(tokens, str_detect(word, top3$word[2]))
View(top2_tokens)
top3_tokens <- subset(tokens, str_detect(word, top3$word[3]))
binds <- bind_rows(
top1_tokens %>%
group_by(chapter) %>%
summarise(count = n(), name=top3$word[1]),
top2_tokens %>%
group_by(chapter) %>%
summarise(count = n(), name=top3$word[2]),
top3_tokens %>%
group_by(chapter) %>%
summarise(count = n(), name=top3$word[3]))
top3_plot <-
  binds %>%
  ggplot(aes(x = chapter, y = count, color = name)) +
  geom_line() +
  ggtitle("最常出現的3個詞彙其在各章節的數量") +
  xlab("章節") +
  ylab("數量")
top3_plot
bro1_tokens <- subset(tokens, str_detect(tokens$word, "玄德"))
bro2_tokens <- subset(tokens, str_detect(tokens$word, "關羽"))
bro3_tokens <- subset(tokens, str_detect(tokens$word, "張飛"))
binds_bro <- bind_rows(
bro1_tokens %>%
group_by(chapter) %>%
summarise(count = n(), name="玄德"),
bro2_tokens %>%
group_by(chapter) %>%
summarise(count = n(), name="關羽"),
bro3_tokens %>%
group_by(chapter) %>%
summarise(count = n(), name="張飛"))
bro3_plot <-
  binds_bro %>%
  ggplot(aes(x = chapter, y = count, color = name)) +
  geom_line() +
  geom_point() +
  ggtitle("桃園三結義在各章節的數量") +
  xlab("章節") +
  ylab("數量")
bro3_plot
# Use the custom dictionary of 三國志 proper nouns
jieba_tokenizer <- worker(user="true_three.traditional.dict", stop_word = "stop_words.txt")
# Define the tokenizer function
book_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
true_tokens <- true %>% unnest_tokens(word, text, token=book_tokenizer)
str(true_tokens)
## 'data.frame': 164633 obs. of 2 variables:
## $ gutenberg_id: chr "25606" "25606" "25606" "25606" ...
## $ word : chr "魏書" "武帝紀" "第一" "太祖" ...
head(true_tokens, 50)
## gutenberg_id word
## 1 25606 魏書
## 1.1 25606 武帝紀
## 1.2 25606 第一
## 1.3 25606 太祖
## 1.4 25606 武皇帝
## 1.5 25606 沛國譙人也
## 1.6 25606 姓曹
## 1.7 25606 諱操
## 1.8 25606 字孟德
## 1.9 25606 漢相國參之後
## 2 25606 曹
## 2.1 25606 瞞
## 2.2 25606 曰
## 2.3 25606 太祖一名吉利
## 2.4 25606 小字阿瞞
## 3 25606 王沈魏書曰
## 3.1 25606 其先出於黃帝
## 4 25606 當高陽世
## 4.1 25606 陸終之子曰安
## 4.2 25606 是為曹姓
## 5 25606 周武王克殷
## 5.1 25606 存先世之後
## 5.2 25606 封曹俠於邾
## 6 25606 春秋之世
## 6.1 25606 與於盟會
## 6.2 25606 逮至戰國
## 6.3 25606 為楚所滅
## 7 25606 子孫分流
## 7.1 25606 或家於沛
## 8 25606 漢高祖之起
## 8.1 25606 曹參以功封平陽侯
## 8.2 25606 世襲爵土
## 8.3 25606 絕而
## 8.4 25606 複
## 8.5 25606 紹
## 8.6 25606 至今適嗣國於容城
## 9 25606 桓帝
## 9.1 25606 世
## 9.2 25606 曹騰為中常侍大長秋
## 9.3 25606 封費亭侯
## 10 25606 司馬彪續漢書曰
## 10.1 25606 騰父節
## 10.2 25606 字元偉
## 10.3 25606 素以仁厚稱
## 11 25606 鄰人有亡豕者
## 11.1 25606 與節豕相類
## 11.2 25606 詣門認之
## 11.3 25606 節不與爭
## 11.4 25606 後所亡豕自還其家
## 11.5 25606 豕主人大慚
# Count how often each word appears, excluding single-character words
true_tokens_count <- true_tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 10) %>%
  arrange(desc(sum))
# Print the 50 most common words
head(true_tokens_count, 50)
## # A tibble: 50 x 2
## word sum
## <chr> <int>
## 1 將軍 845
## 2 太祖 609
## 3 太守 415
## 4 天下 382
## 5 陛下 204
## 6 不可 203
## 7 大將軍 202
## 8 天子 192
## 9 春秋 161
## 10 以為 155
## # ... with 40 more rows
#true_tokens_count %>% wordcloud2()
# 下載 "水滸傳" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
water <- gutenberg_download(23863) %>% filter(text!="") %>% distinct(gutenberg_id, text)
doc = paste0(water$text, collapse = "") # concatenate all text fields into one string
docVector = unlist(strsplit(doc, "[。.]"), use.names = FALSE) # split into sentences on full-width or half-width periods
water = data.frame(gutenberg_id = "23863", text = docVector, stringsAsFactors = FALSE) # replace gutenberg_id with your own book's id
View(water)
# Use the custom dictionary of 水滸傳 proper nouns
jieba_tokenizer <- worker(user="water_margin.traditional.dict", stop_word = "stop_words.txt")
# Define the tokenizer function
water_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
# Assign sentences to chapters with a regular expression; 水滸傳 titles start the line with「第X回」(no colon), so the pattern differs slightly
water <- water %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.{1,3}回 ?"))))
water_tokens <- water %>% unnest_tokens(word, text, token=water_tokenizer)
str(water_tokens)
## 'data.frame': 194187 obs. of 3 variables:
## $ gutenberg_id: chr "23863" "23863" "23863" "23863" ...
## $ chapter : int 0 0 0 0 0 0 0 0 0 0 ...
## $ word : chr "楔子" "張天師" "祈" "禳" ...
head(water_tokens, 20)
## gutenberg_id chapter word
## 1 23863 0 楔子
## 1.1 23863 0 張天師
## 1.2 23863 0 祈
## 1.3 23863 0 禳
## 1.4 23863 0 瘟疫
## 1.5 23863 0 洪太尉
## 1.6 23863 0 誤走
## 1.7 23863 0 妖魔
## 1.8 23863 0 紛紛
## 1.9 23863 0 五代
## 1.10 23863 0 亂
## 1.11 23863 0 離間
## 1.12 23863 0 雲開
## 1.13 23863 0 復
## 1.14 23863 0 見天
## 1.15 23863 0 草木
## 1.16 23863 0 百年
## 1.17 23863 0 新
## 1.18 23863 0 雨露
## 1.19 23863 0 車書
# Count how often each word appears, excluding single-character words
water_tokens_count <- water_tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 20) %>%
  arrange(desc(sum))
# Print the 20 most common words
head(water_tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 宋江 1884
## 2 兩個 1260
## 3 一個 983
## 4 武松 943
## 5 只見 667
## 6 哥哥 616
## 7 林沖 611
## 8 李逵 603
## 9 說道 538
## 10 頭領 478
## 11 小人 449
## 12 兄弟 426
## 13 衆人 425
## 14 婦人 400
## 15 戴宗 395
## 16 吳用 385
## 17 今日 366
## 18 好漢 348
## 19 便是 337
## 20 問道 321
#water_tokens_count %>% wordcloud2()
Compare how often each word appears, as a proportion of the full text, in 三國演義 versus 水滸傳
bind_tokens <- rbind(tokens, water_tokens)
frequency <- bind_tokens %>% select(-chapter) %>%
  mutate(name = ifelse(gutenberg_id == 23950, "three", "water")) %>%
  filter(nchar(word) > 1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>% # keep only the non-digit, non-Latin part of each token
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>% # drop tokens that begin with a Chinese numeral
  count(name, word) %>%
  group_by(name) %>%
  mutate(proportion = n / sum(n)) %>% # each word's share of its book's tokens
  select(-n) %>%
  spread(name, proportion)
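The two str_extract() calls are easy to misread; on a toy vector (illustrative values only) they behave like this:
w <- c("玄德", "abc12", "三人")
str_extract(w, "[^0-9a-z']+")               # "玄德", NA, "三人" -- tokens made only of digits/Latin letters become NA
str_extract(w, "^[^一二三四五六七八九十]+") # "玄德", "abc12", NA -- tokens starting with a Chinese numeral become NA
Words that become NA here, together with words appearing in only one of the two books, end up as missing values that ggplot later reports as removed rows.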
Plot the comparison
plot1 <- ggplot(frequency, aes(x = `three`, y = `water`, color = abs(`three` - `water`))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "water", x = "three")
Compare the word proportions of the last 40 chapters of 三國演義 against 水滸傳
last40_three_tokens <- filter(tokens, chapter > 80) # 三國演義 has 120 chapters, so this keeps the last 40
bind_last40_tokens <- rbind(last40_three_tokens, water_tokens)
last40_frequency <- bind_last40_tokens %>% select(-chapter) %>%
  mutate(name = ifelse(gutenberg_id == 23950, "three", "water")) %>%
  filter(nchar(word) > 1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(name, word) %>%
  group_by(name) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(name, proportion)
plot2 <- ggplot(last40_frequency, aes(x = `three`, y = `water`, color = abs(`three` - `water`))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "water", x = "three_last40")
# par(mfrow = ...) has no effect on ggplot2 output, so the plots are simply printed one at a time
plot1
## Warning: Removed 49752 rows containing missing values (geom_point).
## Warning: Removed 49753 rows containing missing values (geom_text).
plot2
## Warning: Removed 34143 rows containing missing values (geom_point).
## Warning: Removed 34144 rows containing missing values (geom_text).
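To attach a number to the visual impression, one could correlate the two proportion columns; a minimal sketch, not part of the original analysis:
cor.test(~ three + water, data = frequency)        # full-text comparison
cor.test(~ three + water, data = last40_frequency) # last-40-chapter comparison
A higher correlation would indicate that the two books share more of their word-usage profile; rows where a word appears in only one book are dropped automatically.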