Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/en_US.UTF-8"
rm(list=ls(all=T))
# 整理
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(tmcn) # 簡轉繁
## # tmcn Version: 0.2-13
library(readr)
# 繪圖
library(wordcloud2)
library(ggridges)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(ggplot2)
library(forcats)
# 下載 "三國演義" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
three_kingdom <- gutenberg_download(23950,mirror = "http://mirrors.xmission.com/gutenberg/") %>%
filter(text!="") %>%
distinct(gutenberg_id, text)
三國演義每章節的開頭都會有“第X回”的詞,但有些僅以“第X回”表示,有些則在“第X回”後面加上章節名稱,EX.「第一回:宴桃園豪傑三結義,斬黃巾英雄首立功」
# 按章節整理
three_kingdom <- three_kingdom %>%
mutate(chapter = cumsum(str_detect(three_kingdom$text, regex("^第.{1,3}回"))))
three_kingdom_doc = paste0(three_kingdom$text,collapse = "")
# 若要將一個字串分割成多個字串,可以使用 strsplit 函數,第一個參數是輸入的字串,而第二個參數則是分隔字串:
three_kingdom_docVector = unlist(strsplit(three_kingdom_doc ,"[。.?!」「]"), use.names=FALSE)
head(three_kingdom_docVector)
## [1] "第一回:宴桃園豪傑三結義,斬黃巾英雄首立功 詞曰: 滾滾長江東逝水,浪花淘盡英雄"
## [2] "是非成敗轉頭空:青山依舊在,幾度夕陽紅"
## [3] "白髮漁樵江渚上,慣看秋月春風"
## [4] "一壺濁酒喜相逢:古今多少事,都付笑談中"
## [5] " 話說天下大勢,分久必合,合久必分:周末七國分爭,并入於秦"
## [6] "及秦滅之後,楚、漢分爭,又并入於漢"
three_kingdom_doc <- as.data.frame(three_kingdom_docVector)
three_kingdom_doc$id <- 23950
three_kingdom_doc <- three_kingdom_doc %>%
transmute(id =id, text = three_kingdom_docVector)
three_kingdom_doc <- filter(three_kingdom_doc,text!= "")
head(three_kingdom_doc)
## id
## 1 23950
## 2 23950
## 3 23950
## 4 23950
## 5 23950
## 6 23950
## text
## 1 第一回:宴桃園豪傑三結義,斬黃巾英雄首立功 詞曰: 滾滾長江東逝水,浪花淘盡英雄
## 2 是非成敗轉頭空:青山依舊在,幾度夕陽紅
## 3 白髮漁樵江渚上,慣看秋月春風
## 4 一壺濁酒喜相逢:古今多少事,都付笑談中
## 5 話說天下大勢,分久必合,合久必分:周末七國分爭,并入於秦
## 6 及秦滅之後,楚、漢分爭,又并入於漢
根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
three_kingdom_doc <- three_kingdom_doc %>%
mutate(chapter = cumsum(str_detect(three_kingdom_doc$text, regex("^第.{1,3}回"))))
下載下來的書已經完成斷句
# 停用詞簡轉繁
three_kingdom_dict <- read_csv("three_kingdom.text",col_names =F )
## Parsed with column specification:
## cols(
## X1 = col_character()
## )
three_kingdom_dict <- toTrad(three_kingdom_dict$X1)
write.table(three_kingdom_dict,file="three_kingdom_dict.txt",sep=",",row.names = F,col.names = F,fileEncoding="UTF-8")
jieba_tokenizer <- worker(user="three_kingdom_dict.txt", stop_word = "stop_words.txt")
# 設定斷詞function
three_kingdom_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
# 斷詞
three_kingdom_doc$text <- as.character(three_kingdom_doc$text)
tokens <- three_kingdom_doc %>%
unnest_tokens(word, text, token=three_kingdom_tokenizer)
str(tokens)
## 'data.frame': 287947 obs. of 3 variables:
## $ id : num 23950 23950 23950 23950 23950 ...
## $ chapter: int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "宴" "桃園" "豪傑" ...
head(tokens)
## id chapter word
## 1 23950 1 第一回
## 1.1 23950 1 宴
## 1.2 23950 1 桃園
## 1.3 23950 1 豪傑
## 1.4 23950 1 三
## 1.5 23950 1 結義
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>%
arrange(desc(sum))
head(tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 曹操 932
## 2 孔明 843
## 3 將軍 776
## 4 卻說 642
## 5 玄德 519
## 6 關公 507
## 7 丞相 482
## 8 二人 458
## 9 不可 426
## 10 荊州 418
## 11 孔明曰 386
## 12 玄德曰 386
## 13 不能 384
## 14 如此 372
## 15 張飛 349
## 16 如何 344
## 17 商議 341
## 18 主公 327
## 19 軍士 320
## 20 操曰 307
# 用出現次數大於200的繪圖為文字雲
tokens_count %>%
filter(sum>200)%>%
wordcloud2()
# 劉禪的部分
liuchan = tokens %>%
filter(word=="劉禪"|word=="公嗣")%>%
group_by(chapter) %>%
summarize(sum = n()) %>%
mutate(name = "劉禪(公嗣)",total = sum(sum))
adou = tokens %>%
filter(word== "阿斗")%>%
group_by(chapter) %>%
summarize(sum = n())%>%
mutate(name = "阿斗",total = sum(sum))
c = bind_rows(liuchan,adou)
c %>% ggplot(aes(x=chapter,y =sum,color = name))+geom_line()+xlab("chapter")+ylab("出現次數")+theme(text = element_text(family='STHeiti'))
d = c[,3:4] %>% distinct()
d[1,1] = "劉禪"
d[3,1] = "公嗣"
d[3,2] = 0
d %>% ggplot(aes(x= name , y = total,fill = name))+geom_bar(stat="identity")+theme(text = element_text(family='STHeiti'))
其他人物大都都有較為正式的稱呼,但是「劉禪」字「公嗣」 加起來沒「阿斗」,以此可見「劉禪」不配擁有姓名
# "'劉備', '諸葛亮', '曹操', '關雲長', '趙雲', '司馬懿', '周瑜', '張飛', '孫權'"
zhuge_tokens <- tokens_count %>%
filter(word %in% c('諸葛亮','孔明曰','孔明笑', '孔明'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "諸葛亮")
liubei_tokens <- tokens_count %>%
filter(word %in% c('劉備', '玄德', '玄德曰', '主公', '劉皇叔', '劉玄德','玄德大', '玄德問', '劉豫州', '玄德聞', '玄德乃'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "劉備")
caoca_tokens <- tokens_count %>%
filter(word %in% c('曹操', '丞相', '操大怒', '曹丞相', '操大喜'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "曹操")
guanyu_tokens <- tokens_count %>%
filter(word %in% c('關雲長', '雲長', '關公'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "關羽")
zhangfei_tokens <- tokens_count %>%
filter(word %in% c('張飛', '翼德'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "張飛")
zhaoyun_tokens <- tokens_count %>%
filter(word %in% c('趙雲', '子龍', '趙子龍'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "趙雲")
simayi_tokens <- tokens_count %>%
filter(word %in% c('司馬懿', '司馬', '仲達'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "司馬懿")
zhouyu_tokens <- tokens_count %>%
filter(word %in% c('公瑾', '周瑜'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "周瑜")
sunquan_tokens <- tokens_count %>%
filter(word %in% c('仲謀', '孫權'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "孫權")
# 全書重要人物出現的「總次數」
allname <- rbind(zhuge_tokens,liubei_tokens,caoca_tokens,guanyu_tokens,
zhangfei_tokens,zhaoyun_tokens,simayi_tokens,zhouyu_tokens,sunquan_tokens)
head(allname,20)
## # A tibble: 9 x 2
## allsum name
## <int> <chr>
## 1 1432 諸葛亮
## 2 1763 劉備
## 3 1505 曹操
## 4 728 關羽
## 5 364 張飛
## 6 321 趙雲
## 7 290 司馬懿
## 8 250 周瑜
## 9 264 孫權
# Reorder following the value of another column:
plot <- allname %>%
mutate(name = fct_reorder(name, allsum)) %>%
ggplot( aes(x=name, y=allsum))+
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
coord_flip() +
xlab("") +
theme(text = element_text(family='STHeiti'))
plot
可以非常明顯的看出《三國演義》故事情節圍繞曹操、劉備、諸葛亮三大男主角展開。
# 各主要人物在「各個章節出現的次數」變化
# 以各個chapter和word來groupby
CW_tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(chapter,word) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>%
arrange(desc(sum))
# head(CW_tokens_count)
CW_zhuge_tokens <- CW_tokens_count %>%
filter(word %in% c('諸葛亮','孔明曰','孔明笑', '孔明'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "諸葛亮")
CW_liubei_tokens <- CW_tokens_count %>%
filter(word %in% c('劉備', '玄德', '玄德曰', '主公', '劉皇叔', '劉玄德','玄德大', '玄德問', '劉豫州', '玄德聞', '玄德乃'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "劉備")
CW_caoca_tokens <- CW_tokens_count %>%
filter(word %in% c('曹操', '丞相', '操大怒', '曹丞相', '操大喜','阿瞞'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "曹操")
CW_guanyu_tokens <- CW_tokens_count %>%
filter(word %in% c('關雲長', '雲長', '關公'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "關羽")
CW_zhangfei_tokens <- CW_tokens_count %>%
filter(word %in% c('張飛', '翼德'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "張飛")
CW_zhaoyun_tokens <- CW_tokens_count %>%
filter(word %in% c('趙雲', '子龍', '趙子龍'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "趙雲")
CW_simayi_tokens <- CW_tokens_count %>%
filter(word %in% c('司馬懿', '司馬', '仲達'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "司馬懿")
CW_zhouyu_tokens <- CW_tokens_count %>%
filter(word %in% c('公瑾', '周瑜'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "周瑜")
CW_sunquan_tokens <- CW_tokens_count %>%
filter(word %in% c('仲謀', '孫權'))%>%
summarise(allsum = sum(sum))%>%
mutate(name = "孫權")
CW_allname <- rbind(CW_zhuge_tokens,CW_liubei_tokens,CW_caoca_tokens,CW_guanyu_tokens,
CW_zhangfei_tokens,CW_zhaoyun_tokens,CW_simayi_tokens,CW_zhouyu_tokens,CW_sunquan_tokens)
head(CW_allname)
## # A tibble: 6 x 3
## chapter allsum name
## <int> <int> <chr>
## 1 34 20 諸葛亮
## 2 36 37 諸葛亮
## 3 40 48 諸葛亮
## 4 41 27 諸葛亮
## 5 42 20 諸葛亮
## 6 43 43 諸葛亮
top_plot <-
CW_allname %>%
group_by(name)%>%
ggplot(aes(x = chapter, y=allsum, fill="type", color=factor(name))) +
geom_line() +
ggtitle("各主其要人物在各章節的出現的數量") +
xlab("章節") +
ylab("數量") +
theme(text = element_text(family='STHeiti'))
top_plot
ttop_plot <- CW_allname %>%
group_by(name)%>%
ggplot(aes(x = chapter, y=name, fill=name)) +
geom_density_ridges(alpha=0.6, stat="binline", bins=20) +
theme_ridges() +
theme(
legend.position="none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8)
) +
xlab("") +
ylab("Assigned Probability (%)")+
theme(text = element_text(family='STHeiti'))
ttop_plot
我們可以透過上方的山脊圖,比較清楚的看出,各個人物活躍的章節和時期
three_smart_plot <- bind_rows(CW_zhuge_tokens, CW_simayi_tokens, CW_zhouyu_tokens) %>%
ggplot(aes(x = chapter, y=allsum, fill=name,color = name)) +
geom_point() +
geom_line() +
facet_wrap(~name, ncol = 1) +
ggtitle("智力型角色:「孔明」v.s.「司馬懿」v.s.「周瑜」") +
xlab("章節") +
ylab("出現次數") +
theme(text = element_text(family = "STHeiti"))
three_smart_plot
司馬懿的值得稱道的事跡有:在五丈原拖死了諸葛孔明,平亂孟達,公孫淵。他最大的貢獻自然是逐步掌握了曹魏的政權,讓他的孫子坐享其成,最後改朝換代, 所以也主要出現在比較後期的時候。
周瑜早年輔佐孫策平定江東,也是吳國開國立業的最大功臣。同時作為赤壁之戰的主帥確實是一手締造出了這三分局勢。而赤壁之戰就主要發生在四十回左右,也正式周瑜出場最多的時候
諸葛亮是劉備死後蜀漢的實際統治者,在內政軍事等方面都有所建,平定南方少數民族和北伐都鞏固了蜀漢政權,延長了三國的局面,是三國後期最具影響力的人物之一,所以在後期出現的也較多,也比較活躍。
plot <-
bind_rows(
three_kingdom %>%
group_by(chapter) %>%
summarise(count = n(), type="sentences"),
tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")) %>%
group_by(type)%>%
ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
geom_line() +
ggtitle("各章節的句子總數") +
xlab("章節") +
ylab("句子數量")+
theme(text = element_text(family='STHeiti'))
plot
> 第七十一回用了最多的句子和單字,這章就是在講定軍山之戰,也就是劉備稱霸成為漢中王前的一役。
frequency <- tokens %>% mutate(part = ifelse(chapter<=60, "First 60", "Last 60")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last 60`)
frequency
## # A tibble: 45,071 x 4
## word `First 60` part proportion
## <chr> <dbl> <chr> <dbl>
## 1 阿斗 0.000290 Last 60 0.0000129
## 2 阿房宮 NA Last 60 0.0000129
## 3 阿父 0.0000104 Last 60 NA
## 4 阿附 0.0000104 Last 60 0.0000387
## 5 阿會喃 NA Last 60 0.000180
## 6 阿瞞 0.0000207 Last 60 NA
## 7 阿鴦 NA Last 60 0.0000129
## 8 阿諛 0.0000104 Last 60 NA
## 9 阿之原 0.0000104 Last 60 NA
## 10 哀告 0.0000104 Last 60 0.0000644
## # … with 45,061 more rows
ggplot(frequency, aes(x = proportion, y = `First 60`, color = abs(`First 60` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 60", x = "Last 60")+
theme(text = element_text(family='Heiti TC Light'))
## Warning: Removed 35802 rows containing missing values (geom_point).
## Warning: Removed 35803 rows containing missing values (geom_text).