Load the required packages
library(gutenbergr)
library(jiebaR)
library(stringr)
library(dplyr)
library(tidyr)
library(wordcloud2)
library(tidytext)
library(ggplot2)
Set up the jiebaR tokenizer and load a custom dictionary for Romance of the Three Kingdoms (三國演義)
jieba_tokenizer <- worker(user = "three_kingdoms_lexicon.traditional.dict")
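Before tokenizing the whole novel, it can help to confirm that the worker and the custom dictionary load correctly by segmenting a short sample phrase (the phrase below is only an illustrative example, not taken from the text):
# Sanity check: segment a sample phrase (illustrative only)
segment("劉備、關羽、張飛桃園結義", jieba_tokenizer)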
Download the text of Romance of the Three Kingdoms from Project Gutenberg
three_kingdoms <- gutenberg_download(23950) %>%
filter(text != "") %>%
distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
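Since gutenberg_download() hits the network every time the document is rebuilt, the downloaded text can optionally be cached locally; the file name below is only an example, not part of the original workflow.
# Optional: cache the downloaded text so later sessions can skip the network call
# ("three_kingdoms.rds" is an illustrative file name)
saveRDS(three_kingdoms, "three_kingdoms.rds")
# three_kingdoms <- readRDS("three_kingdoms.rds")  # reload in a later session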
Add a chapter column
three_kingdoms <- three_kingdoms %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回:"))))
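As a quick check on the chapter regex, the number of detected chapter headings can be inspected; the standard edition of the novel has 120 chapters, though the exact count here depends on how the Gutenberg text formats its headings.
# How many chapter headings did the regex "^第.*回:" match?
three_kingdoms %>% summarise(n_chapters = max(chapter))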
Chinese tokenization function
chi_tokenizer <- function(t) {
  # Segment each string with jiebaR and keep only tokens longer than one character
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens) > 1]
    return(tokens)
  })
}
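chi_tokenizer() returns a list with one character vector per input string, with single-character tokens already removed; the sentence below is only an illustrative test.
# Example: tokenize one sentence (single-character tokens are dropped inside the function)
chi_tokenizer("話說天下大勢，分久必合，合久必分")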
Tokenize the text
tokens <- three_kingdoms %>%
unnest_tokens(word, text, token = chi_tokenizer)
Count how often each word appears; single-character words are excluded, and only words that occur more than 10 times are kept
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 10) %>%
  arrange(desc(sum))
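wordcloud2() expects a data frame whose first column contains the words and whose second column contains the frequencies, which matches the shape of tokens_count; a quick preview:
# Preview the frequency table that feeds wordcloud2 (word in column 1, count in column 2)
head(tokens_count, 10)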
Word cloud of Romance of the Three Kingdoms
wordcloud2(tokens_count)
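wordcloud2() returns an htmlwidget, so the interactive word cloud can also be written to a standalone HTML file; the output file name below is only an example.
# Optional: save the interactive word cloud as a standalone HTML page
# ("three_kingdoms_wordcloud.html" is an illustrative file name)
htmlwidgets::saveWidget(wordcloud2(tokens_count), "three_kingdoms_wordcloud.html", selfcontained = TRUE)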
Sentence and word counts per chapter - line chart
Sentence count per chapter
chapter_sentences <- three_kingdoms %>%
  group_by(chapter) %>%
  summarise(count = n(), type = "sentences")
Word count per chapter
chapter_words <- tokens %>%
  group_by(chapter) %>%
  summarise(count = n(), type = "words")
Plot
bind_rows(chapter_sentences, chapter_words) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line() +
  ggtitle("各章節的句子與詞彙總數") +
  xlab("章節") +
  ylab("數量") +
  theme(text = element_text(family = "Heiti TC Light"))
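Because each chapter contains far more words than sentences, the two lines sit on very different scales; an optional variation (not part of the original analysis) facets the two series with independent y axes:
# Optional variation: facet sentence and word counts with independent y scales
bind_rows(chapter_sentences, chapter_words) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line() +
  facet_wrap(~ type, scales = "free_y") +
  xlab("章節") +
  ylab("數量") +
  theme(text = element_text(family = "Heiti TC Light"))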
