Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)require(dplyr)## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
Gutenberg free eBooks
Also, various chinese books can be found in the link below:
###使用默認參數初始化一個斷詞引
jieba_tokenizer = worker()west <- gutenbergr::gutenberg_download(23962, mirror = "http://mirrors.xmission.com/gutenberg/")View(west)west1 <- west %>% slice(1:16) %>% mutate(chapter = 0 )
west<- west %>% slice(17:3208)
west <- west %>%
mutate(chapter = cumsum(str_detect(west$text, regex("^第.*回( |$)"))))
# rbind 透過 row 合併
# cbind 透過 column 合併
west<-rbind(west1,west)# 列出資料內每個欄位的狀態
str(west)## tibble [3,208 x 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:3208] 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ text : chr [1:3208] "第一回 靈根育孕源流出 心性修持大道生" "" "" " 詩曰:" ...
## $ chapter : num [1:3208] 0 0 0 0 0 0 0 0 0 0 ...
head(west, 20)## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <dbl>
## 1 23962 "第一回 靈根育孕源流出 心性修持大道生" 0
## 2 23962 "" 0
## 3 23962 "" 0
## 4 23962 " 詩曰:" 0
## 5 23962 " 混沌未分天地亂,茫茫渺渺無人見。" 0
## 6 23962 " 自從盤古破鴻濛,開闢從茲清濁辨。" 0
## 7 23962 " 覆載群生仰至仁,發明萬物皆成善。" 0
## 8 23962 " 欲知造化會元功,須看西遊釋厄傳。" 0
## 9 23962 "" 0
## 10 23962 "" 0
## 11 23962 "蓋聞天地之數,有十二萬九千六百歲為一元。將一元分為十二會,乃子、丑、寅"~ 0
## 12 23962 "、卯、辰、巳、午、未、申、酉、戌、亥之十二支也。每會該一萬八百歲。且就"~ 0
## 13 23962 "一日而論:子時得陽氣,而丑則雞鳴﹔寅不通光,而卯則日出﹔辰時食後,而巳"~ 0
## 14 23962 "則挨排﹔日午天中,而未則西蹉﹔申時晡,而日落酉,戌黃昏,而人定亥。譬於"~ 0
## 15 23962 "大數,若到戌會之終,則天地昏曚而萬物否矣。再去五千四百歲,交亥會之初,"~ 0
## 16 23962 "則當黑暗,而兩間人物俱無矣,故曰混沌。又五千四百歲,亥會將終,貞下起元"~ 0
## 17 23962 ",近子之會,而復逐漸開明。邵康節曰::「冬至子之半,天心無改移。一陽初"~ 0
## 18 23962 "動處,萬物未生時。」到此,天始有根。再五千四百歲,正當子會,輕清上騰,"~ 0
## 19 23962 "有日,有月,有星,有辰。日、月、星、辰,謂之四象。故曰,天開於子。又經"~ 0
## 20 23962 "五千四百歲,子會將終,近丑之會,而逐漸堅實。《易》曰:「大哉乾元!至哉"~ 0
jieba_tokenizer <- worker(user="west.txt", stop_word = "stop_words.txt")# 設定斷詞function
west_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}tokens <- west %>% unnest_tokens(word, text, token=west_tokenizer)
str(tokens)## tibble [43,347 x 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:43347] 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ chapter : num [1:43347] 0 0 0 0 0 0 0 0 0 0 ...
## $ word : chr [1:43347] "第一回" "靈根育孕" "源流" "出" ...
head(tokens, 20)## # A tibble: 20 x 3
## gutenberg_id chapter word
## <int> <dbl> <chr>
## 1 23962 0 第一回
## 2 23962 0 靈根育孕
## 3 23962 0 源流
## 4 23962 0 出
## 5 23962 0
## 6 23962 0 心性
## 7 23962 0 修持
## 8 23962 0 大道
## 9 23962 0 生
## 10 23962 0
## 11 23962 0
## 12 23962 0 詩
## 13 23962 0 曰
## 14 23962 0
## 15 23962 0
## 16 23962 0
## 17 23962 0
## 18 23962 0 混沌
## 19 23962 0 未分
## 20 23962 0 天地
tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))印出最常見的20個詞彙
head(tokens_count, 20)## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 大聖 142
## 2 悟空 142
## 3 菩薩 126
## 4 太宗 103
## 5 猴王 94
## 6 一個 92
## 7 龍王 68
## 8 玉帝 66
## 9 祖師 65
## 10 陛下 63
## 11 不知 60
## 12 只見 60
## 13 大王 55
## 14 玄奘 54
## 15 唐王 53
## 16 如何 50
## 17 那裏 47
## 18 如來 46
## 19 甚麼 46
## 20 兩個 45
tokens_count %>% wordcloud2()plot <-
bind_rows(
west %>%
group_by(chapter) %>%
summarise(count = n(), type="sentences"),
tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")) %>%
group_by(type)%>%
ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
geom_line() +
ggtitle("各章節的句子總數") +
xlab("章節") +
ylab("句子數量") #+
#theme(text = element_text(family = "Heiti TC Light"))
plot## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?