Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): the
## operating system reports that the locale "zh_TW.UTF-8" cannot be set
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
The text comes from Project Gutenberg's free eBooks.
Various Chinese books can also be found at the link below:
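For readers who want to locate other Chinese-language titles programmatically, here is a minimal sketch using gutenbergr's catalog helpers (the language code "zh" and the column names follow gutenberg_metadata):

# Sketch: list Chinese-language works in the Project Gutenberg catalog.
# gutenberg_works() filters gutenberg_metadata to works with text available.
zh_books <- gutenberg_works(languages = "zh")
head(zh_books %>% select(gutenberg_id, title))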
### Initialize a word-segmentation engine with default parameters
jieba_tokenizer = worker()

flower <- gutenbergr::gutenberg_download(23938, mirror = "http://mirrors.xmission.com/gutenberg/")
View(flower)

# Treat the first 16 lines (front matter before the chapter headings) as chapter 0
flower1 <- flower %>% slice(1:16) %>% mutate(chapter = 0)
flower <- flower %>% slice(17:3208)
flower <- flower %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回( |$)"))))
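A quick illustrative check of the chapter-heading pattern (the sample strings below are hypothetical): it matches lines that begin with 第 and contain 回 followed by a space or end of line.

# Two chapter-heading-like lines and one ordinary sentence.
str_detect(c("第一回 木蘭出世", "第十二回", "回家之後再說"), regex("^第.*回( |$)"))
# expected: TRUE TRUE FALSE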
# rbind combines data by row
# cbind combines data by column
flower <- rbind(flower1, flower)

# Show the structure of each column in the data
str(flower)
## tibble [3,208 x 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:3208] 23938 23938 23938 23938 23938 23938 23938 23938 23938 23938 ...
## $ text : chr [1:3208] "序" "" "嘗思人道之大,莫大於倫常;學問之精,莫精於性命。自有書籍以來,所載傳人不少," "求其交盡乎倫常者鮮矣,求其交盡乎性命者益鮮矣。蓋倫常之地,或盡孝而不必兼忠," ...
## $ chapter : num [1:3208] 0 0 0 0 0 0 0 0 0 0 ...
head(flower, 20)
## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <dbl>
## 1 23938 "序" 0
## 2 23938 "" 0
## 3 23938 "嘗思人道之大,莫大於倫常;學問之精,莫精於性命。自有書籍以來,所載傳人不少,"~ 0
## 4 23938 "求其交盡乎倫常者鮮矣,求其交盡乎性命者益鮮矣。蓋倫常之地,或盡孝而不必兼忠,"~ 0
## 5 23938 "或盡忠而不必兼孝,或盡忠孝而安常處順,不必兼勇烈。遭際未極其變,即倫常未盡其"~ 0
## 6 23938 "難也。性命之理,有不悟性根者,有不知命蒂者,有修性命而旁歧雜出者,有修性命而"~ 0
## 7 23938 "後先倒置者。涵養未得其中,即性命未盡其奧也。乃木蘭一女子耳,擔荷倫常,研求性"~ 0
## 8 23938 "命,而獨無所不盡也哉!" 0
## 9 23938 "" 0
## 10 23938 " 予幼讀《木蘭詩》,觀其代父從軍,可謂孝矣;立功絕塞,可謂忠矣。後閱《唐書"~ 0
## 11 23938 "》,言木蘭唐女,西陵人,嫻弓馬,諳韜略,轉戰沙漠,累大功十二,何其勇也。封武"~ 0
## 12 23938 "昭將軍,凱旋還里。當時筮者謂致亂必由武姓,讒臣嫁禍武昭,詔徵至京。木蘭具表陳"~ 0
## 13 23938 "情,掣劍剜心,示使者,目視而死。死後,位證雷部大神,何其烈也。去冬閱《木蘭奇"~ 0
## 14 23938 "女傳》,復知其幼而領悟者性命也,長而行持者性命也。且通部議論極精微,極顯豁,"~ 0
## 15 23938 "又無非性命之妙諦也。盡人所當盡,亦盡人所難盡。惟其無所不盡,則亦無所不奇。而"~ 0
## 16 23938 "人奇,行奇,事奇,文奇,讀者莫不驚奇叫絕也。此書相傳為奎斗馬祖所演,卷首有武"~ 0
## 17 23938 "聖帝序。今序已失,同人集貲付梓。書成,爰敘其緣起如此。" 0
## 18 23938 "" 0
## 19 23938 " 書於滬江梅花書館南窗之下" 0
## 20 23938 "" 0
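The next worker reads a stop-word list from stop_words.txt, a file the post never shows. A minimal sketch of creating such a file, assuming jiebaR's plain-text format of one word per line (the word list here is hypothetical):

# Hypothetical stop-word list: one word per line, saved as UTF-8.
con <- file("stop_words.txt", open = "w", encoding = "UTF-8")
writeLines(c("的", "了", "是", "在", "和"), con)
close(con)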
jieba_tokenizer <- worker(user = "", stop_word = "stop_words.txt")

# Define the tokenization function
flower_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}

tokens <- flower %>% unnest_tokens(word, text, token = flower_tokenizer)
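As a quick sanity check, the tokenizer can be run on a single sentence; the exact tokens returned depend on the jiebaR dictionary in use.

# Segment one sample sentence; returns a list with one character vector.
flower_tokenizer("木蘭代父從軍,可謂孝矣")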
str(tokens)
## tibble [43,593 x 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:43593] 23938 23938 23938 23938 23938 23938 23938 23938 23938 23938 ...
## $ chapter : num [1:43593] 0 0 0 0 0 0 0 0 0 0 ...
## $ word : chr [1:43593] "序" "嘗思人" "道" "之" ...
head(tokens, 20)
## # A tibble: 20 x 3
## gutenberg_id chapter word
## <int> <dbl> <chr>
## 1 23938 0 序
## 2 23938 0 嘗思人
## 3 23938 0 道
## 4 23938 0 之
## 5 23938 0 大
## 6 23938 0 莫
## 7 23938 0 大於
## 8 23938 0 倫常
## 9 23938 0 學問
## 10 23938 0 之精
## 11 23938 0 莫
## 12 23938 0 精於
## 13 23938 0 性命
## 14 23938 0 自有
## 15 23938 0 書籍
## 16 23938 0 以來
## 17 23938 0 所載
## 18 23938 0 傳人
## 19 23938 0 不少
## 20 23938 0 求其
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 10) %>%
  arrange(desc(sum))

Print the 20 most frequent words:
head(tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 木蘭 315
## 2 李靖 249
## 3 尉遲恭 177
## 4 元帥 143
## 5 公子 94
## 6 先生 74
## 7 唐兵 72
## 8 天祿 71
## 9 如此 65
## 10 將軍 63
## 11 次日 62
## 12 軍士 60
## 13 突厥 58
## 14 一個 57
## 15 一日 55
## 16 二人 55
## 17 如何 54
## 18 今日 52
## 19 不知 49
## 20 卻說 48
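The same counting can be written more compactly with dplyr::count(); this sketch is equivalent to the group_by()/summarise() pipeline above:

# count() groups, tallies, and sorts in one step.
tokens %>%
  filter(nchar(word) > 1) %>%
  count(word, sort = TRUE, name = "sum") %>%
  filter(sum > 10)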
tokens_count %>% wordcloud2()

# Plot sentence counts and word counts per chapter on one chart.
# Note: the original mapped a quoted string (fill = "type"), which creates a
# constant aesthetic and triggered ggplot2's "each group consists of only one
# observation" warning; mapping color/group to the type column fixes this.
plot <-
  bind_rows(
    flower %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "sentences"),
    tokens %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "words")) %>%
  ggplot(aes(x = chapter, y = count, color = type, group = type)) +
  geom_line() +
  ggtitle("各章節的句子總數") +  # total counts per chapter
  xlab("章節") +                 # chapter
  ylab("句子數量") #+            # count
  #theme(text = element_text(family = "Heiti TC Light"))
plot
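As a possible next step (a sketch, not part of the original analysis), tidytext's bind_tf_idf() can surface the words that characterize individual chapters:

# tf-idf weights words that are frequent in one chapter but rare in others.
chapter_tfidf <- tokens %>%
  filter(nchar(word) > 1, chapter > 0) %>%
  count(chapter, word) %>%
  bind_tf_idf(word, chapter, n) %>%
  arrange(desc(tf_idf))
head(chapter_tfidf, 10)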