require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
require(htmlwidgets)
## Loading required package: htmlwidgets
## Warning: package 'htmlwidgets' was built under R version 4.0.4
require(webshot)
## Loading required package: webshot
## Warning: package 'webshot' was built under R version 4.0.4
jieba_tokenizer = worker()
west <- gutenberg_download(23962,mirror = "http://mirrors.xmission.com/gutenberg/") %>% filter(text!="") %>% distinct(gutenberg_id, text)
View(west)
west <- west %>%
mutate(chapter = cumsum(str_detect(west$text, regex("^第.*回( |$)"))))
str(west)
## tibble [23,328 x 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:23328] 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ text : chr [1:23328] "第一回 靈根育孕源流出 心性修持大道生" " 詩曰:" " 混沌未分天地亂,茫茫渺渺無人見。" " 自從盤古破鴻濛,開闢從茲清濁辨。" ...
## $ chapter : int [1:23328] 0 0 0 0 0 0 0 0 0 0 ...
head(west,20)
## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <int>
## 1 23962 第一回 靈根育孕源流出 心性修持大道生 0
## 2 23962 詩曰: 0
## 3 23962 混沌未分天地亂,茫茫渺渺無人見。 0
## 4 23962 自從盤古破鴻濛,開闢從茲清濁辨。 0
## 5 23962 覆載群生仰至仁,發明萬物皆成善。 0
## 6 23962 欲知造化會元功,須看西遊釋厄傳。 0
## 7 23962 蓋聞天地之數,有十二萬九千六百歲為一元。將一元分為十二會,乃子、丑、寅~ 0
## 8 23962 、卯、辰、巳、午、未、申、酉、戌、亥之十二支也。每會該一萬八百歲。且就~ 0
## 9 23962 一日而論:子時得陽氣,而丑則雞鳴﹔寅不通光,而卯則日出﹔辰時食後,而巳~ 0
## 10 23962 則挨排﹔日午天中,而未則西蹉﹔申時晡,而日落酉,戌黃昏,而人定亥。譬於~ 0
## 11 23962 大數,若到戌會之終,則天地昏曚而萬物否矣。再去五千四百歲,交亥會之初,~ 0
## 12 23962 則當黑暗,而兩間人物俱無矣,故曰混沌。又五千四百歲,亥會將終,貞下起元~ 0
## 13 23962 ,近子之會,而復逐漸開明。邵康節曰::「冬至子之半,天心無改移。一陽初~ 0
## 14 23962 動處,萬物未生時。」到此,天始有根。再五千四百歲,正當子會,輕清上騰,~ 0
## 15 23962 有日,有月,有星,有辰。日、月、星、辰,謂之四象。故曰,天開於子。又經~ 0
## 16 23962 五千四百歲,子會將終,近丑之會,而逐漸堅實。《易》曰:「大哉乾元!至哉~ 0
## 17 23962 坤元!萬物資生,乃順承天。」至此,地始凝結。再五千四百歲,正當丑會,重~ 0
## 18 23962 濁下凝,有水,有火,有山,有石,有土。水、火、山、石、土,謂之五形。故~ 0
## 19 23962 曰,地闢於丑。又經五千四百歲,丑會終而寅會之初,發生萬物。曆曰:「天氣~ 0
## 20 23962 下降,地氣上升﹔天地交合,群物皆生。」至此,天清地爽,陰陽交合。再五千~ 0
jieba_tokenizer <- worker(user="west_dict.dict")
stop_words <- c("怎麼","我們","不是","兩個","一個","如何","這個","原來")
West_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
tokens <- west %>% unnest_tokens(word, text, token=West_tokenizer)
str(tokens)
## tibble [376,922 x 3] (S3: tbl_df/tbl/data.frame)
## $ gutenberg_id: int [1:376922] 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ chapter : int [1:376922] 0 0 0 0 0 0 0 0 0 0 ...
## $ word : chr [1:376922] "第一回" "靈根育孕" "源流" "出" ...
tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
a <- head(tokens_count, 50)
head(tokens_count,50) %>% wordcloud2(shape = "star")
head(tokens_count,50) %>% wordcloud2()
my_graph <- wordcloud2(a, size = 1.5)
saveWidget(my_graph, "temp.html", selfcontained = F)
webshot("temp.html", "wc1.png", delay = 5, vwidth = 2000, vheight = 2000)
##2. 所有文字出現的次數跑點圖(其實應該要跑bar 的,但是有問題出不來所以改用point),可以發現其實大部分的詞都不超過一千。
bar1 <- ggplot(tokens_count,aes(x = word,y = sum)) + geom_point()
bar1
bar2 <- ggplot(tokens_count,aes(x = word,y = sum)) + geom_point() + ylim(0,1000)
bar2
## Warning: Removed 5 rows containing missing values (geom_point).