Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","knitr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
require(dplyr)
require(ggplot2)
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
require(wordcloud2)
require(tidytext)
# 下載繁體聖經 繁體和合本
bible <- fread("./cut/booksx_2.txt", encoding = "UTF-8",fill=TRUE)
str(bible)
## Classes 'data.table' and 'data.frame': 31238 obs. of 5 variables:
## $ book : chr "=001" "Ge" "Ge" "Ge" ...
## $ chapter : chr "Genesis" "1:1" "1:2" "1:3" ...
## $ ch_book : chr "-" "創世紀" "創世紀" "創世紀" ...
## $ ch_chapter: chr "創世紀" "1:1" "1:2" "1:3" ...
## $ text : chr "" "起初 神創造天地。" "地是空虛混沌.淵面黑暗. 神的靈運行在水面上。" " 神說、要有光、就有了光。" ...
## - attr(*, ".internal.selfref")=<externalptr>
針對舊約,新約聖經每一卷進行編碼
bible_1 <- bible %>%
mutate(bookcode = cumsum(str_detect(bible$book,regex("^="))))
bible_2 <- bible %>%
mutate(bookcode = cumsum(str_detect(bible$book,regex("^=[0-1][0-9]{2}")))) %>% select (-book,-chapter)
#格式是=第1碼是0或1,0:舊約, 1:新約, 第2-3碼為流水號
str(bible_2)
## Classes 'data.table' and 'data.frame': 31238 obs. of 4 variables:
## $ ch_book : chr "-" "創世紀" "創世紀" "創世紀" ...
## $ ch_chapter: chr "創世紀" "1:1" "1:2" "1:3" ...
## $ text : chr "" "起初 神創造天地。" "地是空虛混沌.淵面黑暗. 神的靈運行在水面上。" " 神說、要有光、就有了光。" ...
## $ bookcode : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
根據上方整理出來的規則,我們可以使用正規表示式,將句子區分新舊約
head(bible_2,10)
## ch_book ch_chapter
## 1: - 創世紀
## 2: 創世紀 1:1
## 3: 創世紀 1:2
## 4: 創世紀 1:3
## 5: 創世紀 1:4
## 6: 創世紀 1:5
## 7: 創世紀 1:6
## 8: 創世紀 1:7
## 9: 創世紀 1:8
## 10: 創世紀 1:9
## text bookcode
## 1: 1
## 2: 起初 神創造天地。 1
## 3: 地是空虛混沌.淵面黑暗. 神的靈運行在水面上。 1
## 4: 神說、要有光、就有了光。 1
## 5: 神看光是好的、就把光暗分開了。 1
## 6: 神稱光為晝、稱暗為夜.有晚上、有早晨、這是頭一日。 1
## 7: 神說、諸水之間要有空氣、將水分為上下。 1
## 8: 神就造出空氣、將空氣以下的水、空氣以上的水分開了.事就這樣成了。 1
## 9: 神稱空氣為天.有晚上、有早晨、是第二日。 1
## 10: 神說、天下的水要聚在一處、使旱地露出來.事就這樣成了。 1
下載下來的書已經完成斷句
bible_jieba_tokenizer <- worker(user="bible_lexicon.tradictional_2.txt", stop_word = "bible_stop_words.txt")
# 設定斷詞function
bible_jieba_tokenizer <- worker(user="bible_lexicon.tradictional_2.txt", stop_word = "bible_stop_words.txt")
bible_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, bible_jieba_tokenizer)
return(tokens)
})
}
bible_tokens <- bible_2 %>% unnest_tokens(word, text, token=bible_tokenizer)
str(bible_tokens)
## Classes 'data.table' and 'data.frame': 545995 obs. of 4 variables:
## $ ch_book : chr "創世紀" "創世紀" "創世紀" "創世紀" ...
## $ ch_chapter: chr "1:1" "1:1" "1:1" "1:1" ...
## $ bookcode : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "起初" " " "神" "創造" ...
## - attr(*, ".internal.selfref")=<externalptr>
head(bible_tokens, 20)
## ch_book ch_chapter bookcode word
## 1: 創世紀 1:1 1 起初
## 2: 創世紀 1:1 1
## 3: 創世紀 1:1 1 神
## 4: 創世紀 1:1 1 創造
## 5: 創世紀 1:1 1 天地
## 6: 創世紀 1:2 1 地是
## 7: 創世紀 1:2 1 空虛
## 8: 創世紀 1:2 1 混沌
## 9: 創世紀 1:2 1 淵
## 10: 創世紀 1:2 1 面
## 11: 創世紀 1:2 1 黑暗
## 12: 創世紀 1:2 1
## 13: 創世紀 1:2 1 神
## 14: 創世紀 1:2 1 的
## 15: 創世紀 1:2 1 靈
## 16: 創世紀 1:2 1 運行
## 17: 創世紀 1:2 1 在
## 18: 創世紀 1:2 1 水面
## 19: 創世紀 1:2 1 上
## 20: 創世紀 1:3 1
#2個字以上的token
bible_tokens_count_2 <- bible_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>100) %>% #數量>100
arrange(desc(sum))
#"神"-4213 "主"-1028 的頻率高 另外加入
bible_tokens_count_1<- filter(bible_tokens,word %in% c("主","神") )%>%
group_by(word) %>%
summarise(sum = n()) %>%
arrange(desc(sum))
# 進行combine
bible_tokens_count <- rbind(bible_tokens_count_2,bible_tokens_count_1) %>%
arrange(desc(sum))
bible_tokens_count %>% wordcloud2()
head(bible_tokens_count, 30)
## # A tibble: 30 x 2
## word sum
## <chr> <int>
## 1 耶和華 6980
## 2 神 4213
## 3 以色列 2704
## 4 兒子 2398
## 5 耶穌 1496
## 6 大衛 1164
## 7 知道 1078
## 8 主 1028
## 9 猶大 1017
## 10 摩西 870
## # ... with 20 more rows
詩篇是聖經中的歌集與禱文,由不同作者在不同時間所寫 有頌讚,敬拜上帝,有祁求幫助,保護;有求赦罪,感謝神恩,有向仇敵報復的禱詞,探索其詞彙的特性 -“慈愛”, “讚美”的詞頻高,顯示該卷為頌讚為主
bible_tokens_count_Psalms <- bible_tokens %>%
filter(nchar(.$word)>1, .$bookcode=="19" ) %>% #詩篇bookcode=19
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>20) %>%
mutate(ch_bookname="詩篇") %>%
arrange(desc(sum))
head(bible_tokens_count_Psalms, 30)
## # A tibble: 30 x 3
## word sum ch_bookname
## <chr> <int> <chr>
## 1 耶和華 754 詩篇
## 2 永遠 129 詩篇
## 3 慈愛 122 詩篇
## 4 讚美 105 詩篇
## 5 惡人 93 詩篇
## 6 大衛 90 詩篇
## 7 公義 83 詩篇
## 8 仇敵 81 詩篇
## 9 脫離 72 詩篇
## 10 以色列 65 詩篇
## # ... with 20 more rows
#詩篇的文字雲
bible_tokens_count_Psalms %>% wordcloud2()
因為很多學者同意《路加福音》的執筆者就是《使徒行傳》的執筆者 比較二卷用詞的詞頻
bible_tokens_count_Luka <- bible_tokens %>%
filter(nchar(.$word)>1, .$bookcode=="42" ) %>% #路加福音bookcode=42
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>20) %>%
# mutate(ch_bookname="路加福音") %>%
arrange(desc(sum))
bible_tokens_count_Act <- bible_tokens %>%
filter(nchar(.$word)>1, .$bookcode=="44" ) %>% #使徒行傳bookcode=44
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>20) %>%
# mutate(ch_bookname="使徒行傳") %>%
arrange(desc(sum))
bible_tokens_count_Luka_Act <- rbind(bible_tokens_count_Luka %>% mutate(ch_bookname="路加福音"), bible_tokens_count_Act %>% mutate(ch_bookname="使徒行傳"))
plot_merge <- bible_tokens_count_Luka_Act %>%
group_by(ch_bookname) %>%
top_n(20, sum) %>%
ungroup() %>%
mutate(date = as.factor(ch_bookname),
word = reorder_within(word, sum, date)) %>%
ggplot(aes(x=word, y=sum, fill = ch_bookname)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "詞頻") +
facet_wrap(~date, ncol = 1, scales="free") +
coord_flip()+
scale_x_reordered() +
theme(text = element_text(family = "Heiti TC Light"))
plot_merge
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## 各卷的句子及詞彙長度 1.舊約共三十九卷,新約共二十七卷 2.以新約bookcode為40開始,標示紅線
plot <-
bind_rows(
bible_2 %>%
group_by(bookcode) %>%
summarise(count = n(), type="sentences"),
bible_tokens %>%
group_by(bookcode) %>%
summarise(count = n(), type="words")) %>%
group_by(type)%>%
ggplot(aes(x = bookcode, y=count, fill="type", color=factor(type))) +
geom_line() +
geom_vline(xintercept = as.numeric("40"), col='red', size = 1) +
ggtitle("各卷句子總數") +
xlab("卷") +
ylab("句子數量") +
theme(text = element_text(family = "Heiti TC Light"))
plot
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
bible_frequency <- bible_tokens %>% mutate(part = ifelse(bookcode<40, "Old Testament", "New Testament")) %>%
filter(nchar(.$word)>1) %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `New Testament`)
bible_frequency
## # A tibble: 25,653 x 4
## word `Old Testament` part proportion
## <chr> <dbl> <chr> <dbl>
## 1 一一 0.0000329 New Testament 0.0000743
## 2 一七 0.0000110 New Testament NA
## 3 一人 0.000362 New Testament 0.000372
## 4 一人入 NA New Testament 0.0000186
## 5 一人必 0.00000548 New Testament NA
## 6 一人作 0.00000548 New Testament NA
## 7 一人呢 0.00000548 New Testament NA
## 8 一人的心 0.00000548 New Testament NA
## 9 一人能 NA New Testament 0.0000372
## 10 一人將 0.00000548 New Testament NA
## # ... with 25,643 more rows
ggplot(bible_frequency, aes(x = proportion, y = `Old Testament`, color = abs(`Old Testament` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "#D55E00", high = "gray75") +
theme(legend.position="none") +
labs(y = "Old Testament", x = "New Testament")
## Warning: Removed 20060 rows containing missing values (geom_point).
## Warning: Removed 20060 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database