CH01:套件安裝及載入

系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""

安裝需要的packages

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","knitr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

載入packages

require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4

CH02 下載繁體聖經 繁體和合本

https://sites.google.com/site/downloadbibles/uniont

require(dplyr)
require(ggplot2)
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(scales)
require(wordcloud2)
require(tidytext)
# 下載繁體聖經 繁體和合本 
bible <- fread("./cut/booksx_2.txt", encoding = "UTF-8",fill=TRUE)
str(bible)
## Classes 'data.table' and 'data.frame':   31238 obs. of  5 variables:
##  $ book      : chr  "=001" "Ge" "Ge" "Ge" ...
##  $ chapter   : chr  "Genesis" "1:1" "1:2" "1:3" ...
##  $ ch_book   : chr  "-" "創世紀" "創世紀" "創世紀" ...
##  $ ch_chapter: chr  "創世紀" "1:1" "1:2" "1:3" ...
##  $ text      : chr  "" "起初 神創造天地。" "地是空虛混沌.淵面黑暗. 神的靈運行在水面上。" " 神說、要有光、就有了光。" ...
##  - attr(*, ".internal.selfref")=<externalptr>

針對舊約,新約聖經每一卷進行編碼

bible_1 <- bible %>% 
  mutate(bookcode = cumsum(str_detect(bible$book,regex("^="))))
bible_2 <- bible %>% 
  mutate(bookcode = cumsum(str_detect(bible$book,regex("^=[0-1][0-9]{2}")))) %>%   select (-book,-chapter)
  #格式是=第1碼是0或1,0:舊約, 1:新約, 第2-3碼為流水號 
str(bible_2)
## Classes 'data.table' and 'data.frame':   31238 obs. of  4 variables:
##  $ ch_book   : chr  "-" "創世紀" "創世紀" "創世紀" ...
##  $ ch_chapter: chr  "創世紀" "1:1" "1:2" "1:3" ...
##  $ text      : chr  "" "起初 神創造天地。" "地是空虛混沌.淵面黑暗. 神的靈運行在水面上。" " 神說、要有光、就有了光。" ...
##  $ bookcode  : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

根據上方整理出來的規則,我們可以使用正規表示式,將句子區分新舊約

head(bible_2,10)
##     ch_book ch_chapter
##  1:       -     創世紀
##  2:  創世紀        1:1
##  3:  創世紀        1:2
##  4:  創世紀        1:3
##  5:  創世紀        1:4
##  6:  創世紀        1:5
##  7:  創世紀        1:6
##  8:  創世紀        1:7
##  9:  創世紀        1:8
## 10:  創世紀        1:9
##                                                                   text bookcode
##  1:                                                                           1
##  2:                                                 起初 神創造天地。        1
##  3:                     地是空虛混沌.淵面黑暗. 神的靈運行在水面上。        1
##  4:                                          神說、要有光、就有了光。        1
##  5:                                    神看光是好的、就把光暗分開了。        1
##  6:                神稱光為晝、稱暗為夜.有晚上、有早晨、這是頭一日。        1
##  7:                            神說、諸水之間要有空氣、將水分為上下。        1
##  8:  神就造出空氣、將空氣以下的水、空氣以上的水分開了.事就這樣成了。        1
##  9:                          神稱空氣為天.有晚上、有早晨、是第二日。        1
## 10:            神說、天下的水要聚在一處、使旱地露出來.事就這樣成了。        1

下載下來的書已經完成斷句

自行定義聖經專有名詞字典-目前為先知的名字,及至狐狗搜尋基督教專有名詞

先知名字參考wiki 聖經中的先知列表 [先知列表] https://zh.wikipedia.org/wiki/%E5%9C%A3%E7%BB%8F%E4%B8%AD%E7%9A%84%E5%85%88%E7%9F%A5%E5%88%97%E8%A1%A8

bible_jieba_tokenizer <- worker(user="bible_lexicon.tradictional_2.txt", stop_word = "bible_stop_words.txt")

設定聖經斷詞

# 設定斷詞function
bible_jieba_tokenizer <- worker(user="bible_lexicon.tradictional_2.txt", stop_word = "bible_stop_words.txt")

bible_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, bible_jieba_tokenizer)
    return(tokens)
  })
}
bible_tokens <- bible_2  %>% unnest_tokens(word, text, token=bible_tokenizer)
str(bible_tokens)
## Classes 'data.table' and 'data.frame':   545995 obs. of  4 variables:
##  $ ch_book   : chr  "創世紀" "創世紀" "創世紀" "創世紀" ...
##  $ ch_chapter: chr  "1:1" "1:1" "1:1" "1:1" ...
##  $ bookcode  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word      : chr  "起初" " " "神" "創造" ...
##  - attr(*, ".internal.selfref")=<externalptr>
head(bible_tokens, 20)
##     ch_book ch_chapter bookcode word
##  1:  創世紀        1:1        1 起初
##  2:  創世紀        1:1        1    
##  3:  創世紀        1:1        1   神
##  4:  創世紀        1:1        1 創造
##  5:  創世紀        1:1        1 天地
##  6:  創世紀        1:2        1 地是
##  7:  創世紀        1:2        1 空虛
##  8:  創世紀        1:2        1 混沌
##  9:  創世紀        1:2        1   淵
## 10:  創世紀        1:2        1   面
## 11:  創世紀        1:2        1 黑暗
## 12:  創世紀        1:2        1    
## 13:  創世紀        1:2        1   神
## 14:  創世紀        1:2        1   的
## 15:  創世紀        1:2        1   靈
## 16:  創世紀        1:2        1 運行
## 17:  創世紀        1:2        1   在
## 18:  創世紀        1:2        1 水面
## 19:  創世紀        1:2        1   上
## 20:  創世紀        1:3        1    

CH03 圖

文字雲

計算詞彙的出現次數,如果詞彙只有一個字則不列入計算,但另加入

#2個字以上的token
bible_tokens_count_2 <- bible_tokens %>% 
  filter(nchar(.$word)>1) %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>100) %>% #數量>100 
  arrange(desc(sum))

#"神"-4213 "主"-1028 的頻率高 另外加入 
bible_tokens_count_1<- filter(bible_tokens,word %in% c("主","神") )%>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  arrange(desc(sum))

# 進行combine 
bible_tokens_count <- rbind(bible_tokens_count_2,bible_tokens_count_1)  %>% 
  arrange(desc(sum))
bible_tokens_count %>% wordcloud2() 

印出最常見的30個詞彙

head(bible_tokens_count, 30)
## # A tibble: 30 x 2
##    word     sum
##    <chr>  <int>
##  1 耶和華  6980
##  2 神      4213
##  3 以色列  2704
##  4 兒子    2398
##  5 耶穌    1496
##  6 大衛    1164
##  7 知道    1078
##  8 主      1028
##  9 猶大    1017
## 10 摩西     870
## # ... with 20 more rows

詩篇卷特性

詩篇是聖經中的歌集與禱文,由不同作者在不同時間所寫 有頌讚,敬拜上帝,有祁求幫助,保護;有求赦罪,感謝神恩,有向仇敵報復的禱詞,探索其詞彙的特性 -“慈愛”, “讚美”的詞頻高,顯示該卷為頌讚為主

bible_tokens_count_Psalms <- bible_tokens %>% 
  filter(nchar(.$word)>1, .$bookcode=="19" ) %>%  #詩篇bookcode=19 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>20) %>% 
  mutate(ch_bookname="詩篇") %>% 
  arrange(desc(sum)) 

head(bible_tokens_count_Psalms, 30)
## # A tibble: 30 x 3
##    word     sum ch_bookname
##    <chr>  <int> <chr>      
##  1 耶和華   754 詩篇       
##  2 永遠     129 詩篇       
##  3 慈愛     122 詩篇       
##  4 讚美     105 詩篇       
##  5 惡人      93 詩篇       
##  6 大衛      90 詩篇       
##  7 公義      83 詩篇       
##  8 仇敵      81 詩篇       
##  9 脫離      72 詩篇       
## 10 以色列    65 詩篇       
## # ... with 20 more rows
#詩篇的文字雲
bible_tokens_count_Psalms %>% wordcloud2() 

用長條圖比較《路加福音》及《使徒行傳》

因為很多學者同意《路加福音》的執筆者就是《使徒行傳》的執筆者 比較二卷用詞的詞頻

bible_tokens_count_Luka <- bible_tokens %>% 
  filter(nchar(.$word)>1, .$bookcode=="42" ) %>%  #路加福音bookcode=42 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>20) %>% 
  # mutate(ch_bookname="路加福音") %>% 
  arrange(desc(sum)) 

bible_tokens_count_Act <- bible_tokens %>% 
  filter(nchar(.$word)>1, .$bookcode=="44" ) %>%  #使徒行傳bookcode=44 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>20) %>% 
  # mutate(ch_bookname="使徒行傳") %>% 
  arrange(desc(sum)) 

bible_tokens_count_Luka_Act  <- rbind(bible_tokens_count_Luka %>% mutate(ch_bookname="路加福音"), bible_tokens_count_Act %>% mutate(ch_bookname="使徒行傳"))

plot_merge <- bible_tokens_count_Luka_Act %>% 
  group_by(ch_bookname) %>% 
  top_n(20, sum) %>% 
  ungroup() %>% 
  mutate(date = as.factor(ch_bookname),
         word = reorder_within(word, sum, date)) %>%
  ggplot(aes(x=word, y=sum, fill = ch_bookname)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "詞頻") +
  facet_wrap(~date, ncol = 1, scales="free") + 
  coord_flip()+
  scale_x_reordered() +
  theme(text = element_text(family = "Heiti TC Light"))

plot_merge
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## 各卷的句子及詞彙長度 1.舊約共三十九卷,新約共二十七卷 2.以新約bookcode為40開始,標示紅線

plot <- 
  bind_rows(
    bible_2 %>% 
      group_by(bookcode) %>% 
      summarise(count = n(), type="sentences"),
    bible_tokens %>% 
      group_by(bookcode) %>% 
      summarise(count = n(), type="words")) %>% 
  group_by(type)%>%
  ggplot(aes(x = bookcode, y=count, fill="type", color=factor(type))) +
  geom_line() + 
  geom_vline(xintercept = as.numeric("40"), col='red', size = 1) + 
  ggtitle("各卷句子總數") + 
  xlab("卷") + 
  ylab("句子數量") + 
  theme(text = element_text(family = "Heiti TC Light"))

plot
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

計算舊約和新約的詞彙在全文中出現比率的差異

  1. 耶和華出現在舊約比率遠高與新約,耶穌只出現在新約聖經, 新約以耶穌的降生為起點
  2. 門徒出現於舊約14次, 新約361次, 新約很大部分描述耶穌和門徒的故事
bible_frequency <- bible_tokens %>% mutate(part = ifelse(bookcode<40, "Old Testament", "New Testament")) %>%
  filter(nchar(.$word)>1) %>%
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>%
  gather(part, proportion, `New Testament`)
bible_frequency 
## # A tibble: 25,653 x 4
##    word     `Old Testament` part          proportion
##    <chr>              <dbl> <chr>              <dbl>
##  1 一一          0.0000329  New Testament  0.0000743
##  2 一七          0.0000110  New Testament NA        
##  3 一人          0.000362   New Testament  0.000372 
##  4 一人入       NA          New Testament  0.0000186
##  5 一人必        0.00000548 New Testament NA        
##  6 一人作        0.00000548 New Testament NA        
##  7 一人呢        0.00000548 New Testament NA        
##  8 一人的心      0.00000548 New Testament NA        
##  9 一人能       NA          New Testament  0.0000372
## 10 一人將        0.00000548 New Testament NA        
## # ... with 25,643 more rows

繪出詞彙出現在新舊約比例相對位置圖

ggplot(bible_frequency, aes(x = proportion, y = `Old Testament`, color = abs(`Old Testament` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) + 
  scale_color_gradient(limits = c(0, 0.001), low = "#D55E00", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "Old Testament", x = "New Testament")
## Warning: Removed 20060 rows containing missing values (geom_point).
## Warning: Removed 20060 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database