繁體版聖經合本情緒分析
林子紘 B074020021 彭璿祐 B064020029 徐明暇 D084020002 劉晉瑋 M094020006 洪玟君 M094020030 林永盛 M094020042 黃天原 M094020067
針對聖經新約進行情緒分析。本次主要針對以下方向分析:
1.聖經新約/舊約在正面和負面的討論內容各是甚麼? 2.正面和負面討論的情緒分數大約多少? 3.比較LIWC, NRC情緒字典所呈現出來的情緒分佈有何不同。 4.不同分類的聖經,情緒分佈有何不同? 5.不同作者之文字情緒分析
系統參數設定
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
安裝需要的packages
# echo = T,results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)讀進library
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
require(jiebaR)
library("data.table")
library(openxlsx)
library(colorspace)1.載入資料
df <- fread("./bible.txt", encoding = "UTF-8",fill=TRUE)2.前置處理(將聖經編碼,共66章)
bible <- df %>%
mutate(bookcode = cumsum(str_detect(df$book,regex("^=[0-1][0-9]{2}")))) %>%
select (-book,-chapter) %>%
#格式是=第1碼是0或1,0:舊約, 1:新約, 第2-3碼為流水號
filter(!(df$ch_book %in% c("-"))) #去除特殊的格式
str(bible)## Classes 'data.table' and 'data.frame': 31172 obs. of 4 variables:
## $ ch_book : chr "創世紀" "創世紀" "創世紀" "創世紀" ...
## $ ch_chapter: chr "1:1" "1:2" "1:3" "1:4" ...
## $ text : chr "起初 神創造天地。" "地是空虛混沌.淵面黑暗. 神的靈運行在水面上。" " 神說、要有光、就有了光。" " 神看光是好的、就把光暗分開了。" ...
## $ bookcode : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
3.文章斷詞 設定斷詞引擎—自定義聖經專有名詞字典(先知的名字)
# 加入自定義的字典
bible_jieba_tokenizer <- worker(user="./bible_lexicon.tradictional_2.txt", stop_word = "./bible_stop_words.txt")
#設定斷詞function
bible_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, bible_jieba_tokenizer)
return(tokens)
})
}
bible_tokens <- bible %>% unnest_tokens(word, text, token=bible_tokenizer)4.資料基本清理 + 去除特殊字元、詞頻太低的字
# 過濾特殊字元
bible_select = bible_tokens %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1 | .$word =="神" | .$word =="主" |.$word =="信"|.$word =="望"|.$word =="愛")
# 進行combine token
bible_tokens_all <- rbind(
bible_tokens %>%
filter(nchar(.$word)>1) %>% #2個字以上的token
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>3),
bible_tokens %>%
filter(.$word %in% c("主","神","信","望","愛") ) %>% #一個字的頻率也高,另外加入
group_by(word) %>%
summarise(sum = n())) %>%
arrange(desc(sum))
bible_tokens_all %>% wordcloud2() # 舊約token
bible_tokens_old <- rbind(
bible_tokens %>%
filter(.$bookcode<40) %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>3),
bible_tokens %>%
filter(.$bookcode<40) %>%
filter(.$word %in% c("主","神","信","望","愛") ) %>%
group_by(word) %>%
summarise(sum = n())) %>%
arrange(desc(sum))
##bible_tokens_old %>% wordcloud2() # 新約token
bible_tokens_new <- rbind(
bible_tokens %>%
filter(.$bookcode>=40) %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>3),
bible_tokens %>%
filter(.$bookcode>=40) %>%
filter(.$word %in% c("主","神","信","望","愛") ) %>%
group_by(word) %>%
summarise(sum = n())) %>%
arrange(desc(sum))
#bible_tokens_new %>% wordcloud2() 全名Linguistic Inquiry and Word Counts,由心理學家Pennebaker於2001出版 分為正向情緒與負向情緒
1.讀檔,字詞間以“,”將字分隔
P <- read_file("dict/liwc/positive.txt") # 正向字典txt檔
N <- read_file("dict/liwc/negative.txt") # 負向字典txt檔2.分割字詞,並將兩個情緒字典併在一起
# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
# 把兩個字典拼在一起
LIWC = rbind(P, N)
# 檢視字典
head(LIWC)## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
主要以章節作為探討
找出文集中,對於LIWC字典是positive和negative的字
sentiment_count = bible_select %>%
select(bookcode,word) %>%
inner_join(LIWC) %>%
group_by(bookcode,sentiment) %>%
summarise(count=n()) ## Joining, by = "word"
## `summarise()` has grouped output by 'bookcode'. You can override using the `.groups` argument.
畫出每個章節的情緒總分數,可以看到大概在第40章(新約聖經),情緒從負面為主轉為正面為主。
# 檢視資料的章節區間
range(sentiment_count$bookcode) #1~66## [1] 1 66
sentiment_count %>%
ggplot()+
geom_line(aes(x=bookcode,y=count,colour=sentiment))+
# 加上標示日期的線
geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count$bookcode == 40)
[1]])),colour = "red") 將情緒分數標準化後再畫一次圖,我們能發現在新約聖經中,正面情緒是大於負面情緒的,舊約的部分則是正負面情緒各半。
sentiment_count %>%
# 標準化
group_by(bookcode) %>%
mutate(ratio = count/sum(count)) %>%
# 畫圖的部分
ggplot()+
geom_line(aes(x=bookcode,y=ratio,colour=sentiment))+
# 加上標示日期的線
geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count$bookcode == 40)
[1]])),colour = "red") # 查看每章的情緒分數排名
sentiment_count %>%
select(count,bookcode) %>%
group_by(bookcode) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))## # A tibble: 66 x 2
## bookcode sum
## <int> <int>
## 1 19 1855
## 2 23 1074
## 3 24 983
## 4 20 962
## 5 26 698
## 6 18 588
## 7 42 437
## 8 3 390
## 9 5 389
## 10 14 377
## # … with 56 more rows
#第19章為正面情緒最高;第23章則為負面情緒最高#將資料存成data frame
sentiment_word_count = bible_tokens_all %>%
inner_join(LIWC) %>%
arrange(desc(sum)) %>%
data.frame() ### Joining, by = "word"
sentiment_word_old = bible_tokens_old %>%
inner_join(LIWC) %>%
arrange(desc(sum)) %>%
data.frame() ## Joining, by = "word"
sentiment_word_new = bible_tokens_new %>%
inner_join(LIWC) %>%
arrange(desc(sum)) %>%
data.frame() ## Joining, by = "word"
sentiment_word_count %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>% #重新排序word,
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()sentiment_word_old %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>% #重新排序word,
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()sentiment_word_new %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>% #重新排序word,
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()# sentiment_word_old %>%
# acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
# comparison.cloud(
# colors = c("salmon", "#72bcd4"), # negative positive
# max.words = 50)# sentiment_word_new %>%
# acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
# comparison.cloud(
# colors = c("salmon", "#72bcd4"), # negative positive
# max.words = 50) NRC <- read.xlsx("./dict/NRC.xlsx")
head(NRC)## word sentiment
## 1 一口 negative
## 2 一夫多妻制 negative
## 3 一巴掌 negative
## 4 一心一意 positive
## 5 一知半解 negative
## 6 一致 positive
sentiment_count_nrc = bible_select %>%
select(bookcode,word) %>%
inner_join(NRC) %>%
group_by(bookcode,sentiment) %>%
summarise(count=n()) ## Joining, by = "word"
## `summarise()` has grouped output by 'bookcode'. You can override using the `.groups` argument.
可以看出舊約負面情緒較高,新約正面情緒較高
# 檢視資料的章節區間
range(sentiment_count_nrc$bookcode) #1~66## [1] 1 66
sentiment_count_nrc %>%
ggplot()+
geom_line(aes(x=bookcode,y=count,colour=sentiment))+
# 加上標示日期的線
geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count_nrc$bookcode == 40)
[1]])),colour = "red") 將情緒標準化再畫一次圖,我們能發現在新約聖經中,正面情緒是大於負面情緒的,舊約的部分則是正負面情緒各半。
sentiment_count_nrc %>%
# 標準化
group_by(bookcode) %>%
mutate(ratio = count/sum(count)) %>%
# 畫圖的部分
ggplot()+
geom_line(aes(x=bookcode,y=ratio,colour=sentiment))+
# 加上標示日期的線
geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count_nrc$bookcode == 40)
[1]])),colour = "red") # 20章前與40章後的情緒較正面查看每章的情緒分數排名
sentiment_count_nrc %>%
group_by(bookcode,sentiment) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))## `summarise()` has grouped output by 'bookcode'. You can override using the `.groups` argument.
## # A tibble: 132 x 3
## # Groups: bookcode [66]
## bookcode sentiment sum
## <int> <chr> <int>
## 1 19 positive 2972
## 2 23 positive 1659
## 3 19 negative 1441
## 4 26 positive 1425
## 5 24 negative 1319
## 6 5 positive 1304
## 7 44 positive 1199
## 8 23 negative 1196
## 9 1 positive 1178
## 10 42 positive 1133
## # … with 122 more rows
#第19章為正面情緒最高;第63章則為負面情緒最高閱讀聖經時,我們是否就可以預期選讀的書卷其情緒傾向。
#找出前3名(1)正向>負向情緒的卷 (2)負向>正向情緒的卷
bible_book_chname <- df %>%
mutate(bookcode = cumsum(str_detect(df$book,regex("^=[0-1][0-9]{2}")))) %>%
filter((df$ch_book %in% c("-"))) %>%
select(bookcode,ch_chapter) %>%
mutate(ch_bookname=ch_chapter) %>%
select (-ch_chapter)
sentiment_book = sentiment_count %>%
group_by(bookcode) %>%
spread(sentiment,count,fill=0) %>%
mutate(s_score=positive-negative) %>%
mutate(booktype=case_when(positive>negative~"positive",TRUE~"negatvie")) %>%
mutate(diff=abs(s_score)) %>%
select(bookcode,booktype,diff) %>%
filter(diff>50) %>% #正負情緒詞頻差異超過50者再列入
left_join(bible_book_chname) %>%
arrange(desc(diff)) ## Joining, by = "bookcode"
sentiment_book %>%
top_n(10,wt = diff) %>%
mutate(ch_bookname = reorder(ch_bookname,diff)) %>% #重新排序bookcode
ggplot(aes(ch_bookname, diff, fill = booktype)) +
geom_col(show.legend = FALSE) +
facet_wrap(~booktype, scales = "free_y") +
labs(y = "正負詞彙差異",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()分析: 1. 正負情緒詞數量, 負>正的前3卷書有耶利米書,以西結書,撒母耳記上 2. 正>負的書卷有詩篇,歷代志上, 利未記
耶利米書是舊約聖經中的第二十四卷經書。 當時的猶大國日趨沒落,外有強鄰如亞述、埃及和巴比倫等國,虎視眈眈隨時找機會進行侵略; 國內情形,上自君王、首領,下至百姓,都離棄神拜偶像,道德低落; 約西亞王年間,雖有一次復興,可惜為期甚短,到了約哈斯、約雅敬、約雅斤、西底家四王的年間,又行耶和華眼中看為惡的事。 這就是先知耶利米時所處的背景,在這樣的情形下,耶利米盡了他話語的職事。
#24耶利米書
# rowwise的處理
# https://stackoverflow.com/questions/43970590/mutate-two-new-columns-based-on-splitting-another
bible_tokens_yelly <- bible_tokens %>%
filter(.$bookcode==24) %>%
mutate(tempcol = as.vector(stringr::str_split(.$ch_chapter, ':'))) %>%
rowwise() %>%
mutate(chapter = as.numeric(unlist(tempcol)[1])) %>%
select(-tempcol)
sentiment_count_yelly_LIWC = bible_tokens_yelly %>%
select(chapter,word) %>%
inner_join(LIWC) %>%
group_by(chapter,sentiment) %>%
summarise(count=n()) %>%
mutate(source="LIWC") %>%
arrange(chapter) ## Joining, by = "word"
## `summarise()` has grouped output by 'chapter'. You can override using the `.groups` argument.
#正負情緒長條圖
sentiment_count_yelly_LIWC %>%
ggplot(aes(x = chapter, y = count, fill = sentiment))+
geom_bar(stat = "identity", position = "dodge") +
geom_vline(xintercept = as.numeric("51"), col='red', size = 1) +
labs(y = "count (LIWC) ")sentiment_count_yelly_LIWC_1 = sentiment_count_yelly_LIWC %>%
spread(sentiment,count,fill=0) %>%
mutate(sentiment_score=positive-negative) #正負情緒曲線圖
sentiment_count_yelly_LIWC %>%
ggplot()+
geom_line(aes(x=chapter,y=count,colour=sentiment)) +
# 加上51最negative 的線
geom_vline(xintercept = as.numeric("51"), col='red', size = 1) +
labs(y = "count (LIWC) ")sentiment_count_yelly_NRC = bible_tokens_yelly %>%
select(chapter,word) %>%
inner_join(NRC) %>%
group_by(chapter,sentiment) %>%
summarise(count=n()) %>%
mutate(source="NRC") %>%
arrange(chapter) ## Joining, by = "word"
## `summarise()` has grouped output by 'chapter'. You can override using the `.groups` argument.
sentiment_count_yelly_NRC %>%
ggplot()+
geom_line(aes(x=chapter,y=count,colour=sentiment)) +
# 加上51最negative 的線
geom_vline(xintercept = as.numeric("51"), col='red', size = 1) +
labs(y = "count (NRC) ")sentiment_count_yelly_NRC_1 = sentiment_count_yelly_NRC %>%
spread(sentiment,count,fill=0) %>%
mutate(sentiment_score=positive-negative)
bind_rows(sentiment_count_yelly_LIWC_1,
sentiment_count_yelly_NRC_1) %>%
ggplot(aes(chapter, sentiment_score, fill = source)) +
geom_col(show.legend = FALSE) +
facet_wrap(~source, ncol = 1, scales = "free_y")bible_level = c("律法書","紀事書","詩歌","先知書","福音書","使徒行傳","保羅書信","別的書信","啟示錄")
# 設定分類
bible = bible %>%
mutate(class = lapply(bible$bookcode, function(x){
if (x<6) return ("律法書")
else if (x<18) return("紀事書")
else if (x<23) return ("詩歌")
else if (x<40) return ("先知書")
else if (x<44) return ("福音書")
else if (x<45) return ("使徒行傳")
else if (x<58) return ("保羅書信")
else if (x<66) return ("別的書信")
else return("啟示錄")
}) %>% unlist(use.names=FALSE))
# 設定bible_class的levels
bible$class = factor(bible$class,levels=bible_level )
bible## ch_book ch_chapter
## 1: 創世紀 1:1
## 2: 創世紀 1:2
## 3: 創世紀 1:3
## 4: 創世紀 1:4
## 5: 創世紀 1:5
## ---
## 31168: 啟示錄 22:19
## 31169: 啟示錄 22:20
## 31170: 啟示錄 22:21
## 31171:
## 31172:
## text
## 1: 起初 神創造天地。
## 2: 地是空虛混沌.淵面黑暗. 神的靈運行在水面上。
## 3: 神說、要有光、就有了光。
## 4: 神看光是好的、就把光暗分開了。
## 5: 神稱光為晝、稱暗為夜.有晚上、有早晨、這是頭一日。
## ---
## 31168: 這書上的預言、若有人刪去甚麼、 神必從這書上所寫的生命樹、和聖城、刪去他的分。
## 31169: 證明這事的說、是了.我必快來。阿們。主耶穌阿、我願你來。
## 31170: 願主耶穌的恩惠、常與眾聖徒同在。阿們。
## 31171:
## 31172:
## bookcode class
## 1: 1 律法書
## 2: 1 律法書
## 3: 1 律法書
## 4: 1 律法書
## 5: 1 律法書
## ---
## 31168: 66 啟示錄
## 31169: 66 啟示錄
## 31170: 66 啟示錄
## 31171: 66 啟示錄
## 31172: 66 啟示錄
bible_token <- bible %>% unnest_tokens(word, text, token=bible_tokenizer) %>%
filter(nchar(.$word)>1 | .$word =="神" | .$word =="主" ) sentiment_count_bible = bible_token %>%
select(class,word) %>%
inner_join(LIWC) %>%
group_by(class,sentiment) %>%
summarise(count=n()) ## Joining, by = "word"
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
sentiment_count_bible## # A tibble: 18 x 3
## # Groups: class [9]
## class sentiment count
## <fct> <chr> <int>
## 1 律法書 negative 839
## 2 律法書 positive 813
## 3 紀事書 negative 1242
## 4 紀事書 positive 1121
## 5 詩歌 negative 1790
## 6 詩歌 positive 1894
## 7 先知書 negative 2476
## 8 先知書 positive 1486
## 9 福音書 negative 551
## 10 福音書 positive 596
## 11 使徒行傳 negative 149
## 12 使徒行傳 positive 170
## 13 保羅書信 negative 573
## 14 保羅書信 positive 1035
## 15 別的書信 negative 255
## 16 別的書信 positive 374
## 17 啟示錄 negative 133
## 18 啟示錄 positive 86
可以看出以福音書後為新約,正面情緒大於負面情緒 舊約為負面情緒大於正面情緒,其中以先知書差別最明顯
sentiment_count_bible %>%
ggplot(aes(x=class,y=count,colour=sentiment,group=sentiment))+
geom_point()+geom_line()+
theme(text = element_text(family = "Heiti TC Light"))sentiment_count_bible %>%
# 標準化
group_by(class) %>%
mutate(ratio = count/sum(count)) %>%
# 畫圖
ggplot()+
geom_line(aes(x=class,y=ratio,colour=sentiment,group=sentiment))+
theme(text = element_text(family = "Heiti TC Light"))word_count =
bible_token %>%
select(word,class) %>%
group_by(class,word)%>%
summarise(count=n())## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
sentiment_sum <-
word_count %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame() ## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()#sentiment_sum %>%
#acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
#comparison.cloud(
# colors = c("salmon", "#72bcd4"), # positive negative
# max.words = 50)prophet =
word_count %>%
filter(class=="先知書")%>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame() ## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
以先知書為例
先知書主要記載以色列人歷史中一些特殊人物,他們被神選召,替神傳達信息。 先知書雖然有些談及未來,但是大部分篇幅仍然只是談及當時的情形或責備以色列民的罪惡。
prophet %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()以保羅書信為例
是使徒保羅寫給各地教會就各教會內的情況或是對來信詢問教義的人的回信,書信中解釋了許多有關基督教教義的疑難問題及闡明教理原則。
paul <-
word_count %>%
filter(class=="保羅書信")%>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame() ## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
paul %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()本文的用的作者來源,主要參考這兩篇文章: 1. https://www.konline.org/biblestudy/JBS_A/DBC_08.doc.html 2. https://www.gotquestions.org/T-Chinese/T-Chinese-authors-Bible.html 另外搭配維基百科的說明
bible[!duplicated(bible$ch_book), c("ch_book", "bookcode")]## ch_book bookcode
## 1: 創世紀 1
## 2: 1
## 3: 出埃及記 2
## 4: 利未記 3
## 5: 民數記 4
## 6: 申命記 5
## 7: 約書亞記 6
## 8: 士師記 7
## 9: 路得記 8
## 10: 撒母耳記上 9
## 11: 撒母耳記下 10
## 12: 列王記上 11
## 13: 列王記下 12
## 14: 歷代志上 13
## 15: 歷代志下 14
## 16: 以斯拉記 15
## 17: 尼希米記 16
## 18: 以斯帖記 17
## 19: 約伯記 18
## 20: 詩篇 19
## 21: 箴言 20
## 22: 傳道書 21
## 23: 雅歌 22
## 24: 以賽亞書 23
## 25: 耶利米書 24
## 26: 耶利米哀歌 25
## 27: 以西結書 26
## 28: 但以理書 27
## 29: 何西阿書 28
## 30: 約珥書 29
## 31: 阿摩司書 30
## 32: 俄巴底亞書 31
## 33: 約拿書 32
## 34: 彌迦書 33
## 35: 那鴻書 34
## 36: 哈巴谷書 35
## 37: 西番雅書 36
## 38: 哈該書 37
## 39: 撒迦利亞書 38
## 40: 瑪拉基書 39
## 41: 馬太福音 40
## 42: 馬可福音 41
## 43: 路加福音 42
## 44: 約翰福音 43
## 45: 使徒行傳 44
## 46: 羅馬書 45
## 47: 哥林多前書 46
## 48: 哥林多後書 47
## 49: 加拉太書 48
## 50: 以弗所書 49
## 51: 腓立比書 50
## 52: 歌羅西書 51
## 53: 帖撒羅尼迦前書 52
## 54: 帖撒羅尼迦後書 53
## 55: 提摩太前書 54
## 56: 提摩太後書 55
## 57: 提多書 56
## 58: 腓利門書 57
## 59: 希伯來書 58
## 60: 雅各書 59
## 61: 彼得前書 60
## 62: 彼得後書 61
## 63: 約翰一書 62
## 64: 約翰二書 63
## 65: 約翰三書 64
## 66: 猶大書 65
## 67: 啟示錄 66
## ch_book bookcode
# bible[!duplicated(bible$ch_book), c("ch_book", "bookcode")] %>%
# datapasta::tribble_paste()
bible_author = tibble::tribble(
~ch_book, ~bookcode, ~author,
"創世紀", 1L,"摩西",
NA, 1L, "摩西",
"-", 2L, "摩西",
"出埃及記", 2L, "摩西",
"利未記", 3L, "摩西",
"民數記", 4L,"摩西",
"申命記", 5L,"摩西",
"約書亞記", 6L, "約書亞",
"士師記", 7L, "撒母耳",
"路得記", 8L,"撒母耳",
"撒母耳記上", 9L,"撒母耳",
"撒母耳記下", 10L,"拿單與迦得",
"列王記上", 11L,"耶利米",
"列王記下", 12L,"耶利米",
"歷代志上", 13L,"以斯拉",
"歷代志下", 14L,"以斯拉",
"以斯拉記", 15L,"以斯拉",
"尼希米記", 16L,"尼希米",
"以斯帖記", 17L,"末底改",
"約伯記", 18L,"以利戶",
"詩篇", 19L, "大衛為主",
"箴言", 20L,"所羅門",
"傳道書", 21L,"所羅門",
"雅歌", 22L,"所羅門",
"以賽亞書", 23L,"以賽亞",
"耶利米書", 24L,"耶利米",
"耶利米哀歌", 25L,"耶利米",
"以西結書", 26L,"以西結",
"但以理書", 27L,"但以理",
"何西阿書", 28L,"何西阿",
"約珥書", 29L,"約珥",
"阿摩司書", 30L,"阿摩司",
"俄巴底亞書", 31L,"俄巴底亞",
"約拿書", 32L,"約拿",
"彌迦書", 33L,"彌迦",
"那鴻書", 34L,"那鴻",
"哈巴谷書", 35L,"哈巴谷",
"西番雅書", 36L,"西番雅",
"哈該書", 37L,"哈該",
"撒迦利亞書", 38L,"撒迦利亞",
"瑪拉基書", 39L,"瑪拉基",
"馬太福音", 40L,"馬太",
"馬可福音", 41L,"馬可",
"路加福音", 42L,"路加",
"約翰福音", 43L,"約翰",
"使徒行傳", 44L,"路加",
"羅馬書", 45L,"保羅",
"哥林多前書", 46L,"保羅",
"哥林多後書", 47L,"保羅",
"加拉太書", 48L,"保羅",
"以弗所書", 49L,"保羅",
"腓立比書", 50L,"保羅",
"歌羅西書", 51L,"保羅",
"帖撒羅尼迦前書", 52L,"保羅",
"帖撒羅尼迦後書", 53L,"保羅",
"提摩太前書", 54L,"保羅",
"提摩太後書", 55L,"保羅",
"提多書", 56L,"保羅",
"腓利門書", 57L,"保羅",
"希伯來書", 58L,"未知",
"雅各書", 59L,"雅各",
"彼得前書", 60L,"彼得",
"彼得後書", 61L,"彼得",
"約翰一書", 62L,"約翰",
"約翰二書", 63L,"約翰",
"約翰三書", 64L,"約翰",
"猶大書", 65L,"猶大",
"啟示錄", 66L,"約翰"
)
head(bible_author)## # A tibble: 6 x 3
## ch_book bookcode author
## <chr> <int> <chr>
## 1 創世紀 1 摩西
## 2 <NA> 1 摩西
## 3 - 2 摩西
## 4 出埃及記 2 摩西
## 5 利未記 3 摩西
## 6 民數記 4 摩西
bible_token = left_join(bible_token, bible_author)## Joining, by = c("ch_book", "bookcode")
bible_token %>%
# clean data
inner_join(LIWC) %>%
group_by(author, sentiment) %>%
summarise(n = n(),
bookcode = min(bookcode)) %>%
ungroup() %>%
# plot the amount of sentiment
ggplot(aes(x = reorder(author, -bookcode), y = n, fill = sentiment)) +
geom_col(position = "dodge") +
labs(y = "count to sentiment",
x = "") +
theme(text=element_text(size=10))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()## Joining, by = "word"
## `summarise()` has grouped output by 'author'. You can override using the `.groups` argument.
比較每位作者的情緒ratio
bible_token %>%
inner_join(LIWC) %>%
group_by(author, sentiment) %>%
summarise(n = n(),
bookcode = min(bookcode)) %>%
mutate(ratio = n/sum(n)) %>%
ungroup() %>%
# select(author, bookcode, sentiment, ratio) %>%
# pivot_wider(names_from = sentiment, values_from = ratio)
ggplot(aes(x = reorder(author, -bookcode),
y = ratio,
fill = sentiment,
group = sentiment
)
) +
geom_col(position = "stack") +
geom_hline(yintercept = 0.5, color = "grey7") +
# facet_wrap(~sentiment, ncol = 1) +
labs(y = "count to sentiment",
x = "") +
theme(text=element_text(size=10))+
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()## Joining, by = "word"
## `summarise()` has grouped output by 'author'. You can override using the `.groups` argument.
bible_token %>%
inner_join(LIWC) %>%
group_by(author, sentiment) %>%
count(word) %>%
top_n(1) %>%
rmarkdown::paged_table()## Joining, by = "word"
## Selecting by n
使用作者的正負面詞彙總數相減
bible_token %>%
inner_join(LIWC) %>%
group_by(author, sentiment) %>%
summarise(n = n(), ch_book = max(ch_book)) %>%
pivot_wider(names_from = sentiment, values_from = n) %>%
ungroup() %>%
mutate(score = positive - negative) %>%
ggplot(aes(x = reorder(author, score), y = score)) +
geom_col(aes(fill = reorder(author, score))) +
geom_text(aes(label = ch_book), check_overlap = TRUE) +
scale_fill_discrete_diverging("Blue-Red2") +
theme(legend.position = "none") +
theme(text = element_text(family = "Heiti TC Light"))+
coord_flip()## Joining, by = "word"
## `summarise()` has grouped output by 'author'. You can override using the `.groups` argument.
1.聖經新約/舊約在正面和負面的討論內容各是甚麼?
新約正面情緒詞詞頻最為常出現的為榮耀、信心、願意、智慧、真理等詞彙 新約負面情緒詞詞頻最為常出現的為軟弱、憂愁、憐憫、污穢、罪人等詞彙
舊約正面情緒詞詞頻最為常出現的為智慧、潔淨、平安、榮耀、拯救等詞彙 新約負面情緒詞詞頻最為常出現的為攻擊、仇敵、惡人、懼怕、發怒等詞彙
2.正面和負面討論的情緒分數大約多少?
sentiment_count %>%
ggplot()+
geom_line(aes(x=bookcode,y=count,colour=sentiment))+
# 加上標示日期的線
geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count$bookcode == 40)
[1]])),colour = "red") 舊約大部份的書卷為負面情緒緒詞彙 > 正面詞彙,其中負面 > 正面情緒的書卷有以賽亞書(23),耶利米書(24),耶利米哀歌(25),以西結書(26); 例外的為舊約詩篇(19)是正向情緒>負面情緒
新約整體而言情緒詞彙數目較少,但呈現出正面情緒詞彙出現 > 負面詞彙
3.比較LIWC, NRC情緒字典所呈現出來的情緒分佈有何不同。
以NRC情緒字典所呈現出來的聖經情緒分數有較高的趨勢,特別是正面情緒的詞彙更是明顯高於LIWC情緒字典所跑出來的情緒分數。 然而,這可能和兩本情緒字典的資料數量有關,我們使用的 LIWC情緒字典裡有1711筆資料,NRC情緒字典裡有5462筆資料。 透過NRC情緒字典更能呈現出聖經中的情緒用詞,因而情緒分數會較高。
4.不同分類的聖經,情緒分佈有何不同?
將聖經分為9類: 舊約 -> 1-5:“律法書”, 6-17:“紀事書”, 18-22:“詩歌”, 23-29:“先知書” 新約 -> 40-43:“福音書”, 44:“使徒行傳”, 45-57:“保羅書信”, 58-54:“別的書信”, 66:“啟示錄”