第五周：以情緒分析繁體聖經和合本

https://rpubs.com/mhhsu/sma_hw5_sentiment

系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

## [1] ""

安裝需要的packages

# echo = T,results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

讀進library

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
require(jiebaR)

資料基本介紹

資料來源: 繁體聖經和合本(新約,舊約)
資料集： bibleMetaData.txt

針對聖經新約進行情緒分析。本次主要針對以下方向分析：

1.聖經新約/舊約在正面和負面的情緒內容分佈？
2.聖經各卷之正負情緒比例及正負曲線為何？
3.要閱讀聖經時哪些是屬於正面篇章,哪些是負面的？ 4.不同分類的聖經，情緒分佈有何不同？
5.比較LIWC, NRC情緒字典所呈現出來的情緒分佈有何不同?

# 資料前處理
metaData = fread('../data/bibleMetaData.txt',encoding = 'UTF-8',fill=TRUE) #載入的資料未完全格式化,fill=TRUE
#針對舊約,新約聖經每一卷進行編碼 
#格式是=第1碼是0或1,0:舊約, 1:新約, 第2-3碼為流水號 
bible_data <- metaData %>% 
  mutate(bookcode = cumsum(str_detect(metaData$book,regex("^=[0-1][0-9]{2}")))) %>%   select (-book,-chapter)  %>%  
filter(!(metaData$ch_book %in% c("-")))  #去除特殊的格式 
str(bible_data)

## Classes 'data.table' and 'data.frame':   31172 obs. of  4 variables:
##  $ ch_book   : chr  "創世紀" "創世紀" "創世紀" "創世紀" ...
##  $ ch_chapter: chr  "1:1" "1:2" "1:3" "1:4" ...
##  $ text      : chr  "起初　神創造天地。" "地是空虛混沌．淵面黑暗．　神的靈運行在水面上。" "　神說、要有光、就有了光。" "　神看光是好的、就把光暗分開了。" ...
##  $ bookcode  : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

bible_book_chname <- metaData %>% 
  mutate(bookcode = cumsum(str_detect(metaData$book,regex("^=[0-1][0-9]{2}")))) %>%   
  filter((metaData$ch_book %in% c("-"))) %>%  
  select(bookcode,ch_chapter)  %>%  
  mutate(ch_bookname=ch_chapter)   %>%  
  select (-ch_chapter)
  
#舊約
bible_data_old <- bible_data %>% 
  filter(.$bookcode<40) 
#新約
bible_data_new <- bible_data %>% 
  filter(.$bookcode>=40) 

#將書卷作分類
bible_level = c("律法書","紀事書","詩歌","先知書","福音書","使徒行傳","保羅書信","別的書信","啟示錄")
# 設定分類
bible_data = bible_data %>% 
  mutate(class = lapply(bible_data$bookcode, function(x){
    if (x<6) return ("律法書")
    else if (x<18) return("紀事書")
    else if (x<23) return ("詩歌")
    else if (x<40) return ("先知書")
    else if (x<44) return ("福音書")
    else if (x<45) return ("使徒行傳")
    else if (x<58) return ("保羅書信")
    else if (x<66) return ("別的書信")
    else return("啟示錄")
  }) %>% unlist(use.names=FALSE))

# 設定bible_class的levels
bible_data$class = factor(bible_data$class,levels=bible_level )

1. 資料前處理

(1). 文章斷詞

設定斷詞引擎

# 加入自定義的字典
jieba_tokenizer <- worker(user="../dict/bible_lexicon.tradictional.txt", stop_word = "../dict/bible_stop_words.txt")

# 設定斷詞function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

bible_tokens <- bible_data %>% unnest_tokens(word, text, token=customized_tokenizer)

(2). 資料基本清理

日期格式化
去除特殊字元、詞頻太低的字

# 過濾特殊字元
bible_tokens = bible_tokens %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9a-z']",word))  # 去英文、數字
  
# 進行combine token 
bible_tokens_all <- rbind(
  bible_tokens %>% 
  filter(nchar(.$word)>1) %>%  #2個字以上的token
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>3),
  bible_tokens  %>% 
  filter(.$word %in% c("主","神","信","望","愛") ) %>% #一個字的頻率也高 另外加入 
  group_by(word) %>% 
  summarise(sum = n()))  %>% 
  arrange(desc(sum))

bible_tokens_all %>% wordcloud2()

#![](wordcloud2_bible.png)

# 舊約token 
bible_tokens_old <- rbind(
  bible_tokens %>% 
  filter(.$bookcode<40) %>% 
  filter(nchar(.$word)>1) %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>3),
  bible_tokens  %>% 
  filter(.$bookcode<40) %>% 
  filter(.$word %in% c("主","神","信","望","愛") ) %>% 
  group_by(word) %>% 
  summarise(sum = n()))  %>% 
  arrange(desc(sum))

bible_tokens_old %>% wordcloud2()

#![](wordcloud3_bible.png)

# 新約token 
bible_tokens_new <- rbind(
  bible_tokens %>% 
  filter(.$bookcode>=40) %>% 
  filter(nchar(.$word)>1) %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>3),
  bible_tokens  %>% 
  filter(.$bookcode>=40) %>% 
  filter(.$word %in% c("主","神","信","望","愛") ) %>% 
  group_by(word) %>% 
  summarise(sum = n()))  %>% 
  arrange(desc(sum))

bible_tokens_new %>% wordcloud2()

#![](wordcloud4_bible.png)

2. 準備情緒字典

2.1 LIWC字典

全名Linguistic Inquiry and Word Counts，由心理學家Pennebaker於2001出版分為正向情緒與負向情緒

讀檔，字詞間以“,”將字分隔

P <- read_file("../dict/liwc/positive.txt") # 正向字典txt檔
N <- read_file("../dict/liwc/negative.txt") # 負向字典txt檔

#字典txt檔讀進來是一整個字串
typeof(P)

## [1] "character"

分割字詞，並將兩個情緒字典併在一起

# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments，word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# 把兩個字典拼在一起
LIWC = rbind(P, N)

# 檢視字典
head(LIWC)

##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

2.2 NRC情緒字典

NRC (National Research Council) 情緒和情感詞典是由加拿大國家研究委員會（簡稱NRC）的專家建立分為正向情緒(Positive)與負向情緒(Negative)

讀檔，字詞間以“,”將字分隔

library(openxlsx)
library(tidyr)
NRC <- read.xlsx("../dict/NRC.xlsx")

# 檢視字典
head(NRC)

##   word sentiment
## 1 鬱悶  negative
## 2 鸚鵡  negative
## 3   鑿  positive
## 4 鑽石  positive
## 5   驢  negative
## 6 讚許  positive

3. 將聖經和與LIWC情緒字典做join

哪些章節的情緒內容最高。情緒內容最高的章節,其內容的特色為何? 19:詩篇

3.1 聖經新約/舊約在正面和負面的情緒內容分佈

sentiment_word_count = bible_tokens_all %>%
  inner_join(LIWC) %>%  
  arrange(desc(sum)) %>% 
  data.frame() #存成data frame

## Joining, by = "word"

sentiment_word_old = bible_tokens_old %>%
  inner_join(LIWC) %>%  
  arrange(desc(sum)) %>% 
  data.frame() #存成data frame

## Joining, by = "word"

sentiment_word_new = bible_tokens_new %>%
  inner_join(LIWC) %>%  
  arrange(desc(sum)) %>% 
  data.frame() #存成data frame

## Joining, by = "word"

找出舊約、新約正負情緒長條圖

sentiment_word_count %>% 
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% #重新排序word,
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

#處理舊約 
sentiment_word_old %>% 
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% #重新排序word,
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

#處理新約 
sentiment_word_new %>% 
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% #重新排序word,
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

找出舊約、新約正負情緒比較文字雲

# sentiment_word_count %>%
#   acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
#   comparison.cloud(
#     colors = c("salmon", "#72bcd4"), #  negative positive 
#                    max.words = 50) 

#舊約文字雲
sentiment_word_old %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), #  negative positive 
                   max.words = 50)

# 新約正負情緒比較文字雲
sentiment_word_new %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), #  negative positive 
                   max.words = 50)

### 3.2 聖經各書卷之正負情緒比例及正負曲線

# 找出聖經中的情緒token及頻率, 算出每一卷情緒總和(sentiment_count) 
# 每一卷sentiment_count:bookcode,sentiment,count
# 第19篇的情緒count很高 
sentiment_count = bible_tokens %>%
  select(bookcode,word,ch_book) %>%
  inner_join(LIWC) %>% 
  group_by(bookcode,sentiment) %>%
  summarise(count=n()) %>%  
  arrange(bookcode)

## Joining, by = "word"

## `summarise()` has grouped output by 'bookcode'. You can override using the `.groups` argument.

3.2.1 各卷正負情緒分數折線圖

#畫出各卷的正負情緒 ,紅線後為新約 
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=bookcode,y=count,colour=sentiment)) +
  # 加上新約的界線 
  geom_vline(xintercept = as.numeric("40"), col='red', size = 1)  +
   geom_vline(xintercept = as.numeric("19"), col='purple', size = 0.3) + #詩篇
  geom_vline(xintercept = as.numeric("24"), col='purple', size = 0.3)

分析: 1 舊約大部份的書卷為負向情緒緒詞彙>正向詞彙,其中負向>正向情緒的書卷有23以賽亞書, 24耶利米書,25耶利米哀歌,26以西結書 ; 例外的為舊約詩篇 2 新約整體而言情緒詞彙數目較少，但呈現出正向情緒詞彙出現>負向詞彙

3.2.2 各卷正負情緒分數標準化折線圖將情緒分數標準化

sentiment_count_ratio= sentiment_count %>% 
  # 標準化的部分
  group_by(bookcode) %>%
  mutate(ratio = count/sum(count))

sentiment_count %>% 
  # 標準化的部分
  group_by(bookcode) %>%
  mutate(ratio = count/sum(count)) %>%
  # 畫圖的部分
  ggplot()+
  geom_line(aes(x=bookcode,y=ratio,colour=sentiment))+
  # 加上標示日期的線
  geom_vline(xintercept = as.numeric("40"), col='red', size = 1)  + 
  geom_vline(xintercept = as.numeric("19"), col='purple', size = 0.3) + #詩篇
  geom_vline(xintercept = as.numeric("24"), col='purple', size = 0.3)   #耶利米書

將情緒標準化再畫一次圖，我們能發現在新約聖經中，正面情緒是大於負面情緒的，舊約的部分則是正負面情緒各半。

3.3 聖經TOP正負向情緒書卷

閱讀聖經時，我們是否就可以預期選讀的書卷其情緒傾向。

#找出前3名(1)正向>負向情緒的卷 (2)負向>正向情緒的卷 
sentiment_book = sentiment_count %>% 
  group_by(bookcode) %>%
  spread(sentiment,count,fill=0) %>% 
  mutate(s_score=positive-negative)  %>% 
  mutate(booktype=case_when(positive>negative~"positive",TRUE~"negatvie"))  %>% 
  mutate(diff=abs(s_score))   %>%  
  select(bookcode,booktype,diff) %>%  
  filter(diff>50) %>%  #正負情緒詞頻差異超過50者再列入
  left_join(bible_book_chname) %>%  
  arrange(desc(diff))

## Joining, by = "bookcode"

sentiment_book %>% 
  top_n(10,wt = diff) %>%
  mutate(ch_bookname = reorder(ch_bookname,diff)) %>% #重新排序bookcode 
  ggplot(aes(ch_bookname, diff, fill = booktype)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~booktype, scales = "free_y") +
  labs(y = "正負詞彙差異",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

分析: 1.正負情緒詞數量, 負>正的前3卷書有耶利米書,以西結書,撒母耳記上 2.正>負的書卷有詩篇,歷代志上, 利未記

3.4依類型探索情緒分佈

3.4.1. 將聖經分為9類, 舊約: “律法書”, 1-5 “紀事書”, 6-17 “詩歌”, 18-22 “先知書”, 23-39 新約: “福音書”,40-43 “使徒行傳”,44 “保羅書信”,45-57 “別的書信”, 58-65 “啟示錄” 66

sentiment_count_bible_class = bible_tokens %>%
  select(class,word) %>%
  inner_join(LIWC) %>% 
  group_by(class,sentiment) %>%
  summarise(count=n())

## Joining, by = "word"

## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.

sentiment_count_bible_class

## # A tibble: 18 x 3
## # Groups:   class [9]
##    class    sentiment count
##    <fct>    <chr>     <int>
##  1 律法書   negative    837
##  2 律法書   positive    812
##  3 紀事書   negative   1240
##  4 紀事書   positive   1120
##  5 詩歌     negative   1774
##  6 詩歌     positive   1894
##  7 先知書   negative   2451
##  8 先知書   positive   1483
##  9 福音書   negative    545
## 10 福音書   positive    596
## 11 使徒行傳 negative    149
## 12 使徒行傳 positive    169
## 13 保羅書信 negative    569
## 14 保羅書信 positive   1035
## 15 別的書信 negative    253
## 16 別的書信 positive    379
## 17 啟示錄   negative    133
## 18 啟示錄   positive     86

sentiment_count_bible_class %>%
  ggplot(aes(x=class,y=count,colour=sentiment,group=sentiment))+
  geom_point()+geom_line()

分析: 1.可以看出以福音書後為新約，正面情緒大於負面情緒 2.舊約為負面情緒大於正面情緒，其中以先知書差別最明顯

3.4.2 以先知書類為例-負向較強
先知書主要記載以色列人歷史中一些特殊人物，他們被神選召，替神傳達信息。先知書雖然有些談及未來，但是大部分篇幅仍然只是談及當時的情形或責備以色列民的罪惡。

word_count_class = 
  bible_tokens %>%
  select(word,class) %>%
  group_by(class,word)%>%
  summarise(count=n())

## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.

prophet = 
    word_count_class %>%
    filter(class=="先知書")%>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
    summarise(
      sum = sum(count)
    ) %>% 
    arrange(desc(sum)) %>%
    data.frame()

## Joining, by = "word"

## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

prophet %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

3.4.2 以保羅書信類為例-正向情緒強各地教會就各教會內的情況或是對來信詢問教義的人的回信，書信中解釋了許多有關基督教教義的疑難問題及闡明教理原則

paul <- 
  word_count_class %>%
    filter(class=="保羅書信")%>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
    summarise(
      sum = sum(count)
    ) %>% 
    arrange(desc(sum)) %>%
    data.frame()

## Joining, by = "word"

## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

paul %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

3.5 LIWC, NRC情緒字典所呈現出來的情緒分佈-耶利米書

耶利米書是舊約聖經中的第二十四卷經書當時的猶大國日趨沒落，外有強鄰如亞述、埃及和巴比倫等國，虎視眈眈隨時找機會進行侵略；國內情形，上自君王、首領，下至百姓，都離棄神拜偶像，道德低落；約西亞王年間，雖有一次復興，可惜為期甚短，到了約哈斯、約雅敬、約雅斤、西底家四王的年間，又行耶和華眼中看為惡的事。這就是先知耶利米時所處的背景，在這樣的情形下，耶利米盡了他話語的職事。

#24耶利米書 
# rowwise的處理
# https://stackoverflow.com/questions/43970590/mutate-two-new-columns-based-on-splitting-another
bible_tokens_yelly <- bible_tokens %>% 
  filter(.$bookcode==24) %>% 
  mutate(tempcol = as.vector(stringr::str_split(.$ch_chapter, ':')))  %>% 
  rowwise() %>% 
  mutate(chapter = as.numeric(unlist(tempcol)[1])) %>% 
  select(-tempcol)

sentiment_count_yelly_LIWC = bible_tokens_yelly %>%
  select(chapter,word) %>%
  inner_join(LIWC) %>% 
  group_by(chapter,sentiment) %>%
  summarise(count=n()) %>%  
  mutate(source="LIWC") %>%  
  arrange(chapter)

## Joining, by = "word"

## `summarise()` has grouped output by 'chapter'. You can override using the `.groups` argument.

#正負情緒長條圖 
sentiment_count_yelly_LIWC %>%
  ggplot(aes(x = chapter, y = count, fill = sentiment))+
  geom_bar(stat = "identity", position = "dodge")  + 
  geom_vline(xintercept = as.numeric("51"), col='red', size = 1) + 
  labs(y = "count (LIWC) ")

sentiment_count_yelly_LIWC_1 = sentiment_count_yelly_LIWC  %>%
  spread(sentiment,count,fill=0) %>% 
  mutate(sentiment_score=positive-negative)

#正負情緒曲線圖 
sentiment_count_yelly_LIWC %>%
  ggplot()+
  geom_line(aes(x=chapter,y=count,colour=sentiment)) +
  # 加上51最negative 的線  
  geom_vline(xintercept = as.numeric("51"), col='red', size = 1) + 
  labs(y = "count (LIWC) ")

sentiment_count_yelly_NRC = bible_tokens_yelly %>%
  select(chapter,word) %>%
  inner_join(NRC) %>% 
  group_by(chapter,sentiment) %>%
  summarise(count=n()) %>%  
  mutate(source="NRC") %>%  
  arrange(chapter)

## Joining, by = "word"

## `summarise()` has grouped output by 'chapter'. You can override using the `.groups` argument.

sentiment_count_yelly_NRC %>%
  ggplot()+
  geom_line(aes(x=chapter,y=count,colour=sentiment)) +
  # 加上51最negative 的線  
  geom_vline(xintercept = as.numeric("51"), col='red', size = 1) + 
  labs(y = "count (NRC) ")

sentiment_count_yelly_NRC_1 = sentiment_count_yelly_NRC  %>%
  spread(sentiment,count,fill=0) %>% 
  mutate(sentiment_score=positive-negative)  
  
bind_rows(sentiment_count_yelly_LIWC_1, 
          sentiment_count_yelly_NRC_1) %>%
  ggplot(aes(chapter, sentiment_score, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, ncol = 1, scales = "free_y")

分析: 基本上情緒的高低,LIWC,NRC每一章大致相同, LIWC 的負向又比NRC來得強烈而NRC的正向比LIWC來得強烈