https://rpubs.com/mhhsu/sma_hw5_sentiment

系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] ""

安裝需要的packages

# echo = T,results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

讀進library

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
require(jiebaR)

資料基本介紹

  • 資料來源: 繁體聖經和合本(新約,舊約)
  • 資料集: bibleMetaData.txt

針對聖經新約進行情緒分析。本次主要針對以下方向分析:

1.聖經新約/舊約在正面和負面的情緒內容分佈?
2.聖經各卷之正負情緒比例及正負曲線為何?
3.要閱讀聖經時哪些是屬於正面篇章,哪些是負面的? 4.不同分類的聖經,情緒分佈有何不同?
5.比較LIWC, NRC情緒字典所呈現出來的情緒分佈有何不同?

# 資料前處理
metaData = fread('../data/bibleMetaData.txt',encoding = 'UTF-8',fill=TRUE) #載入的資料未完全格式化,fill=TRUE
#針對舊約,新約聖經每一卷進行編碼 
#格式是=第1碼是0或1,0:舊約, 1:新約, 第2-3碼為流水號 
bible_data <- metaData %>% 
  mutate(bookcode = cumsum(str_detect(metaData$book,regex("^=[0-1][0-9]{2}")))) %>%   select (-book,-chapter)  %>%  
filter(!(metaData$ch_book %in% c("-")))  #去除特殊的格式 
str(bible_data)
## Classes 'data.table' and 'data.frame':   31172 obs. of  4 variables:
##  $ ch_book   : chr  "創世紀" "創世紀" "創世紀" "創世紀" ...
##  $ ch_chapter: chr  "1:1" "1:2" "1:3" "1:4" ...
##  $ text      : chr  "起初 神創造天地。" "地是空虛混沌.淵面黑暗. 神的靈運行在水面上。" " 神說、要有光、就有了光。" " 神看光是好的、就把光暗分開了。" ...
##  $ bookcode  : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
bible_book_chname <- metaData %>% 
  mutate(bookcode = cumsum(str_detect(metaData$book,regex("^=[0-1][0-9]{2}")))) %>%   
  filter((metaData$ch_book %in% c("-"))) %>%  
  select(bookcode,ch_chapter)  %>%  
  mutate(ch_bookname=ch_chapter)   %>%  
  select (-ch_chapter)
  
#舊約
bible_data_old <- bible_data %>% 
  filter(.$bookcode<40) 
#新約
bible_data_new <- bible_data %>% 
  filter(.$bookcode>=40) 

#將書卷作分類
bible_level = c("律法書","紀事書","詩歌","先知書","福音書","使徒行傳","保羅書信","別的書信","啟示錄")
# 設定分類
bible_data = bible_data %>% 
  mutate(class = lapply(bible_data$bookcode, function(x){
    if (x<6) return ("律法書")
    else if (x<18) return("紀事書")
    else if (x<23) return ("詩歌")
    else if (x<40) return ("先知書")
    else if (x<44) return ("福音書")
    else if (x<45) return ("使徒行傳")
    else if (x<58) return ("保羅書信")
    else if (x<66) return ("別的書信")
    else return("啟示錄")
  }) %>% unlist(use.names=FALSE))

# 設定bible_class的levels
bible_data$class = factor(bible_data$class,levels=bible_level )

1. 資料前處理

(1). 文章斷詞

設定斷詞引擎

# 加入自定義的字典
jieba_tokenizer <- worker(user="../dict/bible_lexicon.tradictional.txt", stop_word = "../dict/bible_stop_words.txt")

# 設定斷詞function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

bible_tokens <- bible_data %>% unnest_tokens(word, text, token=customized_tokenizer) 

(2). 資料基本清理

  • 日期格式化
  • 去除特殊字元、詞頻太低的字
# 過濾特殊字元
bible_tokens = bible_tokens %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9a-z']",word))  # 去英文、數字
  
# 進行combine token 
bible_tokens_all <- rbind(
  bible_tokens %>% 
  filter(nchar(.$word)>1) %>%  #2個字以上的token
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>3),
  bible_tokens  %>% 
  filter(.$word %in% c("主","神","信","望","愛") ) %>% #一個字的頻率也高 另外加入 
  group_by(word) %>% 
  summarise(sum = n()))  %>% 
  arrange(desc(sum))

bible_tokens_all %>% wordcloud2() 
#![](wordcloud2_bible.png)

# 舊約token 
bible_tokens_old <- rbind(
  bible_tokens %>% 
  filter(.$bookcode<40) %>% 
  filter(nchar(.$word)>1) %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>3),
  bible_tokens  %>% 
  filter(.$bookcode<40) %>% 
  filter(.$word %in% c("主","神","信","望","愛") ) %>% 
  group_by(word) %>% 
  summarise(sum = n()))  %>% 
  arrange(desc(sum))

bible_tokens_old %>% wordcloud2() 
#![](wordcloud3_bible.png)

# 新約token 
bible_tokens_new <- rbind(
  bible_tokens %>% 
  filter(.$bookcode>=40) %>% 
  filter(nchar(.$word)>1) %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>3),
  bible_tokens  %>% 
  filter(.$bookcode>=40) %>% 
  filter(.$word %in% c("主","神","信","望","愛") ) %>% 
  group_by(word) %>% 
  summarise(sum = n()))  %>% 
  arrange(desc(sum))

bible_tokens_new %>% wordcloud2() 
#![](wordcloud4_bible.png)

2. 準備情緒字典

2.1 LIWC字典

全名Linguistic Inquiry and Word Counts,由心理學家Pennebaker於2001出版 分為正向情緒與負向情緒

讀檔,字詞間以“,”將字分隔

P <- read_file("../dict/liwc/positive.txt") # 正向字典txt檔
N <- read_file("../dict/liwc/negative.txt") # 負向字典txt檔

#字典txt檔讀進來是一整個字串
typeof(P)
## [1] "character"

分割字詞,並將兩個情緒字典併在一起

# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# 把兩個字典拼在一起
LIWC = rbind(P, N)

# 檢視字典
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

2.2 NRC情緒字典

NRC (National Research Council) 情緒和情感詞典是由加拿大國家研究委員會(簡稱NRC)的專家建立 分為正向情緒(Positive)與負向情緒(Negative)

讀檔,字詞間以“,”將字分隔

library(openxlsx)
library(tidyr)
NRC <- read.xlsx("../dict/NRC.xlsx")

# 檢視字典
head(NRC)
##   word sentiment
## 1 鬱悶  negative
## 2 鸚鵡  negative
## 3   鑿  positive
## 4 鑽石  positive
## 5   驢  negative
## 6 讚許  positive

3. 將聖經和與LIWC情緒字典做join

哪些章節的情緒內容最高。 情緒內容最高的章節,其內容的特色為何? 19:詩篇

3.1 聖經新約/舊約在正面和負面的情緒內容分佈

sentiment_word_count = bible_tokens_all %>%
  inner_join(LIWC) %>%  
  arrange(desc(sum)) %>% 
  data.frame() #存成data frame 
## Joining, by = "word"
sentiment_word_old = bible_tokens_old %>%
  inner_join(LIWC) %>%  
  arrange(desc(sum)) %>% 
  data.frame() #存成data frame 
## Joining, by = "word"
sentiment_word_new = bible_tokens_new %>%
  inner_join(LIWC) %>%  
  arrange(desc(sum)) %>% 
  data.frame() #存成data frame 
## Joining, by = "word"
  1. 找出舊約、新約正負情緒長條圖
sentiment_word_count %>% 
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% #重新排序word,
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

#處理舊約 
sentiment_word_old %>% 
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% #重新排序word,
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

#處理新約 
sentiment_word_new %>% 
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% #重新排序word,
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

  1. 找出舊約、新約正負情緒比較文字雲
# sentiment_word_count %>%
#   acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
#   comparison.cloud(
#     colors = c("salmon", "#72bcd4"), #  negative positive 
#                    max.words = 50) 

#舊約文字雲
sentiment_word_old %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), #  negative positive 
                   max.words = 50) 

# 新約正負情緒比較文字雲
sentiment_word_new %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), #  negative positive 
                   max.words = 50) 

### 3.2 聖經各書卷之正負情緒比例及正負曲線

# 找出聖經中的情緒token及頻率, 算出每一卷情緒總和(sentiment_count) 
# 每一卷sentiment_count:bookcode,sentiment,count
# 第19篇的情緒count很高 
sentiment_count = bible_tokens %>%
  select(bookcode,word,ch_book) %>%
  inner_join(LIWC) %>% 
  group_by(bookcode,sentiment) %>%
  summarise(count=n()) %>%  
  arrange(bookcode)
## Joining, by = "word"
## `summarise()` has grouped output by 'bookcode'. You can override using the `.groups` argument.

3.2.1 各卷正負情緒分數折線圖

#畫出各卷的正負情緒 ,紅線後為新約 
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=bookcode,y=count,colour=sentiment)) +
  # 加上新約的界線 
  geom_vline(xintercept = as.numeric("40"), col='red', size = 1)  +
   geom_vline(xintercept = as.numeric("19"), col='purple', size = 0.3) + #詩篇
  geom_vline(xintercept = as.numeric("24"), col='purple', size = 0.3)

分析: 1 舊約大部份的書卷為負向情緒緒詞彙>正向詞彙,其中負向>正向情緒的書卷有23以賽亞書, 24耶利米書,25耶利米哀歌,26以西結書 ; 例外的為舊約詩篇 2 新約整體而言情緒詞彙數目較少,但呈現出正向情緒詞彙出現>負向詞彙

3.2.2 各卷正負情緒分數標準化折線圖 將情緒分數標準化

sentiment_count_ratio= sentiment_count %>% 
  # 標準化的部分
  group_by(bookcode) %>%
  mutate(ratio = count/sum(count))

sentiment_count %>% 
  # 標準化的部分
  group_by(bookcode) %>%
  mutate(ratio = count/sum(count)) %>%
  # 畫圖的部分
  ggplot()+
  geom_line(aes(x=bookcode,y=ratio,colour=sentiment))+
  # 加上標示日期的線
  geom_vline(xintercept = as.numeric("40"), col='red', size = 1)  + 
  geom_vline(xintercept = as.numeric("19"), col='purple', size = 0.3) + #詩篇
  geom_vline(xintercept = as.numeric("24"), col='purple', size = 0.3)   #耶利米書 

將情緒標準化再畫一次圖,我們能發現在新約聖經中,正面情緒是大於負面情緒的,舊約的部分則是正負面情緒各半。

3.3 聖經TOP正負向情緒書卷

閱讀聖經時,我們是否就可以預期選讀的書卷其情緒傾向。

#找出前3名(1)正向>負向情緒的卷 (2)負向>正向情緒的卷 
sentiment_book = sentiment_count %>% 
  group_by(bookcode) %>%
  spread(sentiment,count,fill=0) %>% 
  mutate(s_score=positive-negative)  %>% 
  mutate(booktype=case_when(positive>negative~"positive",TRUE~"negatvie"))  %>% 
  mutate(diff=abs(s_score))   %>%  
  select(bookcode,booktype,diff) %>%  
  filter(diff>50) %>%  #正負情緒詞頻差異超過50者再列入
  left_join(bible_book_chname) %>%  
  arrange(desc(diff)) 
## Joining, by = "bookcode"
sentiment_book %>% 
  top_n(10,wt = diff) %>%
  mutate(ch_bookname = reorder(ch_bookname,diff)) %>% #重新排序bookcode 
  ggplot(aes(ch_bookname, diff, fill = booktype)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~booktype, scales = "free_y") +
  labs(y = "正負詞彙差異",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

分析: 1.正負情緒詞數量, 負>正的前3卷書有耶利米書,以西結書,撒母耳記上 2.正>負的書卷有詩篇,歷代志上, 利未記

3.4依類型探索情緒分佈

3.4.1. 將聖經分為9類, 舊約: “律法書”, 1-5 “紀事書”, 6-17 “詩歌”, 18-22 “先知書”, 23-39 新約: “福音書”,40-43 “使徒行傳”,44 “保羅書信”,45-57 “別的書信”, 58-65 “啟示錄” 66

sentiment_count_bible_class = bible_tokens %>%
  select(class,word) %>%
  inner_join(LIWC) %>% 
  group_by(class,sentiment) %>%
  summarise(count=n())  
## Joining, by = "word"
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
sentiment_count_bible_class
## # A tibble: 18 x 3
## # Groups:   class [9]
##    class    sentiment count
##    <fct>    <chr>     <int>
##  1 律法書   negative    837
##  2 律法書   positive    812
##  3 紀事書   negative   1240
##  4 紀事書   positive   1120
##  5 詩歌     negative   1774
##  6 詩歌     positive   1894
##  7 先知書   negative   2451
##  8 先知書   positive   1483
##  9 福音書   negative    545
## 10 福音書   positive    596
## 11 使徒行傳 negative    149
## 12 使徒行傳 positive    169
## 13 保羅書信 negative    569
## 14 保羅書信 positive   1035
## 15 別的書信 negative    253
## 16 別的書信 positive    379
## 17 啟示錄   negative    133
## 18 啟示錄   positive     86
sentiment_count_bible_class %>%
  ggplot(aes(x=class,y=count,colour=sentiment,group=sentiment))+
  geom_point()+geom_line()

分析: 1.可以看出以福音書後為新約,正面情緒大於負面情緒 2.舊約為負面情緒大於正面情緒,其中以先知書差別最明顯

3.4.2 以先知書類為例-負向較強
先知書主要記載以色列人歷史中一些特殊人物,他們被神選召,替神傳達信息。 先知書雖然有些談及未來,但是大部分篇幅仍然只是談及當時的情形或責備以色列民的罪惡。

word_count_class = 
  bible_tokens %>%
  select(word,class) %>%
  group_by(class,word)%>%
  summarise(count=n())
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
prophet = 
    word_count_class %>%
    filter(class=="先知書")%>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
    summarise(
      sum = sum(count)
    ) %>% 
    arrange(desc(sum)) %>%
    data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
prophet %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

3.4.2 以保羅書信類為例-正向情緒強 各地教會就各教會內的情況或是對來信詢問教義的人的回信,書信中解釋了許多有關基督教教義的疑難問題及闡明教理原則

paul <- 
  word_count_class %>%
    filter(class=="保羅書信")%>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
    summarise(
      sum = sum(count)
    ) %>% 
    arrange(desc(sum)) %>%
    data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
paul %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

3.5 LIWC, NRC情緒字典所呈現出來的情緒分佈-耶利米書

耶利米書是舊約聖經中的第二十四卷經書 當時的猶大國日趨沒落,外有強鄰如亞述、埃及和巴比倫等國,虎視眈眈隨時找機會進行侵略;國內情形,上自君王、首領,下至百姓,都離棄神拜偶像,道德低落;約西亞王年間,雖有一次復興,可惜為期甚短,到了約哈斯、約雅敬、約雅斤、西底家四王的年間,又行耶和華眼中看為惡的事。這就是先知耶利米時所處的背景,在這樣的情形下,耶利米盡了他話語的職事。

#24耶利米書 
# rowwise的處理
# https://stackoverflow.com/questions/43970590/mutate-two-new-columns-based-on-splitting-another
bible_tokens_yelly <- bible_tokens %>% 
  filter(.$bookcode==24) %>% 
  mutate(tempcol = as.vector(stringr::str_split(.$ch_chapter, ':')))  %>% 
  rowwise() %>% 
  mutate(chapter = as.numeric(unlist(tempcol)[1])) %>% 
  select(-tempcol)

sentiment_count_yelly_LIWC = bible_tokens_yelly %>%
  select(chapter,word) %>%
  inner_join(LIWC) %>% 
  group_by(chapter,sentiment) %>%
  summarise(count=n()) %>%  
  mutate(source="LIWC") %>%  
  arrange(chapter) 
## Joining, by = "word"
## `summarise()` has grouped output by 'chapter'. You can override using the `.groups` argument.
#正負情緒長條圖 
sentiment_count_yelly_LIWC %>%
  ggplot(aes(x = chapter, y = count, fill = sentiment))+
  geom_bar(stat = "identity", position = "dodge")  + 
  geom_vline(xintercept = as.numeric("51"), col='red', size = 1) + 
  labs(y = "count (LIWC) ")

sentiment_count_yelly_LIWC_1 = sentiment_count_yelly_LIWC  %>%
  spread(sentiment,count,fill=0) %>% 
  mutate(sentiment_score=positive-negative)  
#正負情緒曲線圖 
sentiment_count_yelly_LIWC %>%
  ggplot()+
  geom_line(aes(x=chapter,y=count,colour=sentiment)) +
  # 加上51最negative 的線  
  geom_vline(xintercept = as.numeric("51"), col='red', size = 1) + 
  labs(y = "count (LIWC) ")

sentiment_count_yelly_NRC = bible_tokens_yelly %>%
  select(chapter,word) %>%
  inner_join(NRC) %>% 
  group_by(chapter,sentiment) %>%
  summarise(count=n()) %>%  
  mutate(source="NRC") %>%  
  arrange(chapter) 
## Joining, by = "word"
## `summarise()` has grouped output by 'chapter'. You can override using the `.groups` argument.
sentiment_count_yelly_NRC %>%
  ggplot()+
  geom_line(aes(x=chapter,y=count,colour=sentiment)) +
  # 加上51最negative 的線  
  geom_vline(xintercept = as.numeric("51"), col='red', size = 1) + 
  labs(y = "count (NRC) ")

sentiment_count_yelly_NRC_1 = sentiment_count_yelly_NRC  %>%
  spread(sentiment,count,fill=0) %>% 
  mutate(sentiment_score=positive-negative)  
  
bind_rows(sentiment_count_yelly_LIWC_1, 
          sentiment_count_yelly_NRC_1) %>%
  ggplot(aes(chapter, sentiment_score, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, ncol = 1, scales = "free_y")

分析: 基本上情緒的高低,LIWC,NRC每一章大致相同, LIWC 的負向又比NRC來得強烈 而NRC的正向比LIWC來得強烈